# KPI Matrics 

In [None]:
import pandas as pd

# Load the .txt file with '|' as the delimiter
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')

# Adding calculated columns for metrics
data['ClaimFrequency'] = data['TotalClaims'] / data['TotalPremium']
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']

# 1. Risk differences across provinces
province_kpis = data.groupby('Province').agg(
    avg_claim_frequency=('ClaimFrequency', 'mean'),
    avg_loss_ratio=('LossRatio', 'mean'),
    total_claims=('TotalClaims', 'sum'),
    total_premium=('TotalPremium', 'sum')
).reset_index()

print("Risk KPIs by Province:")
print(province_kpis)

# 2. Risk differences between zip codes
zip_code_kpis = data.groupby('PostalCode').agg(
    avg_claim_frequency=('ClaimFrequency', 'mean'),
    avg_loss_ratio=('LossRatio', 'mean'),
    total_claims=('TotalClaims', 'sum'),
    total_premium=('TotalPremium', 'sum')
).reset_index()

print("\nRisk KPIs by Zip Code:")
print(zip_code_kpis)

# 3. Margin differences between zip codes
zip_code_margin_kpis = data.groupby('PostalCode').agg(
    total_premium=('TotalPremium', 'sum'),
    total_claims=('TotalClaims', 'sum')
).reset_index()
zip_code_margin_kpis['gross_margin_percentage'] = (
    (zip_code_margin_kpis['total_premium'] - zip_code_margin_kpis['total_claims'])
    / zip_code_margin_kpis['total_premium'] * 100
)

print("\nMargin KPIs by Zip Code:")
print(zip_code_margin_kpis)

# 4. Risk differences between Women and Men
gender_kpis = data.groupby('Gender').agg(
    avg_claim_frequency=('ClaimFrequency', 'mean'),
    avg_loss_ratio=('LossRatio', 'mean'),
    total_claims=('TotalClaims', 'sum'),
    total_premium=('TotalPremium', 'sum')
).reset_index()

print("\nRisk KPIs by Gender:")
print(gender_kpis)


# Data Segmentation

## for risk differences across provinces

In [23]:
import pandas as pd
import sys
sys.path.append('../scripts')

from KPI_segment import calculate_kpis



# Example usage in the notebook

# Load your dataset (since you already load it in the notebook)
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')

# Add calculated columns for risk metrics
data['ClaimFrequency'] = data['TotalClaims'] / data['TotalPremium']
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']

# Set the segmentation feature
segmentation_feature = 'Province'  # Replace with your segmentation feature

# Call the function to calculate KPIs
kpi_results = calculate_kpis(data, segmentation_feature)

# Print the results
print("\nRisk KPIs by Province:")
print(kpi_results['province_kpis'])

#save the dataset
output_file = '../assets/data/segmented_kpis_by_province.csv'
kpi_results['province_kpis'].to_csv(output_file, index=False)






  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')



Risk KPIs by Province:
  Group       Province  avg_claim_frequency  avg_loss_ratio  total_claims  \
0     B   Eastern Cape                  inf             inf  1.356427e+06   
1     B     Free State                  inf             inf  3.549223e+05   
2     B        Gauteng                  NaN             NaN  2.939415e+07   
3     B  KwaZulu-Natal                  inf             inf  1.430138e+07   
4     B        Limpopo                  inf             inf  1.016477e+06   
5     B     Mpumalanga                  NaN             NaN  2.044675e+06   
6     B     North West                  inf             inf  5.920250e+06   
7     B  Northern Cape             0.203831        0.203831  8.949051e+04   
8     B   Western Cape                  inf             inf  1.038977e+07   

   total_premium  
0   2.140104e+06  
1   5.213632e+05  
2   2.405377e+07  
3   1.320908e+07  
4   1.537324e+06  
5   2.836292e+06  
6   7.490508e+06  
7   3.165581e+05  
8   9.806559e+06  


## for risk differences between zip codes

In [24]:
import pandas as pd
import sys
import pandas as pd  # Ensure pandas is imported

# Add the path to the 'scripts' folder if it's not already in the Python path
sys.path.append('../scripts')

# Import the function from KPI_segment.py
from KPI_segment import calculate_kpis

# Example usage in the notebook
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')

# Add calculated columns for risk metrics
data['ClaimFrequency'] = data['TotalClaims'] / data['TotalPremium']
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']

# Insert the feature for segmentation manually in the notebook
segmentation_feature = 'PostalCode'  # Insert your specific feature here

# Call the function to calculate KPIs
kpi_results = calculate_kpis(data, segmentation_feature)

# Print the results
print("\nRisk KPIs by Zip Code:")
print(kpi_results['zip_code_kpis'])

#save the dataset
output_file = '../assets/data/segmented_kpis_by_zip_code.csv'
kpi_results['zip_code_kpis'].to_csv(output_file, index=False)



  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')



Risk KPIs by Zip Code:
    Group  PostalCode  avg_claim_frequency  avg_loss_ratio   total_claims  \
0       B           1                  inf             inf  307583.342105   
1       B           2             0.852758        0.852758   61885.298246   
2       B           4             0.000000        0.000000       0.000000   
3       B           5             0.698901        0.698901   82951.526316   
4       B           6             0.122927        0.122927    8628.596491   
..    ...         ...                  ...             ...            ...   
883     B        9781                  inf             inf   89698.245614   
884     B        9830             0.000000        0.000000       0.000000   
885     B        9868             0.000000        0.000000       0.000000   
886     B        9869             0.120710        0.120710    2236.842105   
887     B        9870             0.000000        0.000000       0.000000   

     total_premium  
0    273035.326595  
1     608

## for Margin differences between zip codes

In [18]:
import sys
import pandas as pd  # Ensure pandas is imported

# Add the path to the 'scripts' folder if it's not already in the Python path
sys.path.append('../scripts')

# Import the function from KPI_segment.py
from KPI_segment import calculate_kpis

# Example usage in the notebook
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')

# Add calculated columns for risk metrics
data['ClaimFrequency'] = data['TotalClaims'] / data['TotalPremium']
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']

# Insert the feature for segmentation manually in the notebook
segmentation_feature = 'PostalCode'  # Insert your specific feature here

# Call the function to calculate KPIs
kpi_results = calculate_kpis(data, segmentation_feature)

# Print the results
print("\nMargin KPIs by Zip Code:")
print(kpi_results['zip_code_margin_kpis'])
#save the dataset
output_file = '../assets/data/segmented_kpis_by_margin_zip_code.csv'
kpi_results['zip_code_margin_kpis'].to_csv(output_file, index=False)



  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')



Margin KPIs by Zip Code:
    Group  PostalCode  total_premium   total_claims  gross_margin_percentage
0       B           1  273035.326595  307583.342105               -12.653313
1       B           2   60861.729133   61885.298246                -1.681794
2       B           4    8773.975714       0.000000               100.000000
3       B           5   24661.450526   82951.526316              -236.361100
4       B           6   22260.230088    8628.596491                61.237613
..    ...         ...            ...            ...                      ...
883     B        9781   35077.787598   89698.245614              -155.712380
884     B        9830    7378.610100       0.000000               100.000000
885     B        9868   11604.237719       0.000000               100.000000
886     B        9869   63355.830081    2236.842105                96.469398
887     B        9870   17703.244175       0.000000               100.000000

[888 rows x 5 columns]


## for risk differences between Women and Men

In [25]:
import sys
import pandas as pd  # Ensure pandas is imported

# Add the path to the 'scripts' folder if it's not already in the Python path
sys.path.append('../scripts')

# Import the function from KPI_segment.py
from KPI_segment import calculate_kpis

# Example usage in the notebook
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')

# Add calculated columns for risk metrics
data['ClaimFrequency'] = data['TotalClaims'] / data['TotalPremium']
data['LossRatio'] = data['TotalClaims'] / data['TotalPremium']

# Insert the feature for segmentation manually in the notebook
segmentation_feature = 'Gender'  # Insert your specific feature here

# Call the function to calculate KPIs
kpi_results = calculate_kpis(data, segmentation_feature)

# Print the results
print("\nRisk KPIs by Gender:")
print(kpi_results['gender_kpis'])

#save the dataset
output_file = '../assets/data/segmented_kpis_by_gender.csv'
kpi_results['gender_kpis'].to_csv(output_file, index=False)



  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter='|')



Risk KPIs by Gender:
  Group         Gender  avg_claim_frequency  avg_loss_ratio  total_claims  \
0     B         Female                  inf             inf  2.502461e+05   
1     B           Male                  inf             inf  1.396704e+06   
2     B  Not specified                  NaN             NaN  6.271410e+07   

   total_premium  
0   3.044806e+05  
1   1.580143e+06  
2   5.920275e+07  


# Statistical Analysis

In [26]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency

def t_test_kpis(control_group, test_group, column):
    """
    Conduct a t-test between the Control and Test groups for a given KPI.
    
    Parameters:
    - control_group: DataFrame for the control group.
    - test_group: DataFrame for the test group.
    - column: The column (KPI) to test.
    
    Returns:
    - p-value from the t-test.
    """
    control_values = control_group[column].dropna()  # Remove NaN values
    test_values = test_group[column].dropna()  # Remove NaN values
    
    t_stat, p_val = stats.ttest_ind(control_values, test_values)
    return p_val

def chi_squared_test(control_group, test_group, column):
    """
    Perform a Chi-squared test for independence between two categorical variables.
    
    Parameters:
    - control_group: DataFrame for the control group.
    - test_group: DataFrame for the test group.
    - column: The categorical column to test (e.g., 'Gender').
    
    Returns:
    - p-value from the Chi-squared test.
    """
    # Create contingency table
    contingency_table = pd.crosstab(control_group[column], test_group[column])
    
    # Perform Chi-squared test
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
    return p_val

def interpret_p_value(p_value):
    
    if p_value < 0.05:
        return "Reject null hypothesis: Significant impact"
    else:
        return "Fail to reject null hypothesis: No significant impact"

# Main function to run the statistical testing
def main(province_kpis, zip_code_kpis, zip_code_margin_kpis, gender_kpis):
    # Segment data into control and test groups
    control_group = province_kpis[province_kpis['Group'] == 'A']
    test_group = province_kpis[province_kpis['Group'] == 'B']

    # Perform statistical tests for KPIs (numerical and categorical)
    
    # For numerical columns, perform t-test (e.g., avg_claim_frequency, avg_loss_ratio)
    numerical_columns = ['avg_claim_frequency', 'avg_loss_ratio', 'total_claims', 'total_premium']  # Add other numerical KPIs
    for col in numerical_columns:
        p_value = t_test_kpis(control_group, test_group, col)
        print(f"p-value for {col}: {p_value}")
        print(f"Interpretation: {interpret_p_value(p_value)}\n")

    # For categorical columns, perform Chi-Squared test (e.g., Gender, PostalCode)
    categorical_columns = ['Gender', 'PostalCode']  # Add other categorical KPIs
    for col in categorical_columns:
        p_value = chi_squared_test(control_group, test_group, col)
        print(f"p-value for {col}: {p_value}")
        print(f"Interpretation: {interpret_p_value(p_value)}\n")
    
    # Optionally, print the KPIs for review
    print("\nRisk KPIs by Province:")
    print(province_kpis)

    print("\nRisk KPIs by Zip Code:")
    print(zip_code_kpis)

    print("\nMargin KPIs by Zip Code:")
    print(zip_code_margin_kpis)

    print("\nRisk KPIs by Gender:")
    print(gender_kpis)

# Example usage
if __name__ == '__main__':
    # Assuming the following DataFrames already contain your KPI results
    # Replace these with the actual KPI DataFrames you have computed
    province_kpis = pd.read_csv('../assets/data/segmented_kpis_by_province.csv')
    zip_code_kpis = pd.read_csv('../assets/data/segmented_kpis_by_zip_code.csv')
    zip_code_margin_kpis = pd.read_csv('../assets/data/segmented_kpis_by_margin_zip_code.csv')
    gender_kpis = pd.read_csv('../assets/data/segmented_kpis_by_gender.csv')

    # Call the main function to perform statistical tests
    main(province_kpis, zip_code_kpis, zip_code_margin_kpis, gender_kpis)


p-value for avg_claim_frequency: nan
Interpretation: Fail to reject null hypothesis: No significant impact

p-value for avg_loss_ratio: nan
Interpretation: Fail to reject null hypothesis: No significant impact

p-value for total_claims: nan
Interpretation: Fail to reject null hypothesis: No significant impact

p-value for total_premium: nan
Interpretation: Fail to reject null hypothesis: No significant impact



  t_stat, p_val = stats.ttest_ind(control_values, test_values)


KeyError: 'Gender'