In [1]:
import sys
import os
import numpy as np
import pandas as pd
# Make the sibling ``scripts`` directory importable so that helper modules
# stored there can be imported by name from this notebook.
# The membership check keeps repeated runs of this cell from appending the
# same entry to ``sys.path`` over and over.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [2]:
# Import only the helpers this notebook calls rather than using a wildcard
# import: it documents where each name comes from and prevents other names in
# ab_testing from silently shadowing notebook variables.
from ab_testing import (
    anova_test,
    calculate_profit_margin,
    calculate_risk_ratio,
    t_test,
)

In [3]:
# Read the pipe-separated source file into a DataFrame.
# low_memory=False asks pandas not to process the file in internal chunks,
# which avoids mixed-type inference within a column.
DATA_FILE = '../data/MachineLearningRating_v3.txt'
data = pd.read_csv(DATA_FILE, sep='|', low_memory=False)

# Preview the top rows as this cell's output
data.head()


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# Both preprocessing steps act on the same pair of monetary columns.
monetary_columns = ['TotalPremium', 'TotalClaims']

# Step 1: fill missing entries using the 'mean' imputation strategy.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(data[monetary_columns])
data[monetary_columns] = imputed_values

# Step 2: robust-scale the imputed columns, overwriting them in place.
# NOTE(review): later cells derive RiskRatio and ProfitMargin after this
# overwrite; if the helpers read these two columns, those metrics are computed
# from scaled rather than raw monetary values -- confirm this is intended.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(data[monetary_columns])
data[monetary_columns] = scaled_values



In [5]:
# Sanity-check the two preprocessed columns: missing-value counts first,
# then summary statistics, visiting the columns in the same order each time.
checked_columns = ('TotalPremium', 'TotalClaims')

for column in checked_columns:
    print(data[column].isnull().sum())  # Number of missing values

for column in checked_columns:
    print(data[column].describe())

# The helper is expected to provide the RiskRatio column read just below.
calculate_risk_ratio(data)
# Summary statistics of RiskRatio
print(data['RiskRatio'].describe())

0
0
count    1.000098e+06
mean     2.723559e+00
std      1.050097e+01
min     -3.578483e+01
25%     -9.933200e-02
50%      0.000000e+00
75%      9.006680e-01
max      2.976787e+03
Name: TotalPremium, dtype: float64
count    1.000098e+06
mean     6.486119e+01
std      2.384075e+03
min     -1.200241e+04
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.930921e+05
Name: TotalClaims, dtype: float64
count    9.998700e+05
mean    -2.306152e+01
std      3.984520e+03
min     -1.536156e+06
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.838018e+05
Name: RiskRatio, dtype: float64


## Accept or reject the following Null Hypotheses: 

## A/B hypothesis 1

- Null Hypothesis: There are no risk differences across provinces.

- Alternative Hypothesis: There are significant risk differences across provinces.

- Metric: Risk can be calculated as the ratio of total claims (TotalClaims) to total premium (TotalPremium) or through claims frequency. 

- The goal is to compare risk differences across multiple provinces.

In [7]:
# Step 1: Run the risk-ratio helper so the RiskRatio column is available
calculate_risk_ratio(data)

# Step 2: ANOVA on RiskRatio with Province as the grouping column
f_stat, p_value = anova_test(data, 'RiskRatio', 'Province')

# Step 3: Report the statistics, then the decision at the chosen level
alpha = 0.05  # significance level
print(f'ANOVA F-statistic: {f_stat}')
print(f'p-value: {p_value}')

verdict = (
    "Reject the null hypothesis: There are significant differences in risk across provinces."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There are no significant differences in risk across provinces."
)
print(verdict)

ANOVA F-statistic: 2.087543710223074
p-value: 0.03338606757495004
Reject the null hypothesis: There are significant differences in risk across provinces.


An ANOVA test was performed to analyze whether there are significant differences in the RiskRatio across different provinces. The results of the test indicate the following:

- F-statistic: 2.0875
- p-value: 0.0334

Since the p-value is below the significance threshold of 0.05, we reject the null hypothesis and conclude that there are statistically significant differences in risk across provinces. This finding implies that risk is not equally distributed geographically.

## A/B hypothesis 2

- Null Hypothesis: There are no risk differences between zip codes.

- Alternative Hypothesis: There are significant risk differences between zip codes (represented by PostalCode).

- Metric: Similar to the first hypothesis, risk can be evaluated using TotalClaims to TotalPremium. 

- The goal is to assess whether significant risk differences exist between zip codes.

In [8]:
# Step 1: Run the risk-ratio helper so the RiskRatio column is available
calculate_risk_ratio(data)

# Step 2: ANOVA on RiskRatio with PostalCode as the grouping column
f_stat, p_value = anova_test(data, 'RiskRatio', 'PostalCode')

# Step 3: Report the statistics, then the decision at the chosen level
alpha = 0.05  # significance level
print(f'ANOVA F-statistic: {f_stat}')
print(f'p-value: {p_value}')

verdict = (
    "Reject the null hypothesis: There are significant differences in risk between zip codes."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There are no significant differences in risk between zip codes."
)
print(verdict)

ANOVA F-statistic: 0.6249159710069903
p-value: 0.9999999999999999
Fail to reject the null hypothesis: There are no significant differences in risk between zip codes.


An ANOVA test was conducted to assess whether there are significant differences in the RiskRatio across different zip codes. The findings are as follows:

- F-statistic: 0.6249
- p-value: ≈ 1.0

With a p-value of approximately 1.0, which is much higher than the significance threshold of 0.05, we fail to reject the null hypothesis. This indicates that the test found no statistically significant differences in risk across zip codes. Consequently, the risk appears to be evenly distributed across zip codes in the data, although failing to reject the null hypothesis is not by itself proof that no differences exist.

## A/B hypothesis 3

- Null Hypothesis: There are no significant margin (profit) differences between zip codes.

- Alternative Hypothesis: There are significant profit margin differences between zip codes.

- Metric: Profit margin can be calculated as the difference between total premium (TotalPremium) and total claims (TotalClaims).

- The goal is to compare the mean profit margins across different zip codes.

In [9]:
# Step 1: Rebind `data` to the helper's result (original note: calculates the
# profit margin and removes NaN values)
data = calculate_profit_margin(data)

# Step 2: Summary statistics of the ProfitMargin column
print(data['ProfitMargin'].describe())

# Step 3: ANOVA on ProfitMargin with PostalCode as the grouping column.
# A ValueError raised anywhere in the test or its reporting is printed
# instead of stopping the notebook.
try:
    f_stat, p_value = anova_test(data, 'ProfitMargin', 'PostalCode')
    alpha = 0.05  # significance level
    print(f'ANOVA F-statistic: {f_stat}')
    print(f'p-value: {p_value}')
    verdict = (
        "Reject the null hypothesis: There are significant profit margin differences between zip codes."
        if p_value < alpha
        else "Fail to reject the null hypothesis: There are no significant profit margin differences between zip codes."
    )
    print(verdict)
except ValueError as err:
    print(f'Error: {err}')

count    1.000098e+06
mean    -6.213763e+01
std      2.382821e+03
min     -3.930811e+05
25%     -9.933200e-02
50%     -9.560000e-04
75%      9.006680e-01
max      1.203158e+04
Name: ProfitMargin, dtype: float64
ANOVA F-statistic: 0.9347938292669827
p-value: 0.9173934317420169
Fail to reject the null hypothesis: There are no significant profit margin differences between zip codes.


An ANOVA test was performed to determine if there are significant differences in the ProfitMargin across different zip codes. The analysis yielded the following results:

- F-statistic: 0.9348
- p-value: 0.9174

The p-value of 0.9174 is well above the significance level of 0.05, leading us to fail to reject the null hypothesis. This result indicates that the test found no statistically significant differences in profit margins between zip codes. Mean profit margins therefore appear similar across the zip codes in the dataset, although failing to reject the null hypothesis is not by itself proof that no differences exist.

## A/B hypothesis 4

- Null Hypothesis: There are no significant risk differences between Women and Men.

- Alternative Hypothesis: There are significant risk differences between Women and Men.

- Metric: Risk can again be evaluated as the ratio of TotalClaims to TotalPremium.

- The goal is to compare risk differences between men and women.

In [6]:
# T-test on RiskRatio between the 'Male' and 'Female' values of Gender.
# A ValueError raised anywhere in the test or its reporting is printed
# instead of stopping the notebook.
try:
    t_stat, p_value = t_test(data, 'Gender', 'Male', 'Female', 'RiskRatio')
    alpha = 0.05  # significance level
    print(f'T-test statistic: {t_stat}')
    print(f'p-value: {p_value}')
    verdict = (
        "Reject the null hypothesis: There are significant risk differences between Women and Men."
        if p_value < alpha
        else "Fail to reject the null hypothesis: There are no significant risk differences between Women and Men."
    )
    print(verdict)
except ValueError as err:
    print(f'Error: {err}')

T-test statistic: -0.7668261804240528
p-value: 0.4431885220785591
Fail to reject the null hypothesis: There are no significant risk differences between Women and Men.


A T-test was conducted to assess whether there are significant differences in the RiskRatio between women and men. The results are as follows:

- T-test Statistic: -0.7668
- p-value: 0.4432

Given the p-value of 0.4432, which is greater than the significance threshold of 0.05, we fail to reject the null hypothesis. This indicates that the test found no statistically significant differences in risk between women and men. The risk levels appear similar across these gender groups in the dataset, although failing to reject the null hypothesis is not by itself proof that no differences exist.