In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest


In [4]:
# Load the dataset 
df = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', low_memory=False)
df.head()


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [5]:
# Create new columns for analysis
df['AtLeastOneClaim'] = (df['TotalClaims'] > 0).astype(int)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']


In [6]:
# Province: Claim Frequency (Chi-Square Test)
prov_tab = pd.crosstab(df['Province'], df['AtLeastOneClaim'])
chi2_prov, p_prov, _, _ = stats.chi2_contingency(prov_tab)

# Province: Claim Severity (ANOVA)
prov_sev = [g['TotalClaims'] for _, g in df[df['AtLeastOneClaim'] == 1].groupby('Province') if len(g) > 1]
anova_stat, p_sev_prov = (stats.f_oneway(*prov_sev) if len(prov_sev) > 1 else (np.nan, np.nan))


In [7]:
# Select top 2 Zip Codes by volume
zipA, zipB = df['PostalCode'].value_counts().index[:2]
sub = df[df['PostalCode'].isin([zipA, zipB])]

# Zip: Claim Frequency (Z-Test)
zip_tab = pd.crosstab(sub['PostalCode'], sub['AtLeastOneClaim'])
count = zip_tab[1].values
nobs = zip_tab.sum(axis=1).values
z_zip_freq, p_zip_freq = proportions_ztest(count, nobs)

# Zip: Claim Severity (T-Test)
sevA = sub[(sub['PostalCode'] == zipA) & (sub['AtLeastOneClaim'] == 1)]['TotalClaims']
sevB = sub[(sub['PostalCode'] == zipB) & (sub['AtLeastOneClaim'] == 1)]['TotalClaims']
t_zip_sev, p_zip_sev = (stats.ttest_ind(sevA, sevB, equal_var=False) if len(sevA) > 1 and len(sevB) > 1 else (np.nan, np.nan))

# Zip: Margin (T-Test)
mA = sub[sub['PostalCode'] == zipA]['Margin']
mB = sub[sub['PostalCode'] == zipB]['Margin']
t_zip_mar, p_zip_mar = stats.ttest_ind(mA, mB, equal_var=False)


In [8]:
# Gender: Claim Frequency (Chi-Square Test)
gen_tab = pd.crosstab(df['Gender'], df['AtLeastOneClaim'])
chi2_gen, p_gen, _, _ = stats.chi2_contingency(gen_tab)

# Gender: Claim Severity (T-Test)
genders = df['Gender'].dropna().unique()
if len(genders) >= 2:
    g1, g2 = genders[:2]
    g1sev = df[(df['Gender'] == g1) & (df['AtLeastOneClaim'] == 1)]['TotalClaims']
    g2sev = df[(df['Gender'] == g2) & (df['AtLeastOneClaim'] == 1)]['TotalClaims']
    t_gen_sev, p_gen_sev = (stats.ttest_ind(g1sev, g2sev, equal_var=False) if len(g1sev) > 1 and len(g2sev) > 1 else (np.nan, np.nan))
else:
    t_gen_sev, p_gen_sev = np.nan, np.nan


In [9]:
# Set significance level
alpha = 0.05

# Hypothesis results
records = [
    ('Province Frequency', chi2_prov, p_prov),
    ('Province Severity', anova_stat, p_sev_prov),
    ('Zip Frequency', z_zip_freq, p_zip_freq),
    ('Zip Severity', t_zip_sev, p_zip_sev),
    ('Zip Margin', t_zip_mar, p_zip_mar),
    ('Gender Frequency', chi2_gen, p_gen),
    ('Gender Severity', t_gen_sev, p_gen_sev),
]

# Interpret results
rows = []
for hypo, stat, p in records:
    rej = p < alpha
    interp = ""
    if rej:
        if hypo == 'Province Frequency':
            freq = df.groupby('Province')['AtLeastOneClaim'].mean()
            top = freq.idxmax()
            diff = (freq.max() - freq.min()) * 100
            interp = (
                f"❗ Rejected (p={p:.3f}). {top} has ~{diff:.1f}% higher claim rate. "
                "Consider region-based premium loading."
            )
        elif hypo == 'Province Severity':
            interp = (
                f"❗ Rejected (p={p:.3f}). Claim severity varies by province. Adjust pricing."
            )
        elif hypo == 'Zip Frequency':
            interp = (
                f"❗ Rejected (p={p:.3f}). Frequencies differ in zips {zipA} vs {zipB}. Adjust loading."
            )
        elif hypo == 'Zip Severity':
            interp = (
                f"❗ Rejected (p={p:.3f}). Claim amounts differ by zip. Include in pricing."
            )
        elif hypo == 'Zip Margin':
            interp = (
                f"❗ Rejected (p={p:.3f}). Margins differ. Adjust underwriting or pricing."
            )
        elif hypo == 'Gender Frequency':
            interp = (
                f"❗ Rejected (p={p:.3f}). Frequency differs by gender. Consider gender-loading."
            )
        elif hypo == 'Gender Severity':
            interp = (
                f"❗ Rejected (p={p:.3f}). Severity differs. Consider adjustment."
            )
    else:
        interp = f"No significant difference (p={p:.3f}); no action needed."

    rows.append({
        'Hypothesis': hypo,
        'Statistic': stat,
        'p_value': p,
        'Reject Null': rej,
        'Business Interpretation': interp
    })

# Final Report
report = pd.DataFrame(rows)
report


Unnamed: 0,Hypothesis,Statistic,p_value,Reject Null,Business Interpretation
0,Province Frequency,104.190881,5.925510999999999e-19,True,❗ Rejected (p=0.000). Gauteng has ~0.2% higher...
1,Province Severity,4.830166,6.304917e-06,True,❗ Rejected (p=0.000). Claim severity varies by...
2,Zip Frequency,1.939401,0.05245248,False,No significant difference (p=0.052); no action...
3,Zip Severity,0.385376,0.700208,False,No significant difference (p=0.700); no action...
4,Zip Margin,1.163915,0.2444624,False,No significant difference (p=0.244); no action...
5,Gender Frequency,7.255926,0.02657025,True,❗ Rejected (p=0.027). Frequency differs by gen...
6,Gender Severity,3.102179,0.002449821,True,❗ Rejected (p=0.002). Severity differs. Consid...


In [13]:
# Styling the report
styled_report = report.style\
    .background_gradient(subset=["p_value"], cmap="RdYlGn_r")\
    .format({"p_value": "{:.3f}", "Statistic": "{:.2f}"})\
    .applymap(lambda v: "color: red" if v is True else "color: green", subset=["Reject Null"])\
    .set_caption("📊 Hypothesis Testing Summary Report")

styled_report



  styled_report = report.style\


Unnamed: 0,Hypothesis,Statistic,p_value,Reject Null,Business Interpretation
0,Province Frequency,104.19,0.0,True,❗ Rejected (p=0.000). Gauteng has ~0.2% higher claim rate. Consider region-based premium loading.
1,Province Severity,4.83,0.0,True,❗ Rejected (p=0.000). Claim severity varies by province. Adjust pricing.
2,Zip Frequency,1.94,0.052,False,No significant difference (p=0.052); no action needed.
3,Zip Severity,0.39,0.7,False,No significant difference (p=0.700); no action needed.
4,Zip Margin,1.16,0.244,False,No significant difference (p=0.244); no action needed.
5,Gender Frequency,7.26,0.027,True,❗ Rejected (p=0.027). Frequency differs by gender. Consider gender-loading.
6,Gender Severity,3.1,0.002,True,❗ Rejected (p=0.002). Severity differs. Consider adjustment.
