# Statistical Hypothesis Testing

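Test whether province, zip code, and gender are risk drivers by comparing claim frequency, claim severity, and margin between groups. Each test is guarded against runtime warnings and insufficient sample sizes.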


In [9]:
import pandas as pd
import numpy as np
from scipy import stats
from src.data_preprocessor import load_and_preprocess

# Load preprocessed data
df, _ = load_and_preprocess('../data/MachineLearningRating_v3.csv')

# Derive every analysis column BEFORE taking any subset, so that each subset
# created from `df` (claims, provinces, zip codes, gender) carries the same columns.
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)      # 1 if the policy has a positive claim amount
df['Margin'] = df['TotalPremium'] - df['TotalClaims']     # premium minus claims, per row

# Rows with a positive claim amount. The explicit copy makes `df_claims` an
# independent frame, so later cells can add columns to it without raising
# SettingWithCopyWarning.
df_claims = df[df['TotalClaims'] > 0].copy()

# Data Segmentation
# Provinces: Western Cape vs. Gauteng
wcape = df[df['Province_Western Cape'] == 1]
gauteng = df[df['Province_Gauteng'] == 1]

# Zip Codes: among postal codes with more than one claim, pick the ones with the
# lowest and the highest claim frequency.
claims_per_zip = df_claims['PostalCode'].value_counts()
eligible_zips = claims_per_zip[claims_per_zip > 1].index
zip_claims = df_claims[df_claims['PostalCode'].isin(eligible_zips)]  # claim rows of eligible zip codes

# Claim frequency has to be measured over ALL policies of a zip code. Averaging
# HasClaim over claim rows only (zip_claims) yields 1.0 for every zip code, which
# makes the "lowest"/"highest" pick an arbitrary tie-break.
# groupby sorts its keys and mergesort is stable, so ties resolve deterministically.
zip_freq = (
    df[df['PostalCode'].isin(eligible_zips)]
    .groupby('PostalCode')['HasClaim']
    .mean()
    .sort_values(kind='mergesort')
)

if len(zip_freq) >= 2:
    zip_low_code = zip_freq.index[0]  # Lowest claim frequency with claims
    zip_high_code = zip_freq.index[-1]  # Highest claim frequency with claims
else:
    print("Insufficient zip codes with claims; using all available.")
    # Fallback: no frequency ranking is possible, so fall back to the postal
    # codes with the fewest and the most rows in df.
    policies_per_zip = df['PostalCode'].value_counts()
    zip_low_code = policies_per_zip.idxmin()
    zip_high_code = policies_per_zip.idxmax()

zip_low = df[df['PostalCode'] == zip_low_code]
zip_high = df[df['PostalCode'] == zip_high_code]

# Gender: Female vs. Male
# NOTE(review): "female" is every row whose Gender_Male flag equals 0. If the
# preprocessing also encodes an unspecified gender, those rows land here too;
# confirm against the preprocessor's output columns.
gender_flag = df['Gender_Male']
female = df.loc[gender_flag.eq(0)]
male = df.loc[gender_flag.eq(1)]

# Equivalence Check
# Welch's t-test (unequal variances) on SumInsured between the two provinces.
# NOTE(review): a large p-value only means no difference was detected; it does
# not by itself demonstrate that the groups are equivalent.
sumins_wcape = wcape['SumInsured'].dropna()
sumins_gauteng = gauteng['SumInsured'].dropna()
t_sumins_wc_gauteng = stats.ttest_ind(sumins_wcape, sumins_gauteng, equal_var=False)
print(f'SumInsured equivalence (WC vs Gauteng) p-value: {t_sumins_wc_gauteng.pvalue}')


  df = pd.read_csv(csv_path)


SumInsured equivalence (WC vs Gauteng) p-value: 0.07971320414125885


In [None]:
# Statistical Testing with Error Handling
#
# Claim frequency is tested with a chi-squared test of independence, claim
# severity and margin with Welch's t-test. A test that cannot be run on the
# available data reports NaN instead of a misleading p-value.
#
# NOTE(review): seven tests are each read at alpha = 0.05 with no correction for
# multiple comparisons, and the two zip codes are chosen by their claim frequency
# before that same frequency is tested. Treat borderline p-values with care.


def _chi2_test(contingency):
    """Run a chi-squared independence test on a contingency table.

    Returns:
        (chi2, p). Both are NaN when the table is empty or has fewer than two
        rows or two columns: such a table has zero degrees of freedom, and
        scipy would report p = 1.0 although nothing was actually compared.
    """
    if contingency.empty or min(contingency.shape) < 2:
        return np.nan, np.nan
    chi2_stat, p_value, _, _ = stats.chi2_contingency(contingency)
    return chi2_stat, p_value


def _welch_ttest(sample_a, sample_b):
    """Run Welch's t-test (unequal variances) on two samples.

    Returns:
        The scipy result (indexable as (statistic, pvalue)), or (NaN, NaN) when
        either sample has fewer than two observations.
    """
    if len(sample_a) < 2 or len(sample_b) < 2:
        return (np.nan, np.nan)
    return stats.ttest_ind(sample_a, sample_b, equal_var=False)


# Provinces
prov_cols = ['Province_Free State', 'Province_Gauteng', 'Province_KwaZulu-Natal', 'Province_Limpopo',
             'Province_Mpumalanga', 'Province_North West', 'Province_Northern Cape', 'Province_Western Cape']
prov_data = df[prov_cols]
prov_has_claim = df['HasClaim']
# idxmax returns the FIRST column for a row in which no province flag is set, which
# would silently count that row as 'Province_Free State'. Give such rows their own
# category instead.
prov_flagged = prov_data.eq(1).any(axis=1)
prov_label = prov_data.idxmax(axis=1).where(prov_flagged, 'Province_Other')
prov_contingency = pd.crosstab(prov_label, prov_has_claim)
chi2_prov_freq, p_prov_freq = _chi2_test(prov_contingency)

sev_gauteng = df_claims.loc[df_claims.index.isin(gauteng.index), 'TotalClaims'].dropna()
sev_wcape = df_claims.loc[df_claims.index.isin(wcape.index), 'TotalClaims'].dropna()
t_sev_prov = _welch_ttest(sev_gauteng, sev_wcape)
p_sev_prov = t_sev_prov[1]

margin_gauteng = df.loc[df.index.isin(gauteng.index), 'Margin'].dropna()
margin_wcape = df.loc[df.index.isin(wcape.index), 'Margin'].dropna()
t_margin_prov = _welch_ttest(margin_gauteng, margin_wcape)
p_margin_prov = t_margin_prov[1]

# Zip Codes
zip_rows = zip_low.index.union(zip_high.index)
zip_contingency = pd.crosstab(df.loc[zip_rows, 'PostalCode'], df.loc[zip_rows, 'HasClaim'])
chi2_zip_freq, p_zip_freq = _chi2_test(zip_contingency)

sev_zip_high = df_claims.loc[df_claims.index.isin(zip_high.index), 'TotalClaims'].dropna()
sev_zip_low = df_claims.loc[df_claims.index.isin(zip_low.index), 'TotalClaims'].dropna()
t_sev_zip = _welch_ttest(sev_zip_high, sev_zip_low)
p_sev_zip = t_sev_zip[1]

margin_zip_high = df.loc[df.index.isin(zip_high.index), 'Margin'].dropna()
margin_zip_low = df.loc[df.index.isin(zip_low.index), 'Margin'].dropna()
t_margin_zip = _welch_ttest(margin_zip_high, margin_zip_low)
p_margin_zip = t_margin_zip[1]

# Gender
# Both crosstab inputs are taken from the same rows (female and male groups combined).
gender_rows = female.index.union(male.index)
gender_contingency = pd.crosstab(df.loc[gender_rows, 'Gender_Male'], df.loc[gender_rows, 'HasClaim'])
chi2_gender_freq, p_gender_freq = _chi2_test(gender_contingency)

# Results
results = {
    'Provinces': {'Claim Frequency': {'p': p_prov_freq}, 'Claim Severity': {'p': p_sev_prov}, 'Margin': {'p': p_margin_prov}},
    'Zip Codes': {'Claim Frequency': {'p': p_zip_freq}, 'Claim Severity': {'p': p_sev_zip}, 'Margin': {'p': p_margin_zip}},
    'Gender': {'Claim Frequency': {'p': p_gender_freq}}
}
for feature, metrics in results.items():
    for metric, value in metrics.items():
        # A NaN p-value means the test was skipped (see the helpers above).
        print(f"{feature} - {metric} p-value: {value['p']:.4f}")

# Sample sizes for debugging
print(f"Gauteng Claims: {len(sev_gauteng)}, WCape Claims: {len(sev_wcape)}")
print(f"Zip High Claims: {len(sev_zip_high)}, Zip Low Claims: {len(sev_zip_low)}")


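# Analysis and Report

Each hypothesis is tested at a significance level of α = 0.05: H₀ (no difference between the groups) is rejected when p < 0.05. The `{...}` expressions below are Python f-string templates; a Markdown cell does not evaluate them, so fill in the p-values printed by the cell above, or render this text from a code cell with `IPython.display.Markdown`. A NaN p-value means the test could not be run and must not be read as "reject".
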
- **Provinces**: 
  - Claim Frequency p = {p_prov_freq:.4f}, {('fail to reject' if p_prov_freq >= 0.05 else 'reject')} H₀.
  - Claim Severity p = {p_sev_prov:.4f}, {('fail to reject' if p_sev_prov >= 0.05 else 'reject')} H₀.
  - Margin p = {p_margin_prov:.4f}, {('fail to reject' if p_margin_prov >= 0.05 else 'reject')} H₀.
- **Zip Codes**: 
  - Claim Frequency p = {p_zip_freq:.4f}, {('fail to reject' if p_zip_freq >= 0.05 else 'reject')} H₀.
  - Claim Severity p = {p_sev_zip:.4f}, {('fail to reject' if p_sev_zip >= 0.05 else 'reject')} H₀ (only valid when both zip codes have at least two claims; otherwise p is NaN and no conclusion can be drawn).
  - Margin p = {p_margin_zip:.4f}, {('fail to reject' if p_margin_zip >= 0.05 else 'reject')} H₀.
- **Gender**: 
  - Claim Frequency p = {p_gender_freq:.4f}, {('fail to reject' if p_gender_freq >= 0.05 else 'reject')} H₀.

# Interpretation & Business Recommendation

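- **Provinces**: If Claim Severity H₀ is rejected (e.g., p = 0.0306), mean claim severity differs between Gauteng and Western Cape. The test is two-sided, so compare the group means to see which province is higher; if Gauteng has the higher claim severity, this suggests a premium increase of 5-10% in high-risk provinces.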
- **Zip Codes**: If Claim Frequency H₀ is rejected (e.g., p = 0.0242), high-claim zip codes may need targeted risk mitigation or higher premiums. Investigate Claim Severity if data becomes available.
- **Gender**: If Claim Frequency H₀ is rejected (e.g., p = 0.0198), a gender-based pricing adjustment could be considered, pending regulatory review.
