In [19]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime

# Load dataset safely
try:
    df = pd.read_csv("../data/MachineLearningRating_v3.txt", sep="|")
    print("✅ Dataset loaded successfully. Rows:", len(df))
except FileNotFoundError:
    print("❌ ERROR: File not found. Check the file path.")
except pd.errors.ParserError:
    print("❌ ERROR: File could not be parsed. Check delimiter or file format.")
except Exception as e:
    print("❌ Unexpected error:", e)

# Preview column names
print("\nColumns in dataset:", df.columns.tolist())


  df = pd.read_csv("../data/MachineLearningRating_v3.txt", sep="|")


✅ Dataset loaded successfully. Rows: 1000098

Columns in dataset: ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']


In [20]:
# Create Claim metrics and Margin
try:
    df['HasClaim'] = df['TotalClaims'] > 0
    df['ClaimSeverity'] = df['TotalClaims'] / df['HasClaim'].replace(0, np.nan)
    df['ClaimFrequency'] = df.groupby('Province')['HasClaim'].transform('mean')
    df['Margin'] = df['TotalPremium'] - df['TotalClaims']
    print("✅ KPI fields created successfully.")
except KeyError as e:
    print(f"❌ Missing column: {e}")
except Exception as e:
    print("❌ Error creating KPIs:", e)

# Calculate Vehicle Age
current_year = datetime.now().year
df['VehicleAge'] = current_year - df['RegistrationYear']


✅ KPI fields created successfully.


In [22]:
def chi_square_test(df, category, target='HasClaim'):
    try:
        print(f"\n--- Chi-Square Test: {category} → {target} ---")
        
        if category not in df.columns:
            raise KeyError(f"Column '{category}' not found in dataset.")
        
        contingency = pd.crosstab(df[category], df[target])
        if contingency.empty or contingency.shape[0] < 2:
            raise ValueError("Contingency table invalid for chi-square test.")
        
        chi2, p, dof, expected = stats.chi2_contingency(contingency)
        print("Chi-square Statistic:", chi2)
        print("p-value:", p)
        print("Degrees of Freedom:", dof)
        print("➡ RESULT:", "Significant" if p < 0.05 else "Not Significant")
    
    except KeyError as e:
        print("❌ Missing column:", e)
    except ValueError as e:
        print("❌ Cannot perform test:", e)
    except Exception as e:
        print("❌ Unexpected error:", e)

In [23]:
def anova_test(df, numeric_feature, target='ClaimSeverity', only_claims=False):
    try:
        print(f"\n--- ANOVA Test: {numeric_feature} → {target} ---")

        if numeric_feature not in df.columns:
            raise KeyError(f"Column '{numeric_feature}' not found.")
        
        data = df.copy()
        if only_claims:
            data = data[data['HasClaim'] == 1]
            if len(data) < 3:
                raise ValueError("Not enough claim records for ANOVA.")
        
        # Split numeric feature into 3 quantile-based groups
        data['Group'] = pd.qcut(data[numeric_feature], 3, labels=['Low', 'Medium', 'High'])
        groups = [group[target].dropna() for name, group in data.groupby('Group')]
        if any(len(g) == 0 for g in groups):
            raise ValueError("One or more ANOVA groups are empty.")
        
        f_stat, p = stats.f_oneway(*groups)
        print("F-statistic:", f_stat)
        print("p-value:", p)
        print("➡ RESULT:", "Significant" if p < 0.05 else "Not Significant")
    
    except KeyError as e:
        print("❌ Missing column:", e)
    except ValueError as e:
        print("❌ Invalid ANOVA setup:", e)
    except Exception as e:
        print("❌ Unexpected error:", e)


In [24]:
chi_square_test(df, "Province")
chi_square_test(df, "Gender")

# For PostalCode, only test top 10 most frequent to avoid sparse contingency table
top_postal = df['PostalCode'].value_counts().head(10).index
chi_square_test(df[df['PostalCode'].isin(top_postal)], "PostalCode")



--- Chi-Square Test: Province → HasClaim ---
Chi-square Statistic: 104.19088107029361
p-value: 5.925510718204677e-19
Degrees of Freedom: 8
➡ RESULT: Significant

--- Chi-Square Test: Gender → HasClaim ---
Chi-square Statistic: 7.255926312995721
p-value: 0.026570248768437145
Degrees of Freedom: 2
➡ RESULT: Significant

--- Chi-Square Test: PostalCode → HasClaim ---
Chi-square Statistic: 72.64941061601782
p-value: 4.5932778849314244e-12
Degrees of Freedom: 9
➡ RESULT: Significant


In [25]:
# VehicleAge effect on ClaimSeverity
anova_test(df, "VehicleAge", target="ClaimSeverity", only_claims=True)

# CustomValueEstimate effect on ClaimSeverity
anova_test(df, "CustomValueEstimate", target="ClaimSeverity", only_claims=True)

# Margin difference by PostalCode (use top 10 most frequent)
anova_test(df[df['PostalCode'].isin(top_postal)], "Margin", target="Margin", only_claims=False)



--- ANOVA Test: VehicleAge → ClaimSeverity ---


  groups = [group[target].dropna() for name, group in data.groupby('Group')]


F-statistic: 10.580480368168741
p-value: 2.644385969321076e-05
➡ RESULT: Significant

--- ANOVA Test: CustomValueEstimate → ClaimSeverity ---


  groups = [group[target].dropna() for name, group in data.groupby('Group')]


F-statistic: 7.968219241881727
p-value: 0.00038090087883509144
➡ RESULT: Significant

--- ANOVA Test: Margin → Margin ---


  groups = [group[target].dropna() for name, group in data.groupby('Group')]


F-statistic: 668.5538158211712
p-value: 1.8643341399246364e-290
➡ RESULT: Significant


In [26]:
# Create a summary table of all test results
summary = []

# Example for Chi-Square
for col in ['Province', 'Gender']:
    contingency = pd.crosstab(df[col], df['HasClaim'])
    chi2, p, dof, _ = stats.chi2_contingency(contingency)
    summary.append({'Feature': col, 'Test': 'Chi-Square', 'Statistic': chi2, 'p-value': p, 
                    'Significant': p < 0.05})

# Example for ANOVA
for col in ['VehicleAge', 'CustomValueEstimate']:
    data = df[df['HasClaim'] == 1].copy()
    data['Group'] = pd.qcut(data[col], 3, labels=['Low','Medium','High'])
    groups = [g['ClaimSeverity'].dropna() for _, g in data.groupby('Group')]
    f_stat, p = stats.f_oneway(*groups)
    summary.append({'Feature': col, 'Test': 'ANOVA', 'Statistic': f_stat, 'p-value': p, 
                    'Significant': p < 0.05})

summary_df = pd.DataFrame(summary)
print(summary_df)


               Feature        Test   Statistic       p-value  Significant
0             Province  Chi-Square  104.190881  5.925511e-19         True
1               Gender  Chi-Square    7.255926  2.657025e-02         True
2           VehicleAge       ANOVA   10.580480  2.644386e-05         True
3  CustomValueEstimate       ANOVA    7.968219  3.809009e-04         True


  groups = [g['ClaimSeverity'].dropna() for _, g in data.groupby('Group')]
  groups = [g['ClaimSeverity'].dropna() for _, g in data.groupby('Group')]
