In [None]:
# Cell 1
import pandas as pd
from src.stat_tests import load_data, prepare_metrics, run_all
# Read the processed CSV and feed it straight through prepare_metrics; `df` is
# the frame every later cell works from.
# NOTE(review): prepare_metrics presumably derives columns used below (e.g.
# `has_claim`) — confirm in src.stat_tests.
df = prepare_metrics(load_data("data/processed_data.csv"))
# Bare last expression: the notebook shows the (rows, columns) tuple.
df.shape


In [None]:
# Cell 2: descriptive stats & missing values
# A single display call renders both tables in order: first the transposed
# describe() summary over all columns, then the per-column count of missing
# values.
display(
    df.describe(include='all').T,
    df.isna().sum(),
)


In [None]:
# Cell 3: KPIs
# Portfolio-level figures computed from `df`:
#   overall_loss_ratio - sum of TotalClaims divided by sum of TotalPremium
#   claim_freq         - mean of the has_claim column
#   claim_sev          - mean TotalClaims over rows where has_claim equals 1
claim_mask = df['has_claim'] == 1

overall_loss_ratio = df['TotalClaims'].sum() / df['TotalPremium'].sum()
claim_freq = df['has_claim'].mean()
claim_sev = df.loc[claim_mask, 'TotalClaims'].mean()

# Labels are kept exactly as they were so the printed report is unchanged.
for kpi_label, kpi_value in (
    ("Overall loss ratio: ", overall_loss_ratio),
    ("Claim frequency: ", claim_freq),
    ("Claim severity (given claim): ", claim_sev),
):
    print(kpi_label, kpi_value)


In [None]:
# Cell 4: run all tests and show results
# run_all (imported from src.stat_tests) is handed the same CSV path that
# Cell 1 loads, not the in-memory `df`.
# NOTE(review): presumably it re-reads and re-prepares the file internally —
# confirm; if so, any changes made to `df` in this notebook are not reflected
# in `res`.
res = run_all(path="data/processed_data.csv")
# Bare last expression so the notebook renders whatever run_all returned.
res


In [None]:
# Cell 5: show province frequencies and loss ratios
# One chained pipeline: aggregate per Province, derive the loss ratio from the
# aggregated totals, then order provinces from highest to lowest loss ratio.
prov = (
    df.groupby('Province')
    .agg(
        policies=pd.NamedAgg(column='PolicyID', aggfunc='nunique'),
        claims=pd.NamedAgg(column='has_claim', aggfunc='sum'),
        total_claims=pd.NamedAgg(column='TotalClaims', aggfunc='sum'),
        total_premium=pd.NamedAgg(column='TotalPremium', aggfunc='sum'),
    )
    # loss_ratio = aggregated claims / aggregated premium for the province
    .assign(loss_ratio=lambda totals: totals['total_claims'] / totals['total_premium'])
    .sort_values('loss_ratio', ascending=False)
)
# Show at most the 15 provinces with the highest loss ratio.
prov.head(15)


In [None]:
# Cell 6: visual - proportion claims by Gender
import matplotlib.pyplot as plt
# Mean of has_claim per Gender, ordered ascending so bars grow left to right.
prop = df.groupby('Gender')['has_claim'].mean().sort_values()

# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(6,4))
# Gender labels are cast to str so they are drawn as categorical tick labels.
ax.bar(prop.index.astype(str), prop.values)
ax.set_title("Claim Frequency by Gender")
ax.set_ylabel("Proportion with at least one claim")
plt.show()


In [None]:
import numpy as np
def bootstrap_mean_diff_ci(x, y, n_boot=1000, alpha=0.05, seed=None):
    """Percentile-bootstrap confidence interval for ``mean(x) - mean(y)``.

    Each replicate resamples ``x`` and ``y`` independently, with replacement
    and at their original sizes, and records the difference of the two
    resampled means.

    Parameters
    ----------
    x, y : array-like, one-dimensional
        The two samples to compare. Each must hold at least one observation.
        NaN values are not removed; any NaN propagates into the result.
    n_boot : int, default 1000
        Number of bootstrap replicates (must be >= 1).
    alpha : float, default 0.05
        Total tail probability; the interval spans the ``alpha/2`` and
        ``1 - alpha/2`` quantiles of the replicate differences. Must lie in
        ``[0, 1]``.
    seed : None, int, or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global legacy random state, so
        existing ``np.random.seed(...)`` calls keep controlling the result.
        Any other value is handed to ``np.random.default_rng`` to give a
        self-contained, reproducible stream.

    Returns
    -------
    tuple of (mean_diff, lo, hi)
        ``mean_diff`` is the mean of the bootstrap differences (not the
        difference of the raw sample means); ``lo`` and ``hi`` are the
        percentile bounds.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is empty or not one-dimensional, if ``n_boot < 1``,
        or if ``alpha`` is outside ``[0, 1]``.
    """
    # Convert once up front instead of on every resampling iteration.
    x_arr = np.asarray(x)
    y_arr = np.asarray(y)

    if x_arr.ndim != 1 or y_arr.ndim != 1:
        raise ValueError("x and y must be one-dimensional")
    # An empty sample would otherwise yield NaNs with only a RuntimeWarning.
    if x_arr.size == 0 or y_arr.size == 0:
        raise ValueError("x and y must each contain at least one observation")
    if n_boot < 1:
        raise ValueError("n_boot must be a positive integer")
    # alpha > 1 would silently produce an inverted interval (lo > hi).
    if not 0.0 <= alpha <= 1.0:
        raise ValueError("alpha must lie in [0, 1]")

    # The np.random module and a Generator both expose `.choice`, so the
    # sampling loop is the same whichever source is selected.
    rng = np.random if seed is None else np.random.default_rng(seed)

    diffs = np.empty(n_boot, dtype=float)
    for i in range(n_boot):
        x_s = rng.choice(x_arr, size=x_arr.size, replace=True)
        y_s = rng.choice(y_arr, size=y_arr.size, replace=True)
        diffs[i] = x_s.mean() - y_s.mean()

    lo, hi = np.percentile(diffs, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return diffs.mean(), lo, hi
