# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency

# Load the dataset
# NOTE(review): `df` is presumably created earlier (e.g. in a previous notebook
# cell); uncomment the line below to load it from disk instead.
# df = pd.read_csv("insurance_data.csv")

# KPI metrics
# Risk proxy: the total claims recorded on each row.
kpi_risk = df["TotalClaims"]
# Profit-margin proxy: premium collected minus claims.
kpi_margin = df["TotalPremium"] - kpi_risk

# Store both KPIs as columns so that row subsets of `df` carry them along.
df["kpi_risk"] = kpi_risk
df["kpi_margin"] = kpi_margin


# Null Hypotheses
# Each string is both the human-readable statement printed in the report and
# the key the A/B testing loop compares against verbatim to pick the test to
# run, so the wording must not be edited without updating that loop as well.
null_hypotheses = [
    "There are no risk differences across provinces",
    "There are no risk differences between zip codes",
    "There are no significant margin (profit) difference between zip codes",
    "There are no significant risk differences between Women and Men"
]

# Data Segmentation
def segment_data(df, group_col, feature_val1, feature_val2):
    """Split *df* into the two row subsets that are to be compared.

    Args:
        df: Source DataFrame.
        group_col: Name of the column whose values define the groups.
        feature_val1: Value of ``group_col`` that selects the first group.
        feature_val2: Value of ``group_col`` that selects the second group.

    Returns:
        A ``(group_a, group_b)`` tuple of DataFrames containing the rows where
        ``group_col`` equals ``feature_val1`` and ``feature_val2``
        respectively. A subset is empty when no row matches its value.
    """
    column = df[group_col]
    first, second = (
        df[column == wanted] for wanted in (feature_val1, feature_val2)
    )
    return first, second

# Statistical Testing Functions
def t_test(group_a, group_b, metric):
    """Return the p-value of an independent two-sample t-test on *metric*.

    Args:
        group_a: DataFrame holding the first sample.
        group_b: DataFrame holding the second sample.
        metric: Name of the numeric column to compare between the groups.

    Returns:
        The p-value reported by ``scipy.stats.ttest_ind``. NaN observations
        are ignored (``nan_policy="omit"``); every other option is left at
        SciPy's default.
    """
    sample_a = group_a[metric].values
    sample_b = group_b[metric].values
    outcome = ttest_ind(sample_a, sample_b, nan_policy="omit")
    return outcome.pvalue
def chi_square_test(group_a, group_b, feature_col):
    """Return the p-value of a chi-squared test of independence.

    Tests whether the distribution of the categorical column *feature_col*
    differs between the two groups.

    Args:
        group_a: DataFrame holding the first group.
        group_b: DataFrame holding the second group.
        feature_col: Name of the categorical column to compare.

    Returns:
        The p-value from ``scipy.stats.chi2_contingency``, or ``nan`` when the
        contingency table has fewer than two rows or fewer than two columns
        (an empty group or a single observed category), because then there is
        nothing to compare.
    """
    values = pd.concat(
        [group_a[feature_col], group_b[feature_col]], ignore_index=True
    )
    # One membership label per row, in the same order as `values`.
    membership = np.repeat(
        ["group_a", "group_b"], [len(group_a), len(group_b)]
    )

    # Rows are the groups, columns the observed categories. Cross-tabulating
    # the feature against itself would only produce a diagonal table that says
    # nothing about how the groups differ.
    contingency_table = pd.crosstab(membership, values.to_numpy())

    if min(contingency_table.shape) < 2:
        return float("nan")

    _, p_value, _, _ = chi2_contingency(contingency_table)
    return p_value

# Perform A/B Testing
# Row counts noted for the two provinces being compared:
# Gauteng          393865
# Western Cape     170796
#
# Each null hypothesis maps to (grouping column, group A value, group B value,
# KPI column). All of them compare a numeric KPI between two groups, so all of
# them use the two-sample t-test; the gender hypothesis is about *risk*, so it
# compares `kpi_risk` between the genders rather than cross-tabulating the
# Gender column, which cannot measure risk.
# NOTE(review): postal codes are compared as strings. If df["PostalCode"]
# holds integers, both groups will be empty and the p-value NaN — confirm the
# column dtype.
test_plan = {
    "There are no risk differences across provinces":
        ("Province", "Western Cape", "Gauteng", "kpi_risk"),
    "There are no risk differences between zip codes":
        ("PostalCode", "1000", "2000", "kpi_risk"),
    "There are no significant margin (profit) difference between zip codes":
        ("PostalCode", "1000", "2000", "kpi_margin"),
    "There are no significant risk differences between Women and Men":
        ("Gender", "Female", "Male", "kpi_risk"),
}

results = []
for hypothesis in null_hypotheses:
    plan = test_plan.get(hypothesis)
    if plan is None:
        # Hypotheses without a configured test are skipped.
        continue
    group_col, value_a, value_b, metric = plan
    group_a, group_b = segment_data(df, group_col, value_a, value_b)
    p_value = t_test(group_a, group_b, metric)
    results.append((hypothesis, p_value))

# Analyze and Report Results
ALPHA = 0.05  # significance level for rejecting a null hypothesis

report = []
for hypothesis, p_value in results:
    if pd.isna(p_value):
        # Any comparison with NaN is False, so without this branch an
        # undefined p-value (e.g. from an empty group) would be reported as
        # "Fail to Reject Null Hypothesis".
        decision = "Inconclusive (p-value undefined)"
    elif p_value < ALPHA:
        decision = "Reject Null Hypothesis"
    else:
        decision = "Fail to Reject Null Hypothesis"
    report.append((hypothesis, decision, f"P-value: {p_value}"))

# Display Results
print("A/B Testing Results:")
for hypothesis, decision, p_value_text in report:
    print(f"{hypothesis}: {decision} ({p_value_text})")
