In [6]:
import pandas as pd 
from scipy import stats
import matplotlib.pyplot as plt


In [3]:
# Location of the raw ratings extract, relative to the notebook's working directory.
file_path = "../Data/MachineLearningRating_v3.txt"

# The extract is pipe-delimited. low_memory=False makes pandas read the file in
# one pass instead of in chunks.
df = pd.read_csv(
    filepath_or_buffer=file_path,
    delimiter='|',
    low_memory=False,
)

In [7]:
def test_hypothesis(group_a, group_b, feature, kpi, hypothesis_count):
    """
    Performs a two-sample t-test to compare the means of two groups.

    Args:
      group_a: Data for group A.
      group_b: Data for group B.
      feature: Feature being tested.
      kpi: Key performance indicator.
      hypothesis_count: Counter variable for the number of hypotheses displayed.

    Returns:
      p-value from the t-test.
    """

    # Perform t-test
    p_value = stats.ttest_ind(group_a[kpi], group_b[kpi]).pvalue

    # Increment hypothesis count
    hypothesis_count += 1

    print(f"Hypothesis Test {hypothesis_count} for {feature}:")
    print(f"  - p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("  - Reject the null hypothesis: There is a significant difference in risk.")
    else:
        print("  - Fail to reject the null hypothesis: No significant difference in risk found.")
    print("-" * 40)

In [8]:
# 1. Risk Differences Across Provinces
# -------------------------------------

# Distinct values of the Province column
provinces = df['Province'].unique()
province_total = len(provinces)

# Lazily enumerate every unordered pair of provinces, in the same order the
# nested index loops would visit them (first index outer, second index inner).
province_pairs = (
    (provinces[first], provinces[second])
    for first in range(province_total)
    for second in range(first + 1, province_total)
)

# Running count of hypotheses displayed so far; later cells keep incrementing it.
hypothesis_count = 0
for province_a, province_b in province_pairs:
    # Only the first five pairwise comparisons are reported.
    if hypothesis_count >= 5:
        break
    group_a = df[df['Province'] == province_a]
    group_b = df[df['Province'] == province_b]
    test_hypothesis(group_a, group_b, f"{province_a} vs. {province_b}", 'TotalClaims', hypothesis_count)
    hypothesis_count += 1

# Repeat the same modification for the remaining hypothesis testing sections (2-5)

Hypothesis Test 1 for Gauteng vs. KwaZulu-Natal:
  - p-value: 0.1928
  - Fail to reject the null hypothesis: No significant difference in risk found.
----------------------------------------
Hypothesis Test 2 for Gauteng vs. Mpumalanga:
  - p-value: 0.0011
  - Reject the null hypothesis: There is a significant difference in risk.
----------------------------------------
Hypothesis Test 3 for Gauteng vs. Eastern Cape:
  - p-value: 0.0398
  - Reject the null hypothesis: There is a significant difference in risk.
----------------------------------------
Hypothesis Test 4 for Gauteng vs. Western Cape:
  - p-value: 0.0563
  - Fail to reject the null hypothesis: No significant difference in risk found.
----------------------------------------
Hypothesis Test 5 for Gauteng vs. Limpopo:
  - p-value: 0.0325
  - Reject the null hypothesis: There is a significant difference in risk.
----------------------------------------


In [10]:
# 3. Risk Difference Between Women and Men
# -----------------------------------------

# Boolean masks selecting the rows whose Gender is exactly 'Female' / 'Male';
# rows with any other Gender value are left out of both groups.
is_female = df['Gender'] == 'Female'
is_male = df['Gender'] == 'Male'

group_female = df[is_female]
group_male = df[is_male]

# Compare mean TotalClaims between the two groups, then advance the
# notebook-wide hypothesis counter.
test_hypothesis(group_female, group_male, "Female vs. Male", 'TotalClaims', hypothesis_count)
hypothesis_count = hypothesis_count + 1

Hypothesis Test 6 for Female vs. Male:
  - p-value: 0.8041
  - Fail to reject the null hypothesis: No significant difference in risk found.
----------------------------------------


In [11]:
# 4. Risk Differences Across Vehicle Types
# -----------------------------------------

# Cap on the number of pairwise comparisons reported in THIS section.
MAX_VEHICLE_TESTS = 5

# Drop NaN first: `df['VehicleType'] == NaN` matches no rows, so a NaN entry
# would produce an empty group and an uncomputable test.
vehicle_types = df['VehicleType'].dropna().unique()

# Section-local counter. The cap must not be checked against the notebook-wide
# `hypothesis_count`: that counter is already past 5 once the province and
# gender cells have run, which made the original loop break before running a
# single vehicle-type test.
vehicle_tests_run = 0
for i in range(len(vehicle_types)):
    for j in range(i + 1, len(vehicle_types)):
        if vehicle_tests_run >= MAX_VEHICLE_TESTS:
            break
        vehicle_a = vehicle_types[i]
        vehicle_b = vehicle_types[j]
        group_a = df[df['VehicleType'] == vehicle_a]
        group_b = df[df['VehicleType'] == vehicle_b]
        test_hypothesis(group_a, group_b, f"{vehicle_a} vs. {vehicle_b}", 'TotalClaims', hypothesis_count)
        # Keep the notebook-wide numbering going, and track the section cap separately.
        hypothesis_count += 1
        vehicle_tests_run += 1
    if vehicle_tests_run >= MAX_VEHICLE_TESTS:
        break