In [2]:
import pandas as pd 
import scipy.stats as stats

df = pd.read_csv('../data/raw/MachineLearningRating_v3.csv')

df['IsVATRegistered'] = df['IsVATRegistered'].astype('bool')

# split data into two groups
group_a = df[df['IsVATRegistered'] == False]['TotalPremium']
group_b = df[df['IsVATRegistered'] == True]['TotalPremium']

# calculate summary statistics
print("Group A (Not VAT Registered):")
print(group_a.describe())

print("\nGroup B (VAT Registered):")
print(group_b.describe())

# perform an independent t-test
t_stat, p_value = stats.ttest_ind(group_a, group_b, nan_policy='omit')

print("\nT-statistic:", t_stat)
print("P-value:", p_value)

# interpret the results
if p_value < 0.05:
    print("\nThe difference in TotalPremium between the two groups is statistically significant.")
else:   
    print("\nThe difference in TotalPremium between the two groups is not statistically significant.")

  df = pd.read_csv('../data/raw/MachineLearningRating_v3.csv')


Group A (Not VAT Registered):
count    995075.000000
mean         61.967615
std         230.721341
min        -782.576754
25%           0.000000
50%           2.178333
75%          21.929825
max       65282.603420
Name: TotalPremium, dtype: float64

Group B (VAT Registered):
count    5023.000000
mean       49.599579
std       113.961412
min       -25.466893
25%         0.000000
50%         2.003772
75%        35.160702
max       772.472017
Name: TotalPremium, dtype: float64

T-statistic: 3.796882191780002
P-value: 0.00014653602633548027

The difference in TotalPremium between the two groups is statistically significant.


In [3]:
df['TotalPremium'] = pd.to_numeric(df['TotalPremium'], errors='coerce')

# filter out rows where TotalPremium is missing
df = df[df['TotalPremium'] > 0]

# split data into two groups
group_a = df[df['IsVATRegistered'] == False]['TotalPremium']
group_b = df[df['IsVATRegistered'] == True]['TotalPremium']

# calculate summary statistics
print("\nGroup A (Not VAT Registered):")
print(group_a.describe())

print("\nGroup B (VAT Registered):")
print(group_b.describe())

# Perform an independent t-test
t_stat, p_value = stats.ttest_ind(group_a, group_b, nan_policy='omit')

print("\nT-statistic:", t_stat)
print("P-value:", p_value)

# Interpret the results
if p_value < 0.05:
    print("\nThe difference in TotalPremium between the two groups is statistically significant.")
else:
    print("\nThe difference in TotalPremium between the two groups is not statistically significant.")


Group A (Not VAT Registered):
count    614864.000000
mean        100.337548
std         286.833342
min           0.000011
25%           2.632127
50%           7.275088
75%          78.947368
max       65282.603420
Name: TotalPremium, dtype: float64

Group B (VAT Registered):
count    3312.000000
mean       75.230722
std       133.300756
min         0.065127
25%         2.171491
50%        15.456989
75%        91.373860
max       772.472017
Name: TotalPremium, dtype: float64

T-statistic: 5.034479611067607
P-value: 4.792828460835898e-07

The difference in TotalPremium between the two groups is statistically significant.


In [7]:
df['TotalPremium'] = pd.to_numeric(df['TotalPremium'], errors='coerce')
df['TotalClaims'] = pd.to_numeric(df['TotalClaims'], errors='coerce')
df['RegistrationYear'] = pd.to_numeric(df['RegistrationYear'], errors='coerce')

# filter out invalid or 0 values for TotalPremium, TotalClaims
df = df[(df['TotalPremium'] >= 0) & (df['TotalClaims'] >= 0)]

# define hypothesis
hypotheses = [
    {
        "name": "VAT registration hypothesis",
        "group_a": df[df['IsVATRegistered'] == False]['TotalPremium'],
        "group_b": df[df['IsVATRegistered'] == True]['TotalPremium']
    },
    {
        "name": "Tracking device hypothesis",
        "group_a": df[df['TrackingDevice'] == "No"]['TotalClaims'],
        "group_b": df[df['TrackingDevice'] == "Yes"]['TotalClaims']
    },
    {
        "name": "New vehicle hypothesis",
        "group_a": df[df['NewVehicle'] == "No"]['TotalPremium'],
        "group_b": df[df['NewVehicle'] == "Yes"]['TotalPremium']
    },
    {
        "name": "Alarm immobilizer hypothesis",
        "group_a": df[df['AlarmImmobiliser'] == "No"]['TotalClaims'],
        "group_b": df[df['AlarmImmobiliser'] == "Yes"]['TotalClaims']
    },
    {
        "name": "Vehicle age hypothesis",
        "group_a": df[df['RegistrationYear'] <= 2018]['TotalPremium'],
        "group_b": df[df['RegistrationYear'] > 2018]['TotalPremium']
    },
]

# run A/B tests for each hypothesis
for hypothesis in hypotheses:
    group_a = hypothesis["group_a"].dropna()
    group_b = hypothesis["group_b"].dropna()

    # calculate summary statistics
    print(f"\n--- {hypothesis['name']} ---")
    print("Group A Summary:")
    print(group_a.describe())

    print("\nGroup B Summary:")
    print(group_b.describe())

    # perform t-test
    t_stat, p_value = stats.ttest_ind(group_a, group_b, nan_policy='omit')
    print("\nT-statistic: {t_stat}")
    print("P-value: {p_value}")

    if p_value < 0.05:
        print("Result: Statistically Significant Difference (Reject Null Hypothesis)")
    else:
        print("Result: No Statistically Significant Difference (Fail to Reject Null Hypothesis)")


--- VAT registration hypothesis ---
Group A Summary:
count    614862.000000
mean        100.335902
std         286.832350
min           0.000011
25%           2.631996
50%           7.275088
75%          78.947368
max       65282.603420
Name: TotalPremium, dtype: float64

Group B Summary:
count    3312.000000
mean       75.230722
std       133.300756
min         0.065127
25%         2.171491
50%        15.456989
75%        91.373860
max       772.472017
Name: TotalPremium, dtype: float64

T-statistic: {t_stat}
P-value: {p_value}
Result: Statistically Significant Difference (Reject Null Hypothesis)

--- Tracking device hypothesis ---
Group A Summary:
count    401819.000000
mean         96.749814
std        2858.707275
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max      393092.105300
Name: TotalClaims, dtype: float64

Group B Summary:
count    216355.000000
mean        107.299809
std        3219.493878
min           0.000000
25%          

  t_stat, p_value = stats.ttest_ind(group_a, group_b, nan_policy='omit')
  t_stat, p_value = stats.ttest_ind(group_a, group_b, nan_policy='omit')
