In [62]:
import pandas as pd
from scipy import stats

In [63]:
# Expected schema of the cleaned CSV, in file order. The list is positional:
# entry i labels column i of the loaded frame, so the order must not change.
# (Blank-line grouping below is only for readability and is based on the
# names alone.)
column_names = [
    'UnderwrittenCoverID',
    'PolicyID',
    'TransactionMonth',

    'IsVATRegistered',
    'Citizenship',
    'LegalType',
    'Title',
    'Language',
    'Bank',
    'AccountType',
    'MaritalStatus',
    'Gender',

    'Country',
    'Province',
    'PostalCode',
    'MainCrestaZone',
    'SubCrestaZone',

    'ItemType',
    'mmcode',
    'VehicleType',
    'RegistrationYear',
    'make',
    'Model',
    'Cylinders',
    'cubiccapacity',
    'kilowatts',
    'bodytype',
    'NumberOfDoors',
    'VehicleIntroDate',
    'CustomValueEstimate',
    'AlarmImmobiliser',
    'TrackingDevice',
    'CapitalOutstanding',
    'NewVehicle',
    'WrittenOff',
    'Rebuilt',
    'Converted',
    'CrossBorder',
    'NumberOfVehiclesInFleet',

    'SumInsured',
    'TermFrequency',
    'CalculatedPremiumPerTerm',
    'ExcessSelected',
    'CoverCategory',
    'CoverType',
    'CoverGroup',
    'Section',
    'Product',
    'StatutoryClass',
    'StatutoryRiskType',

    'TotalPremium',
    'TotalClaims',
]

In [64]:
# Read the cleaned dataset into memory.
#   header=None      -> no row of the file is used as column labels; columns
#                       come back numbered 0..N-1 and are named in a later cell.
#   low_memory=False -> parse each column in one pass instead of in chunks, so
#                       dtype inference is done over the whole column.
# NOTE(review): if the CSV itself starts with a header row, header=None loads
# that row as data - confirm against the file.
data = pd.read_csv(
    '../data/cleaned_data.csv',
    header=None,
    low_memory=False,
)


In [65]:
# Set the column names.
# The frame was read with header=None, so its columns are positional; this
# assigns the expected schema names to them in order. pandas raises a
# ValueError here if the number of names differs from the number of columns.
data.columns = column_names

In [71]:
# Force the two monetary columns to a numeric dtype. With errors='coerce',
# any entry that cannot be parsed as a number becomes NaN instead of raising.
for money_col in ('TotalPremium', 'TotalClaims'):
    data[money_col] = pd.to_numeric(data[money_col], errors='coerce')

In [82]:
# Hypothesis 1: Test risk differences across provinces
#
# H0: the mean TotalPremium of policies in Gauteng equals that of policies in
#     Western Cape.
# KPI: 'TotalPremium', compared with an independent two-sample t-test.
#
# NOTE(review): TotalPremium is used here as the stand-in for "risk"; a
# claims-based metric may be closer to the stated hypothesis - confirm.
# NOTE(review): only two provinces are compared, so the result speaks to this
# pair rather than to all provinces.

# Data Segmentation
# Group A (Control Group): Plans with Province = 'Gauteng'
control_group_province = data.loc[data['Province'] == 'Gauteng', 'TotalPremium']
# Group B (Test Group): Plans with Province = 'Western Cape'
test_group_province = data.loc[data['Province'] == 'Western Cape', 'TotalPremium']

# TotalPremium was converted with errors='coerce', so it may contain NaN.
# ttest_ind propagates NaN by default, which would turn the whole result into
# NaN; drop missing values before testing.
control_premiums = control_group_province.dropna()
test_premiums = test_group_province.dropna()

# Perform t-test
# NOTE(review): this is Student's t-test (equal variances assumed); consider
# equal_var=False (Welch) if the group variances differ.
t_statistic, p_value = stats.ttest_ind(control_premiums, test_premiums)

# Print results
print("Hypothesis 1: Test risk differences across provinces")
print("-----------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value
# alpha is defined here and reused by the later hypothesis cells.
alpha = 0.05
if pd.isna(p_value):
    # A NaN p-value (e.g. an empty group) must not be reported as
    # "fail to reject": NaN < alpha is simply False.
    print("Test inconclusive: p-value is NaN (a group may be empty or have too few observations).")
elif p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences across provinces.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences across provinces.")


Hypothesis 1: Test risk differences across provinces
-----------------------------------------------------
T-statistic: 4.6537430154774375
P-value: 3.260368633786302e-06
Reject Null Hypothesis: There are significant risk differences across provinces.


In [84]:
# Hypothesis 2: There are no risk differences between zip codes
#
# H0: the mean TotalPremium of policies in postal code 1459 equals that of
#     policies in postal code 7784.
# KPI: 'TotalPremium', compared with an independent two-sample t-test.
#
# NOTE(review): TotalPremium is used as the stand-in for "risk" - confirm.

# Data Segmentation
# PostalCode is compared against string literals, so this relies on the
# column holding strings rather than integers.
# Group A (Control Group): Plans with PostalCode = '1459'
control_group_zipcode = data[data['PostalCode'] == '1459']
# Group B (Test Group): Plans with PostalCode = '7784'
test_group_zipcode = data[data['PostalCode'] == '7784']

# TotalPremium was converted with errors='coerce', so it may contain NaN.
# ttest_ind propagates NaN by default; drop missing values before testing.
control_premiums = control_group_zipcode['TotalPremium'].dropna()
test_premiums = test_group_zipcode['TotalPremium'].dropna()

# Perform t-test
# NOTE(review): Student's t-test (equal variances assumed); consider
# equal_var=False (Welch) if the group variances differ.
t_statistic, p_value = stats.ttest_ind(control_premiums, test_premiums)

# Print results
print("Hypothesis 2: There are no risk differences between zip codes")
print("--------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value (alpha is the significance level set in the Hypothesis 1 cell)
if pd.isna(p_value):
    # A NaN p-value (e.g. no rows matched a postal code) must not be reported
    # as "fail to reject": NaN < alpha is simply False.
    print("Test inconclusive: p-value is NaN (a group may be empty or have too few observations).")
elif p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences between zip codes.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences between zip codes.")


Hypothesis 2: There are no risk differences between zip codes
--------------------------------------------------------------
T-statistic: 4.496320715526884
P-value: 6.9405225411919286e-06
Reject Null Hypothesis: There are significant risk differences between zip codes.


In [86]:
# Hypothesis 3: Test significant margin (profit) difference between zip codes
#
# H0: the mean ProfitMargin of policies in postal code 1459 equals that of
#     policies in postal code 7784.
# KPI: 'ProfitMargin', computed below as TotalPremium - TotalClaims (an
# absolute difference, not a ratio). The column is added to `data`.
data['ProfitMargin'] = data['TotalPremium'] - data['TotalClaims']

# Data Segmentation
# PostalCode is compared against string literals, so this relies on the
# column holding strings rather than integers.
# Group A (Control Group): Plans with PostalCode = '1459'
control_group_zipcode = data[data['PostalCode'] == '1459']
# Group B (Test Group): Plans with PostalCode = '7784'
test_group_zipcode = data[data['PostalCode'] == '7784']

# ProfitMargin is NaN wherever TotalPremium or TotalClaims is NaN (both were
# converted with errors='coerce'). ttest_ind propagates NaN by default; drop
# missing values before testing.
control_margins = control_group_zipcode['ProfitMargin'].dropna()
test_margins = test_group_zipcode['ProfitMargin'].dropna()

# Perform t-test
# NOTE(review): Student's t-test (equal variances assumed); consider
# equal_var=False (Welch) if the group variances differ.
t_statistic, p_value = stats.ttest_ind(control_margins, test_margins)

# Print results
# The title previously said "Hypothesis 2", mislabelling this cell's output.
print("Hypothesis 3: Test significant margin (profit) difference between zip codes")
print("------------------------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value (alpha is the significance level set in the Hypothesis 1 cell)
if pd.isna(p_value):
    # A NaN p-value (e.g. no rows matched a postal code) must not be reported
    # as "fail to reject": NaN < alpha is simply False.
    print("Test inconclusive: p-value is NaN (a group may be empty or have too few observations).")
elif p_value < alpha:
    print("Reject Null Hypothesis: There are significant margin (profit) differences between zip codes.")
else:
    print("Fail to reject Null Hypothesis: There are no significant margin (profit) differences between zip codes.")

Hypothesis 2: Test significant margin (profit) difference between zip codes
------------------------------------------------------------------------------
T-statistic: 0.7861667534417327
P-value: 0.43177617933297485
Fail to reject Null Hypothesis: There are no significant margin (profit) differences between zip codes.


In [87]:
# Hypothesis 4: Test significant risk differences between Women and Men
#
# H0: the mean TotalPremium of policies with Gender = 'Male' equals that of
#     policies with Gender = 'Female'.
# KPI: 'TotalPremium', compared with an independent two-sample t-test.
#
# NOTE(review): TotalPremium is used as the stand-in for "risk" - confirm.
# Rows whose Gender is neither 'Male' nor 'Female' are excluded from both groups.

# Data Segmentation
# Group A (Control Group): Plans with Gender = 'Male'
control_group_gender = data[data['Gender'] == 'Male']
# Group B (Test Group): Plans with Gender = 'Female'
test_group_gender = data[data['Gender'] == 'Female']

# TotalPremium was converted with errors='coerce', so it may contain NaN.
# ttest_ind propagates NaN by default; drop missing values before testing.
control_premiums = control_group_gender['TotalPremium'].dropna()
test_premiums = test_group_gender['TotalPremium'].dropna()

# Perform t-test
# NOTE(review): Student's t-test (equal variances assumed); consider
# equal_var=False (Welch) if the group variances differ.
t_statistic, p_value = stats.ttest_ind(control_premiums, test_premiums)

# Print results
print("Hypothesis 4: Test significant risk differences between Women and Men")
print("-----------------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value (alpha is the significance level set in the Hypothesis 1 cell)
if pd.isna(p_value):
    # A NaN p-value (e.g. an empty group) must not be reported as
    # "fail to reject": NaN < alpha is simply False.
    print("Test inconclusive: p-value is NaN (a group may be empty or have too few observations).")
elif p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences between Women and Men.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences between Women and Men.")

Hypothesis 4: Test significant risk differences between Women and Men
-----------------------------------------------------------------------
T-statistic: -5.118420932688848
P-value: 3.0925282750010697e-07
Reject Null Hypothesis: There are significant risk differences between Women and Men.
