In [41]:
# Import necessary libraries
import pandas as pd
from scipy import stats

In [42]:
# Define the correct column names
column_names = ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 
                'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 
                'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 
                'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 
                'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 
                'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 
                'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 
                'TotalPremium', 'TotalClaims']

In [43]:
# Load the data
data = pd.read_csv('../data/cleaned_data.csv', header=None,low_memory=False)


In [44]:
# Set the column names
data.columns = column_names

In [45]:
# Convert TotalPremium and TotalClaims to numeric
data['TotalPremium'] = pd.to_numeric(data['TotalPremium'], errors='coerce')
data['TotalClaims'] = pd.to_numeric(data['TotalClaims'], errors='coerce')

In [46]:
# Verify the column names
print(data.columns)

Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')


In [32]:
# Define functions for statistical tests
# Define the chi-squared test function
def chi_squared_test(data, col1, col2):
    contingency_table = pd.crosstab(data[col1], data[col2])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    return chi2, p

# Define the t-test function
def t_test(data, group_col, value_col):
    group_a = data[data[group_col] == data[group_col].unique()[0]][value_col]
    group_b = data[data[group_col] == data[group_col].unique()[1]][value_col]
    t_stat, p = stats.ttest_ind(group_a, group_b, nan_policy='omit')
    return t_stat, p

In [33]:
# Null Hypothesis 1: There are no risk differences across provinces
print("Testing risk differences across provinces...")
chi2, p = chi_squared_test(data, 'Province', 'TotalClaims')
print(f"Chi-squared test: chi2={chi2}, p-value={p}")

Testing risk differences across provinces...
Chi-squared test: chi2=10722.684705166641, p-value=1.0


In [34]:
# Null Hypothesis 2: There are no risk differences between zip codes
print("Testing risk differences between zip codes...")
chi2, p = chi_squared_test(data, 'PostalCode', 'TotalClaims')
print(f"Chi-squared test: chi2={chi2}, p-value={p}")

Testing risk differences between zip codes...
Chi-squared test: chi2=1214174.4063742857, p-value=1.0


In [35]:
# Null Hypothesis 3: There are no significant margin (profit) differences between zip codes
print("Testing profit margin differences between zip codes...")
data['ProfitMargin'] = data['TotalPremium'] - data['TotalClaims']
t_stat, p = t_test(data, 'PostalCode', 'ProfitMargin')
print(f"T-test: t_stat={t_stat}, p-value={p}")

Testing profit margin differences between zip codes...
T-test: t_stat=nan, p-value=nan


In [37]:
# Null Hypothesis 4: There are no significant risk differences between Women and Men
print("Testing risk differences between Women and Men...")
chi2, p = chi_squared_test(data, 'Gender', 'TotalClaims')
print(f"Chi-squared test: chi2={chi2}, p-value={p}")

Testing risk differences between Women and Men...
Chi-squared test: chi2=2534.8814995125476, p-value=1.0


In [38]:
# Analyze and report results
def analyze_results(p_value, alpha=0.05):
    if p_value < alpha:
        return "Reject the null hypothesis"
    else:
        return "Fail to reject the null hypothesis"


In [39]:
# Analyzing and reporting
print("Results:")
print("Risk differences across provinces:", analyze_results(p))
print("Risk differences between zip codes:", analyze_results(p))
print("Profit margin differences between zip codes:", analyze_results(p))
print("Risk differences between Women and Men:", analyze_results(p))

Results:
Risk differences across provinces: Fail to reject the null hypothesis
Risk differences between zip codes: Fail to reject the null hypothesis
Profit margin differences between zip codes: Fail to reject the null hypothesis
Risk differences between Women and Men: Fail to reject the null hypothesis
