# A/B Hypothesis Testing Analysis


## 1. Setup
Import necessary libraries and load data.

In [1]:
# Import necessary libraries

import pandas as pd
import sys
import os


In [2]:
# Adjust the path to point to the 'scripts' directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../scripts')))
from ab_testing_analysis import (load_data, clean_data, segment_data, perform_t_test, perform_chi_squared_test, perform_z_test, 
    cohen_d, interpret_p_value, hypothesis_1, hypothesis_2, hypothesis_3, hypothesis_4, analyze_and_report
)

# Load data


In [3]:
file_path = '../../data/MachineLearningRating_v3.txt'
# Replace delimiter with the correct one
df = pd.read_csv(file_path, delimiter='|')  # Change delimiter as needed

# Display the first few rows of the dataframe
df.head()

  df = pd.read_csv(file_path, delimiter='|')  # Change delimiter as needed


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


# Clean the dataset

In [4]:
df_clean = clean_data(df)
df_clean.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [5]:
print(df_clean.columns)


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')


# 2. Select Metrics
Select the key performance indicators (KPIs) for analysis.

In [6]:
kpi_column = 'TotalClaims'

## 3. Data Segmentation
Create groups for comparison: Control Group (Group A) and Test Group (Group B).

In [7]:
feature_column = 'Province'
control_value = 'ProvinceA'
test_value = 'ProvinceB'

# 4. Statistical Testing
## Hypothesis 1: Risk Differences Across Provinces


In [8]:
def run_hypotheses_tests(df):
    print("Hypothesis 1: Risk Differences Across Provinces")
    result_h1 = hypothesis_1(df, 'Province1', 'Province2', 'TotalClaims')
    print(result_h1)

 ### Business Impact and Recommendations


In [9]:
analyze_and_report(hypothesis_1, df, 'Province1', 'Province2', 'TotalClaims', "Hypothesis 1: Risk Differences Across Provinces")


Group A (Province1): 0 records
Group B (Province2): 0 records
--- Hypothesis 1: Risk Differences Across Provinces ---
Fail to reject the null hypothesis (p-value = nan). No significant difference.
Business Impact: No significant difference was found between Province1 and Province2 groups for TotalClaims.
Actionable Insights: No immediate changes are necessary. Continue monitoring.
Customer Experience: These results could influence how different customer segments perceive the product. For instance, addressing significant differences could help in enhancing customer satisfaction.
------------------------------------------------------



  t_stat, p_value = stats.ttest_ind(group_a[kpi_col], group_b[kpi_col], equal_var=False)


## Hypothesis 2: Risk Differences Between Zipcodes



In [10]:
print("Hypothesis 2: Risk Differences Between Zipcodes")
result_h2 = hypothesis_2(df, '12345', '67890', 'TotalClaims')
print(result_h2)

Hypothesis 2: Risk Differences Between Zipcodes
Group A (12345): 0 records
Group B (67890): 0 records
Error: One or both groups are empty. Please check the data and segmentation criteria.


### Business Impact and Recommendations


In [11]:
analyze_and_report(hypothesis_2, df, '12345', '67890', 'TotalClaims', "Hypothesis 2: Risk Differences Between Zipcodes")

Group A (12345): 0 records
Group B (67890): 0 records
--- Hypothesis 2: Risk Differences Between Zipcodes ---
Error: One or both groups are empty. Please check the data and segmentation criteria.
Business Impact: No significant difference was found between 12345 and 67890 groups for TotalClaims.
Actionable Insights: No immediate changes are necessary. Continue monitoring.
Customer Experience: These results could influence how different customer segments perceive the product. For instance, addressing significant differences could help in enhancing customer satisfaction.
------------------------------------------------------



## Hypothesis 3: Margin Differences Between Zip Codes


In [12]:
result_h3 = hypothesis_3(df, '12345', '67890', 'TotalPremium')
print(result_h3)

    

Group A (12345): 0 records
Group B (67890): 0 records
Fail to reject the null hypothesis (p-value = nan). No significant difference.


### Business Impact and Recommendations


In [13]:
    
analyze_and_report(hypothesis_3, df, '12345', '67890', 'TotalPremium', "Hypothesis 3: Margin Differences Between Zip Codes")

Group A (12345): 0 records
Group B (67890): 0 records
--- Hypothesis 3: Margin Differences Between Zip Codes ---
Fail to reject the null hypothesis (p-value = nan). No significant difference.
Business Impact: No significant difference was found between 12345 and 67890 groups for TotalPremium.
Actionable Insights: No immediate changes are necessary. Continue monitoring.
Customer Experience: These results could influence how different customer segments perceive the product. For instance, addressing significant differences could help in enhancing customer satisfaction.
------------------------------------------------------



## Hypothesis 4: Risk Differences Between Women and Men


In [20]:
hypothesis_4_result = hypothesis_4(df_clean, control_value='Female', test_value='Male', kpi_col='TotalClaims')
print(hypothesis_4_result)


Group A (FEMALE): 6755 records
Group B (MALE): 42817 records
Fail to reject the null hypothesis (p-value = nan). No significant difference.


  z_stat = (mean_a - mean_b) / np.sqrt((std_a**2/n_a) + (std_b**2/n_b))


# Analyze and Report


In [29]:
# Perform hypothesis testing and report findings
def display_results():
    # Hypothesis Testing
    print("Results of Hypothesis Testing:")
    
    # Hypothesis 1: Risk Differences Across Provinces
    result_h1 = hypothesis_1(df_clean, 'Province1', 'Province2', 'TotalClaims')
    print(f"Hypothesis 1: Risk Differences Across Provinces: {result_h1}")
    
    # Hypothesis 2: Risk Differences Between Zipcodes
    result_h2 = hypothesis_2(df_clean, '12345', '67890', 'TotalClaims')
    print(f"Hypothesis 2: Risk Differences Between Zipcodes: {result_h2}")
    
    # Hypothesis 3: Margin Differences Between Zip Codes
    result_h3 = hypothesis_3(df_clean, '12345', '67890', 'TotalPremium')
    print(f"Hypothesis 3: Margin Differences Between Zip Codes: {result_h3}")
    
    # Hypothesis 4: Risk Differences Between Women and Men
    result_h4 = hypothesis_4(df_clean, 'Male', 'Female', 'TotalClaims')
    print(f"Hypothesis 4: Risk Differences Between Women and Men: {result_h4}")

    

# Call the display function to show results
display_results()


Results of Hypothesis Testing:
Group A (Province1): 0 records
Group B (Province2): 0 records
Hypothesis 1: Risk Differences Across Provinces: Fail to reject the null hypothesis (p-value = nan). No significant difference.
Group A (12345): 0 records
Group B (67890): 0 records
Hypothesis 2: Risk Differences Between Zipcodes: Error: One or both groups are empty. Please check the data and segmentation criteria.
Group A (12345): 0 records
Group B (67890): 0 records
Hypothesis 3: Margin Differences Between Zip Codes: Fail to reject the null hypothesis (p-value = nan). No significant difference.
Group A (MALE): 42817 records
Group B (FEMALE): 6755 records
Hypothesis 4: Risk Differences Between Women and Men: Fail to reject the null hypothesis (p-value = nan). No significant difference.
