In [68]:
import pandas as pd
import numpy as np


In [93]:
# Read the raw fraud-claims table and print its schema (column index, then dtypes).
df = pd.read_csv('data/fraud_oracle.csv')
print(df.columns, df.dtypes, sep="\n")

# Null count per column, narrowed to the columns that contain at least one null.
missing_values = df.isna().sum()
missing_summary = missing_values.loc[missing_values.gt(0)]
if len(missing_summary) > 0:
    print("Missing values detected:")
    print(missing_summary)
else:
    print("No missing values found in the dataset.")

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy'],
      dtype='object')
Month                   object
WeekOfMonth              int64
DayOfWeek               object
Make                    object
AccidentArea            object
DayOfWeekClaimed        object
MonthClaimed            object
WeekOfMonthClaimed       int64
Sex                     object
MaritalStatus           object
Age                      int64
Fault                   object
PolicyType         

In [94]:
# Show the freshly loaded frame (pandas elides the middle rows/columns of a large frame).
df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,Male,Married,...,6 years,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision
15416,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,Male,Married,...,6 years,31 to 35,No,No,External,more than 5,no change,3 to 4,1996,Liability
15417,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,Male,Single,...,5 years,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision
15418,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,Female,Married,...,2 years,31 to 35,No,No,External,more than 5,no change,1 vehicle,1996,All Perils


In [95]:
# Encode the categorical columns as numbers:
#   * Yes/No and other two-valued columns -> 1/0
#   * ordered categories                  -> increasing numeric codes
#   * unordered multi-valued categories   -> one-hot (dummy) columns


def encode_with_mapping(frame, mappings):
    """Overwrite each listed column of ``frame`` with its mapped values.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame whose columns are replaced in place.
    mappings : dict[str, dict]
        Column name -> {original value: encoded value}.

    Raises
    ------
    ValueError
        If a column holds a non-null value its mapping does not cover.
        ``Series.map`` would turn such values into NaN silently, which is
        also what happens when this cell is re-run on an already-encoded
        frame. All columns are validated before any column is written, so a
        failure leaves ``frame`` untouched.
    """
    encoded_columns = {}
    for col, mapping in mappings.items():
        original = frame[col]
        encoded = original.map(mapping)
        # Values that were present before mapping but are NaN afterwards.
        unmapped = original[encoded.isna() & original.notna()].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column {col!r} has values with no mapping: {list(unmapped)!r}. "
                "If this cell was already run, reload the data first."
            )
        encoded_columns[col] = encoded

    for col, encoded in encoded_columns.items():
        frame[col] = encoded


yes_no_columns = ['PoliceReportFiled', 'WitnessPresent']
yes_no_mappings = {column: {'Yes': 1, 'No': 0} for column in yes_no_columns}

# Ordered categories. Bands are coded either by rank (0, 1, 2, ...) or by a
# representative number taken from inside the band (e.g. '26 to 30' -> 28).
ordinal_mappings = {
    # NOTE: this key used to be defined twice in the literal; Python keeps only
    # the last definition, so the earlier one ('less than 1', '1 year', ...)
    # was never applied. Only the effective mapping is kept.
    'AgeOfVehicle': {'new': 0, '2 years': 1, '3 years': 2, '4 years': 3, '5 years': 4,
                     '6 years': 5, '7 years': 6, 'more than 7': 7},

    'AgeOfPolicyHolder': {'16 to 17': 16, '18 to 20': 19, '21 to 25': 23, '26 to 30': 28,
                          '31 to 35': 33, '36 to 40': 38, '41 to 50': 45, '51 to 65': 58,
                          'over 65': 70},

    'NumberOfCars': {'1 vehicle': 1, '2 vehicles': 2, '3 to 4': 3, '5 to 8': 5, 'more than 8': 9},

    'NumberOfSuppliments': {'none': 0, '1 to 2': 1, '3 to 5': 3, 'more than 5': 5},

    'AddressChange_Claim': {'no change': 0, 'under 6 months': 1, '1 year': 2,
                            '2 to 3 years': 3, '4 to 8 years': 4, 'more than 8 years': 5},

    'Days_Policy_Accident': {'none': 0, '1 to 7': 1, '8 to 15': 2, '15 to 30': 3, 'more than 30': 4},

    'Days_Policy_Claim': {'none': 0, '8 to 15': 1, '15 to 30': 2, 'more than 30': 3},

    'PastNumberOfClaims': {'none': 0, '1': 1, '2 to 4': 2, 'more than 4': 3},

    'VehiclePrice': {'less than 20000': 0, '20000 to 29000': 1, '30000 to 39000': 2,
                     '40000 to 59000': 3, '60000 to 69000': 4, 'more than 69000': 5},
}

binary_mappings = {
    'Sex': {'Male': 1, 'Female': 0},
    'AgentType': {'Internal': 1, 'External': 0}
}

# Observed earlier: df["MonthClaimed"].unique() holds the twelve month
# abbreviations plus a stray '0'. That column is not encoded in this cell.

encode_with_mapping(df, yes_no_mappings)
encode_with_mapping(df, ordinal_mappings)
encode_with_mapping(df, binary_mappings)

# MaritalStatus has no natural order: one-hot encode it, dropping the first
# level to avoid a redundant (perfectly collinear) dummy column.
df = pd.get_dummies(df, columns=['MaritalStatus'], drop_first=True)

# PolicyNumber and RepNumber are intentionally kept for now.
# df.drop(columns=['PolicyNumber', 'RepNumber'], inplace=True)


print(df)

      Month  WeekOfMonth  DayOfWeek     Make AccidentArea DayOfWeekClaimed  \
0       Dec            5  Wednesday    Honda        Urban          Tuesday   
1       Jan            3  Wednesday    Honda        Urban           Monday   
2       Oct            5     Friday    Honda        Urban         Thursday   
3       Jun            2   Saturday   Toyota        Rural           Friday   
4       Jan            5     Monday    Honda        Urban          Tuesday   
...     ...          ...        ...      ...          ...              ...   
15415   Nov            4     Friday   Toyota        Urban          Tuesday   
15416   Nov            5   Thursday  Pontiac        Urban           Friday   
15417   Nov            5   Thursday   Toyota        Rural           Friday   
15418   Dec            1     Monday   Toyota        Urban         Thursday   
15419   Dec            2  Wednesday   Toyota        Urban         Thursday   

      MonthClaimed  WeekOfMonthClaimed  Sex  Age  ... WitnessPr

In [96]:
# Feature engineering:
# 1. ClaimsPerPolicy    - number of rows (claims) that share a PolicyNumber.
# 2. DaysSinceLastClaim - gap between consecutive claims of the same policy.
#                         NOTE: it is computed as the difference in
#                         WeekOfMonthClaimed, so despite the name the unit is
#                         "week of month", not days.
# 3. UnusualDeductible  - 1 when Deductible is above the dataset median, else 0.
# 4. AccidentAreaChange - 1 when a policy has claims in more than one AccidentArea.
# 5. ClaimsPerAgent     - number of claims per AgentType value.

df['ClaimsPerPolicy'] = df.groupby('PolicyNumber')['PolicyNumber'].transform('count')

# One MonthClaimed entry is '0' instead of a month abbreviation; it is mapped
# to 0 (possibly a claim that was never filed - unconfirmed).
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    '0': 0
}
df['MonthNumeric'] = df['MonthClaimed'].map(month_mapping)

# Order each policy's rows chronologically before taking differences.
# 'Month' holds month abbreviations, so sorting it as-is would be alphabetical
# (Apr, Aug, Dec, ...); the key converts it to calendar order for the sort only.
df = df.sort_values(
    by=['PolicyNumber', 'Year', 'Month', 'WeekOfMonthClaimed'],
    key=lambda column: column.map(month_mapping) if column.name == 'Month' else column,
)

# First claim of every policy has no predecessor -> gap of 0.
df['DaysSinceLastClaim'] = df.groupby('PolicyNumber')['WeekOfMonthClaimed'].diff().fillna(0)

df['UnusualDeductible'] = (df['Deductible'] > df['Deductible'].median()).astype(int)

# transform() broadcasts the per-policy distinct-area count back onto every row.
# A plain groupby(...).nunique() is indexed by PolicyNumber, so assigning it to
# df would align policy numbers against row labels and flag the wrong rows.
df['AccidentAreaChange'] = (
    df.groupby('PolicyNumber')['AccidentArea'].transform('nunique').gt(1).astype(int)
)

df['ClaimsPerAgent'] = df.groupby('AgentType')['AgentType'].transform('count')

df


  df['AccidentAreaChange'] = df['AccidentAreaChange'].fillna(False).astype(int)


Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,Age,...,BasePolicy,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,ClaimsPerPolicy,MonthNumeric,DaysSinceLastClaim,UnusualDeductible,AccidentAreaChange,ClaimsPerAgent
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,0,21,...,Liability,False,True,False,1,1,0.0,0,0,15179
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,1,34,...,Collision,False,True,False,1,1,0.0,0,0,15179
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,1,47,...,Collision,True,False,False,1,11,0.0,0,0,15179
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,1,65,...,Liability,True,False,False,1,7,0.0,0,0,15179
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,0,27,...,Collision,False,True,False,1,2,0.0,0,0,15179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,1,35,...,Collision,True,False,False,1,11,0.0,0,0,15179
15416,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,1,30,...,Liability,True,False,False,1,12,0.0,0,0,15179
15417,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,1,24,...,Collision,False,True,False,1,12,0.0,0,0,15179
15418,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,0,34,...,All Perils,True,False,False,1,12,0.0,0,0,15179


In [97]:
# Report every column of the current frame that holds at least one null.
missing_values = df.isna().sum()
missing_summary = missing_values.loc[missing_values.gt(0)]
if len(missing_summary) > 0:
    print("Missing values detected:")
    print(missing_summary)
else:
    print("No missing values found in the dataset.")

No missing values found in the dataset.


In [98]:
# Descriptive statistics for the engineered features, one row per feature.
feature_columns = [
    'ClaimsPerPolicy',
    'DaysSinceLastClaim',
    'UnusualDeductible',
    'AccidentAreaChange',
    'ClaimsPerAgent',
]
engineered = df.loc[:, feature_columns]
feature_stats = engineered.describe().transpose()
print(feature_stats)

                      count          mean          std    min      25%  \
ClaimsPerPolicy     15420.0      1.000000     0.000000    1.0      1.0   
DaysSinceLastClaim  15420.0      0.000000     0.000000    0.0      0.0   
UnusualDeductible   15420.0      0.037224     0.189317    0.0      0.0   
AccidentAreaChange  15420.0      0.000000     0.000000    0.0      0.0   
ClaimsPerAgent      15420.0  14945.533204  1852.901214  241.0  15179.0   

                        50%      75%      max  
ClaimsPerPolicy         1.0      1.0      1.0  
DaysSinceLastClaim      0.0      0.0      0.0  
UnusualDeductible       0.0      0.0      1.0  
AccidentAreaChange      0.0      0.0      0.0  
ClaimsPerAgent      15179.0  15179.0  15179.0  


In [None]:
# ClaimsPerPolicy, DaysSinceLastClaim and AccidentAreaChange all have a standard
# deviation of 0 (ClaimsPerPolicy is always 1, which suggests every PolicyNumber
# is unique), so they carry no information and are removed.
constant_features = ['AccidentAreaChange', 'DaysSinceLastClaim', 'ClaimsPerPolicy']

# The text MonthClaimed column is superseded by its numeric version: drop the
# original, then give MonthNumeric the MonthClaimed name.
df.drop(labels=constant_features + ["MonthClaimed"], axis=1, inplace=True)
df.rename({"MonthNumeric": "MonthClaimed"}, axis='columns', inplace=True)
# df.to_csv('data/fraud_oracle_processed.csv')