# Importing Libraries and Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.model_selection import train_test_split

In [3]:
# Raise the pandas display limits so long/wide frames are shown with up to
# 100 rows and 200 columns before being truncated.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [4]:
# Load the cleaned claims table. The file carries a previously saved index as
# its first column, which pandas surfaces as 'Unnamed: 0'.
raw_csv_path = 'data/india_fraud_cleaned.csv'
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,ReportedFraud,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentTime,NumberOfVehicles,PropertyDamage,BodilyInjuries,Witnesses,PoliceReport,AmountOfTotalClaim,AmountOfInjuryClaim,AmountOfPropertyClaim,AmountOfVehicleDamage,InsuredAge,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,CustomerLoyaltyPeriod,DateOfPolicyCoverage,InsurancePolicyState,Policy_CombinedSingleLimit,Policy_Deductible,PolicyAnnualPremium,UmbrellaLimit,InsuredRelationship,VehicleMake,VehicleModel,VehicleYOM
0,N,2015-01-09,Multi-vehicle Collision,Rear Collision,Total Loss,Other,State9,City4,19,3,Missing,1,1,Missing,43973,9396,4698,29879,58,MALE,JD,sales,video-games,0,-42700,432,1998-11-14,State2,250/500,2000,1142.87,0,own-child,Mercedes,E400,2005
1,N,2015-02-20,Single Vehicle Collision,Side Collision,Minor Damage,Ambulance,State4,City3,14,1,Missing,1,1,YES,62310,6539,6539,49232,34,FEMALE,High School,transport-moving,polo,51300,0,128,2000-08-19,State2,100/300,1000,1274.38,2857344,own-child,Toyota,Highlander,2010
2,N,2015-01-14,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,State5,City2,21,3,Missing,0,0,Missing,42824,6069,6069,30686,52,FEMALE,PhD,machine-op-inspct,exercise,0,0,346,2000-06-23,State1,500/1000,745,1269.93,0,other-relative,Volkswagen,Passat,2002
3,Y,2015-01-07,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,State7,City4,12,3,YES,2,0,Missing,45672,253,5741,39678,25,FEMALE,College,exec-managerial,exercise,47400,-56100,42,1992-01-15,State2,100/300,986,1218.60,0,other-relative,Toyota,Highlander,2011
4,Y,2015-02-26,Multi-vehicle Collision,Side Collision,Major Damage,Other,State4,City6,3,3,NO,1,2,NO,81472,7407,14813,59252,27,MALE,Masters,sales,chess,0,0,109,2001-09-23,State3,250/500,576,1431.02,4235779,unmarried,Ford,Wrangler,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28801,N,2015-02-13,Single Vehicle Collision,Side Collision,Minor Damage,Fire,State7,City3,16,1,NO,1,2,Missing,50515,5543,5543,39429,39,MALE,JD,farming-fishing,basketball,0,0,251,2007-02-07,State2,250/500,1000,1108.55,739864,unmarried,Nissan,RSX,2002
28802,N,2015-01-19,Multi-vehicle Collision,Side Collision,Total Loss,Police,State5,City3,19,3,YES,2,0,Missing,94920,8678,8738,77504,21,MALE,Associate,prof-specialty,golf,20000,-75000,14,2006-10-18,State2,100/300,1008,1551.84,41592,own-child,Accura,RSX,2014
28803,N,2015-01-21,Multi-vehicle Collision,Side Collision,Total Loss,Police,State5,City7,16,3,YES,2,0,NO,87893,13224,7990,66679,35,MALE,Associate,prof-specialty,golf,0,-75000,153,2011-11-23,State1,100/300,660,1119.32,0,not-in-family,Saab,RSX,2013
28804,N,2015-01-02,Parked Car,Missing,Trivial Damage,,State4,City7,12,1,Missing,1,1,Missing,7284,728,1457,5099,40,FEMALE,Masters,exec-managerial,reading,0,-63900,241,1997-08-04,State3,250/1000,1674,1229.88,4674160,wife,Accura,RSX,2004


In [5]:
# Binary-encode the target: 'Y' -> 1, any other value -> 0.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-encoded column turns every row into 0; reload `df` before re-running.
df['ReportedFraud'] = df['ReportedFraud'].eq('Y').astype('int64')

In [6]:
# Parse each date column once, then derive calendar month / day-of-month features.
incident_dates = pd.to_datetime(df['DateOfIncident'])
coverage_dates = pd.to_datetime(df['DateOfPolicyCoverage'])

df['MonthOfIncident'] = incident_dates.dt.month
df['DayOfIncident'] = incident_dates.dt.day
df['MonthOfPolicyCoverage'] = coverage_dates.dt.month
df['DayOfPolicyCoverage'] = coverage_dates.dt.day

# Train / Validation / Test Split

In [7]:
# Split into train / validation / test at roughly 80 / 10 / 10.
# Both splits are stratified on the target so each partition keeps the same
# fraud rate, and both use a fixed random_state so the partitions (and every
# file derived from them) are identical after Restart Kernel -> Run All.
train, test = train_test_split(
    df,
    stratify=df['ReportedFraud'],
    test_size=0.1,
    random_state=42,
)
# 1/9 of the remaining 90% equals 10% of the full data set.
train, val = train_test_split(
    train,
    stratify=train['ReportedFraud'],
    test_size=1/9,
    random_state=42,
)

# One Hot Encoding on Categorical Variables

In [8]:
# Categorical columns that are expanded into indicator (dummy) columns.
cat_cols = [
    'TypeOfIncident', 'TypeOfCollission', 'SeverityOfIncident',
    'AuthoritiesContacted', 'IncidentState', 'IncidentCity',
    'PropertyDamage', 'Witnesses', 'PoliceReport', 'InsuredGender',
    'InsuredEducationLevel', 'InsuredOccupation', 'InsuredHobbies',
    'InsurancePolicyState', 'Policy_CombinedSingleLimit',
    'InsuredRelationship', 'VehicleMake', 'VehicleModel',
]
# dtypes that are carried over unchanged. The integer 'Unnamed: 0' index
# column and the encoded 'ReportedFraud' target are picked up here as well.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = list(train.select_dtypes(include=numerics).columns)

# Numeric columns first, indicator columns after, matched on the row index.
dummy_cols = pd.get_dummies(train.loc[:, cat_cols])
train_ohe = pd.concat((train.loc[:, numeric_cols], dummy_cols), axis=1)
train_ohe.shape

(23044, 177)

In [9]:
# One-hot encode the test partition with the same recipe used for training.
cat_cols = ['TypeOfIncident', 'TypeOfCollission', 'SeverityOfIncident', 'AuthoritiesContacted', 'IncidentState',
       'IncidentCity', 'PropertyDamage', 'Witnesses', 'PoliceReport', 'InsuredGender', 'InsuredEducationLevel',
       'InsuredOccupation', 'InsuredHobbies', 'InsurancePolicyState', 'Policy_CombinedSingleLimit', 'InsuredRelationship',
       'VehicleMake', 'VehicleModel']
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = test.select_dtypes(include=numerics).columns.tolist()

dummy_cols = pd.get_dummies(test[cat_cols])
test_ohe = pd.concat([test[numeric_cols], dummy_cols], axis = 1)

# get_dummies only emits columns for categories present in *this* partition,
# so a rare category missing here (or one unseen in training) would silently
# change the feature layout. Align to the training columns: missing indicators
# become 0, and columns the training set never had are dropped.
test_ohe = test_ohe.reindex(columns=train_ohe.columns, fill_value=0)
test_ohe.shape

(2881, 177)

In [10]:
# One-hot encode the validation partition with the same recipe used for training.
cat_cols = ['TypeOfIncident', 'TypeOfCollission', 'SeverityOfIncident', 'AuthoritiesContacted', 'IncidentState',
       'IncidentCity', 'PropertyDamage', 'Witnesses', 'PoliceReport', 'InsuredGender', 'InsuredEducationLevel',
       'InsuredOccupation', 'InsuredHobbies', 'InsurancePolicyState', 'Policy_CombinedSingleLimit', 'InsuredRelationship',
       'VehicleMake', 'VehicleModel']
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = val.select_dtypes(include=numerics).columns.tolist()

dummy_cols = pd.get_dummies(val[cat_cols])
val_ohe = pd.concat([val[numeric_cols], dummy_cols], axis = 1)

# get_dummies only emits columns for categories present in *this* partition,
# so a rare category missing here (or one unseen in training) would silently
# change the feature layout. Align to the training columns: missing indicators
# become 0, and columns the training set never had are dropped.
val_ohe = val_ohe.reindex(columns=train_ohe.columns, fill_value=0)
val_ohe.shape

(2881, 177)

In [11]:
# Split independent variables (X) from the dependent variable (y).
#
# The encoded frames start with the saved index column 'Unnamed: 0' followed by
# the target 'ReportedFraud'. Slicing with iloc[:, 1:] removed only the index
# column and left the target inside X (target leakage), so features are now
# selected by name and both non-feature columns are excluded explicitly.
target_col = 'ReportedFraud'
non_feature_cols = ['Unnamed: 0', target_col]
feature_cols = [col for col in train_ohe.columns if col not in non_feature_cols]

X_train = train_ohe[feature_cols]
y_train = train_ohe[target_col]

# Validation / test use exactly the training feature layout; an indicator
# column that is absent from a partition is filled with 0.
X_val = val_ohe.reindex(columns=feature_cols, fill_value=0)
y_val = val_ohe[target_col]

X_test = test_ohe.reindex(columns=feature_cols, fill_value=0)
y_test = test_ohe[target_col]

## Dealing with Class Imbalance using SMOTE

In [12]:
# Oversample the minority class on the training partition only; validation and
# test keep their original class balance. A fixed random_state makes the
# synthetic rows reproducible across runs.
smote = SMOTE(random_state=42)

# fit predictor and target variable
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_smote))

Original dataset shape Counter({0: 16820, 1: 6224})
Resample dataset shape Counter({0: 16820, 1: 16820})


In [13]:
# Persist the resampled training set plus the untouched test and validation
# sets. Row indices are not written to the files.
csv_outputs = [
    ('data/X_smote.csv', X_smote),
    ('data/y_smote.csv', y_smote),
    ('data/X_test.csv', X_test),
    ('data/y_test.csv', y_test),
    ('data/X_val.csv', X_val),
    ('data/y_val.csv', y_val),
]
for csv_path, split_data in csv_outputs:
    split_data.to_csv(csv_path, index=False)