In [2]:
# Initial Library Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [3]:
def create_date(df, year, month, day_of_week, week_of_month):
    df['date'] = pd.to_datetime(df[[year, month, day_of_week]])
    df['FirstDayOfMonth'] = pd.to_datetime(df[[year, month]].assign(day=1))
    df['Offset'] = df[day_of_week] - df['FirstDayOfMonth'].dt.weekday
    df['DaysToAdd'] = df[week_of_month]*7 + df['Offset']
    df['date'] = df['FirstDayOfMonth'] + pd.to_timedelta(df['DaysToAdd'], unit='D')
    df.drop(['FirstDayOfMonth', 'Offset', 'DaysToAdd'], axis=1)
    return df

In [4]:
# Import Data from csv
oracle_path = Path('dataset/fraud_oracle.csv')
oracle_data = pd.read_csv(oracle_path)

In [5]:
# View Oracle Dataframe
oracle_data.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [6]:
# List all columns
oracle_data.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy'],
      dtype='object')

In [7]:
# Check columns we're unsure of
oracle_data['AgeOfPolicyHolder'].value_counts()

31 to 35    5593
36 to 40    4043
41 to 50    2828
51 to 65    1392
26 to 30     613
over 65      508
16 to 17     320
21 to 25     108
18 to 20      15
Name: AgeOfPolicyHolder, dtype: int64

In [8]:
display(oracle_data['WeekOfMonth'].value_counts())

3    3640
2    3558
4    3398
1    3187
5    1637
Name: WeekOfMonth, dtype: int64

In [9]:
display(oracle_data['Year'].value_counts())

1994    6142
1995    5195
1996    4083
Name: Year, dtype: int64

In [10]:
display(oracle_data['Month'].value_counts())
print()
display(oracle_data['MonthClaimed'].value_counts())

Jan    1411
May    1367
Mar    1360
Jun    1321
Oct    1305
Dec    1285
Apr    1280
Feb    1266
Jul    1257
Sep    1240
Nov    1201
Aug    1127
Name: Month, dtype: int64




Jan    1446
May    1411
Mar    1348
Oct    1339
Jun    1293
Feb    1287
Nov    1285
Apr    1271
Sep    1242
Jul    1225
Dec    1146
Aug    1126
0         1
Name: MonthClaimed, dtype: int64

In [11]:
# Check columns we're unsure of
display(oracle_data['DayOfWeek'].value_counts())
print()
display(oracle_data['DayOfWeekClaimed'].value_counts())

Monday       2616
Friday       2445
Tuesday      2300
Thursday     2173
Wednesday    2159
Saturday     1982
Sunday       1745
Name: DayOfWeek, dtype: int64




Monday       3757
Tuesday      3375
Wednesday    2951
Thursday     2660
Friday       2497
Saturday      127
Sunday         52
0               1
Name: DayOfWeekClaimed, dtype: int64

In [12]:
# Drop single row where 'DayOfWeekClaimed' equals 0
nodayofweek = oracle_data[(oracle_data['DayOfWeekClaimed'] == '0')].index
oracle_data.drop(nodayofweek , inplace=True)

In [13]:
# Check that the row was removed
oracle_data['DayOfWeekClaimed'].value_counts()

Monday       3757
Tuesday      3375
Wednesday    2951
Thursday     2660
Friday       2497
Saturday      127
Sunday         52
Name: DayOfWeekClaimed, dtype: int64

In [14]:
# Check columns we're unsure of
display(oracle_data['WeekOfMonthClaimed'].value_counts())
print()
display(oracle_data['WeekOfMonth'].value_counts())

2    3720
3    3583
1    3449
4    3433
5    1234
Name: WeekOfMonthClaimed, dtype: int64




3    3640
2    3557
4    3398
1    3187
5    1637
Name: WeekOfMonth, dtype: int64

In [15]:
# Check columns we're unsure of
oracle_data['AgeOfVehicle'].value_counts()

7 years        5807
more than 7    3981
6 years        3448
5 years        1357
new             372
4 years         229
3 years         152
2 years          73
Name: AgeOfVehicle, dtype: int64

In [16]:
# Convert string months to integer
d = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
oracle_data.MonthClaimed = oracle_data.MonthClaimed.map(d)
oracle_data.Month = oracle_data.Month.map(d)
display(oracle_data['MonthClaimed'].value_counts())
print()
display(oracle_data['Month'].value_counts())

1     1446
5     1411
3     1348
10    1339
6     1293
2     1287
11    1285
4     1271
9     1242
7     1225
12    1146
8     1126
Name: MonthClaimed, dtype: int64




1     1411
5     1367
3     1360
6     1321
10    1305
12    1285
4     1280
2     1266
7     1256
9     1240
11    1201
8     1127
Name: Month, dtype: int64

In [17]:
# Convert string weekdays to integer
dd = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6, 'Sunday':7}
oracle_data.DayOfWeekClaimed = oracle_data.DayOfWeekClaimed.map(dd)
oracle_data.DayOfWeek = oracle_data.DayOfWeek.map(dd)
display(oracle_data['DayOfWeekClaimed'].value_counts())
print()
display(oracle_data['DayOfWeek'].value_counts())

1    3757
2    3375
3    2951
4    2660
5    2497
6     127
7      52
Name: DayOfWeekClaimed, dtype: int64




1    2615
5    2445
2    2300
4    2173
3    2159
6    1982
7    1745
Name: DayOfWeek, dtype: int64

In [18]:
# Drop columns we don't have enough informnation about or need
oracle_data = oracle_data.drop(columns=['PolicyNumber', 'RepNumber', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'NumberOfSuppliments','Make', 'DriverRating', 'Age', 'PolicyType',
                                        'Month', 'WeekOfMonth', 'DayOfWeek', 'Year'])
oracle_data.head()

Unnamed: 0,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Fault,VehicleCategory,VehiclePrice,FraudFound_P,Deductible,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,AddressChange_Claim,NumberOfCars,BasePolicy
0,Urban,2,1,1,Female,Single,Policy Holder,Sport,more than 69000,0,300,none,3 years,26 to 30,No,No,External,1 year,3 to 4,Liability
1,Urban,1,1,4,Male,Single,Policy Holder,Sport,more than 69000,0,400,none,6 years,31 to 35,Yes,No,External,no change,1 vehicle,Collision
2,Urban,4,11,2,Male,Married,Policy Holder,Sport,more than 69000,0,400,1,7 years,41 to 50,No,No,External,no change,1 vehicle,Collision
3,Rural,5,7,1,Male,Married,Third Party,Sport,20000 to 29000,0,400,1,more than 7,51 to 65,Yes,No,External,no change,1 vehicle,Liability
4,Urban,2,2,2,Female,Single,Third Party,Sport,more than 69000,0,400,none,5 years,31 to 35,No,No,External,no change,1 vehicle,Collision


In [19]:
# Set age of new vehicles to 1 and vehicles over 7 years old to 8
oracle_data.loc[oracle_data['AgeOfVehicle'] == 'new', 'AgeOfVehicle'] = '1'
oracle_data.loc[oracle_data['AgeOfVehicle'] == 'more than 7', 'AgeOfVehicle'] = '8'
# Remove any non-digit character
oracle_data['AgeOfVehicle'].replace("\\D","", regex=True, inplace=True)
# and convert to integer type
oracle_data['AgeOfVehicle'] = oracle_data['AgeOfVehicle'].astype('int')
oracle_data.head()

Unnamed: 0,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Fault,VehicleCategory,VehiclePrice,FraudFound_P,Deductible,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,AddressChange_Claim,NumberOfCars,BasePolicy
0,Urban,2,1,1,Female,Single,Policy Holder,Sport,more than 69000,0,300,none,3,26 to 30,No,No,External,1 year,3 to 4,Liability
1,Urban,1,1,4,Male,Single,Policy Holder,Sport,more than 69000,0,400,none,6,31 to 35,Yes,No,External,no change,1 vehicle,Collision
2,Urban,4,11,2,Male,Married,Policy Holder,Sport,more than 69000,0,400,1,7,41 to 50,No,No,External,no change,1 vehicle,Collision
3,Rural,5,7,1,Male,Married,Third Party,Sport,20000 to 29000,0,400,1,8,51 to 65,Yes,No,External,no change,1 vehicle,Liability
4,Urban,2,2,2,Female,Single,Third Party,Sport,more than 69000,0,400,none,5,31 to 35,No,No,External,no change,1 vehicle,Collision


In [20]:
# Make Age of Vehicle column numerical
"""
oracle_data.loc[oracle_data['AgeOfVehicle'] == 'new', 'AgeOfVehicle'] = '1'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '2 years', 'AgeOfVehicle'] = '2'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '3 years', 'AgeOfVehicle'] = '3'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '4 years', 'AgeOfVehicle'] = '4'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '5 years', 'AgeOfVehicle'] = '5'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '6 years', 'AgeOfVehicle'] = '6'
oracle_data.loc[oracle_data['AgeOfVehicle'] == '7 years', 'AgeOfVehicle'] = '7'
oracle_data.loc[oracle_data['AgeOfVehicle'] == 'more than 7', 'AgeOfVehicle'] = '8'

oracle_data.head()
"""

"\noracle_data.loc[oracle_data['AgeOfVehicle'] == 'new', 'AgeOfVehicle'] = '1'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '2 years', 'AgeOfVehicle'] = '2'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '3 years', 'AgeOfVehicle'] = '3'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '4 years', 'AgeOfVehicle'] = '4'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '5 years', 'AgeOfVehicle'] = '5'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '6 years', 'AgeOfVehicle'] = '6'\noracle_data.loc[oracle_data['AgeOfVehicle'] == '7 years', 'AgeOfVehicle'] = '7'\noracle_data.loc[oracle_data['AgeOfVehicle'] == 'more than 7', 'AgeOfVehicle'] = '8'\n\noracle_data.head()\n"

In [21]:
# View the 'MonthClaimed' column
# oracle_data['MonthClaimed'].value_counts()

In [22]:
# Make MonthClaimed column numerical
"""
oracle_data.loc[oracle_data['MonthClaimed'] == 'Jan', 'MonthClaimed'] = '1'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Feb', 'MonthClaimed'] = '2'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Mar', 'MonthClaimed'] = '3'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Apr', 'MonthClaimed'] = '4'
oracle_data.loc[oracle_data['MonthClaimed'] == 'May', 'MonthClaimed'] = '5'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Jun', 'MonthClaimed'] = '6'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Jul', 'MonthClaimed'] = '7'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Aug', 'MonthClaimed'] = '8'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Sep', 'MonthClaimed'] = '9'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Oct', 'MonthClaimed'] = '10'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Nov', 'MonthClaimed'] = '11'
oracle_data.loc[oracle_data['MonthClaimed'] == 'Dec', 'MonthClaimed'] = '12'

oracle_data.head()
"""

"\noracle_data.loc[oracle_data['MonthClaimed'] == 'Jan', 'MonthClaimed'] = '1'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Feb', 'MonthClaimed'] = '2'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Mar', 'MonthClaimed'] = '3'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Apr', 'MonthClaimed'] = '4'\noracle_data.loc[oracle_data['MonthClaimed'] == 'May', 'MonthClaimed'] = '5'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Jun', 'MonthClaimed'] = '6'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Jul', 'MonthClaimed'] = '7'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Aug', 'MonthClaimed'] = '8'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Sep', 'MonthClaimed'] = '9'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Oct', 'MonthClaimed'] = '10'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Nov', 'MonthClaimed'] = '11'\noracle_data.loc[oracle_data['MonthClaimed'] == 'Dec', 'MonthClaimed'] = '12'\n\noracle_data.head()\n"

In [23]:
# Make DayofWeekClaimed column numerical
"""
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Monday', 'DayOfWeekClaimed'] = '1'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Tuesday', 'DayOfWeekClaimed'] = '2'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Wednesday', 'DayOfWeekClaimed'] = '3'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Thursday', 'DayOfWeekClaimed'] = '4'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Friday', 'DayOfWeekClaimed'] = '5'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Saturday', 'DayOfWeekClaimed'] = '6'
oracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Sunday', 'DayOfWeekClaimed'] = '7'

oracle_data.head()
"""

"\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Monday', 'DayOfWeekClaimed'] = '1'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Tuesday', 'DayOfWeekClaimed'] = '2'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Wednesday', 'DayOfWeekClaimed'] = '3'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Thursday', 'DayOfWeekClaimed'] = '4'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Friday', 'DayOfWeekClaimed'] = '5'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Saturday', 'DayOfWeekClaimed'] = '6'\noracle_data.loc[oracle_data['DayOfWeekClaimed'] == 'Sunday', 'DayOfWeekClaimed'] = '7'\n\noracle_data.head()\n"

In [24]:
#View Column types
oracle_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15419 entries, 0 to 15419
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   AccidentArea         15419 non-null  object
 1   DayOfWeekClaimed     15419 non-null  int64 
 2   MonthClaimed         15419 non-null  int64 
 3   WeekOfMonthClaimed   15419 non-null  int64 
 4   Sex                  15419 non-null  object
 5   MaritalStatus        15419 non-null  object
 6   Fault                15419 non-null  object
 7   VehicleCategory      15419 non-null  object
 8   VehiclePrice         15419 non-null  object
 9   FraudFound_P         15419 non-null  int64 
 10  Deductible           15419 non-null  int64 
 11  PastNumberOfClaims   15419 non-null  object
 12  AgeOfVehicle         15419 non-null  int32 
 13  AgeOfPolicyHolder    15419 non-null  object
 14  PoliceReportFiled    15419 non-null  object
 15  WitnessPresent       15419 non-null  object
 16  Agen

In [25]:
# Convert newly numerical columns to integers
"""
oracle_data['DayOfWeekClaimed'] = oracle_data['DayOfWeekClaimed'].astype('int')
oracle_data['MonthClaimed'] = oracle_data['MonthClaimed'].astype('int')
oracle_data['AgeOfVehicle'] = oracle_data['AgeOfVehicle'].astype('int')
oracle_data.info()
"""

"\noracle_data['DayOfWeekClaimed'] = oracle_data['DayOfWeekClaimed'].astype('int')\noracle_data['MonthClaimed'] = oracle_data['MonthClaimed'].astype('int')\noracle_data['AgeOfVehicle'] = oracle_data['AgeOfVehicle'].astype('int')\noracle_data.info()\n"

In [26]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [27]:
# Create a list of the columns with categorical variables
categorical_variables = ['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'VehicleCategory', 'VehiclePrice', 
                        'PastNumberOfClaims', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 
                        'AgentType', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy']

# Use the fit_transform method from the OneHotEncoder to encode the categorical variables
encoded_data = enc.fit_transform(oracle_data[categorical_variables])

In [28]:
# Create a DataFrame with the encoded variables
encoded_variables = pd.DataFrame(encoded_data,
    columns = enc.get_feature_names(categorical_variables))

encoded_variables.head()



Unnamed: 0,AccidentArea_Rural,AccidentArea_Urban,Sex_Female,Sex_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,Fault_Third Party,...,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [29]:
# Seperate the numerical columns and concat with encoded features
numerical_variables = oracle_data[['DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 
                                  'Deductible', 'AgeOfVehicle', 'FraudFound_P']]
encoded_oracle = pd.concat([encoded_variables, numerical_variables], axis=1)
encoded_oracle.dropna(inplace=True)
encoded_oracle.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15418 entries, 0 to 15418
Data columns (total 57 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   AccidentArea_Rural                  15418 non-null  float64
 1   AccidentArea_Urban                  15418 non-null  float64
 2   Sex_Female                          15418 non-null  float64
 3   Sex_Male                            15418 non-null  float64
 4   MaritalStatus_Divorced              15418 non-null  float64
 5   MaritalStatus_Married               15418 non-null  float64
 6   MaritalStatus_Single                15418 non-null  float64
 7   MaritalStatus_Widow                 15418 non-null  float64
 8   Fault_Policy Holder                 15418 non-null  float64
 9   Fault_Third Party                   15418 non-null  float64
 10  VehicleCategory_Sedan               15418 non-null  float64
 11  VehicleCategory_Sport               15418

In [30]:
# Assign X and y variables
y = encoded_oracle['FraudFound_P']
X = encoded_oracle.copy()
X.drop(columns='FraudFound_P', inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15418 entries, 0 to 15418
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   AccidentArea_Rural                  15418 non-null  float64
 1   AccidentArea_Urban                  15418 non-null  float64
 2   Sex_Female                          15418 non-null  float64
 3   Sex_Male                            15418 non-null  float64
 4   MaritalStatus_Divorced              15418 non-null  float64
 5   MaritalStatus_Married               15418 non-null  float64
 6   MaritalStatus_Single                15418 non-null  float64
 7   MaritalStatus_Widow                 15418 non-null  float64
 8   Fault_Policy Holder                 15418 non-null  float64
 9   Fault_Third Party                   15418 non-null  float64
 10  VehicleCategory_Sedan               15418 non-null  float64
 11  VehicleCategory_Sport               15418

In [31]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

In [32]:
display(X_train.shape)
display(X_test.shape)

(11563, 56)

(3855, 56)

In [33]:
# Create instance for scaler
scaler = StandardScaler()

In [34]:
# Fit the scaler with X training info
X_scaler = scaler.fit(X_train)

In [35]:
# Transform the X data with scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# Imports for Running LR Model and reports
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In [37]:
# Create model instance
logistic_regression_model = LogisticRegression()

In [38]:
# Fit model
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

In [39]:
# Predict for Training Values
training_predictions = lr_model.predict(X_train_scaled)

In [40]:
# Create Dataframe to hold info
y_train_predictions = pd.DataFrame({'Predictions':training_predictions, 'Actual': y_train})
y_train_predictions.head()

Unnamed: 0,Predictions,Actual
5001,0.0,0.0
14146,0.0,0.0
5437,0.0,0.0
13223,0.0,0.0
10106,0.0,0.0


In [41]:
# View Confusion Matrix for Model compared to Training Data
train_conf_matrix = confusion_matrix(y_train, training_predictions)
train_conf_matrix

array([[10868,     1],
       [  692,     2]], dtype=int64)

In [42]:
# Put Confusion matrix into Dataframe
train_matrix = pd.DataFrame(train_conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
train_matrix

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10868,1
Actual 1,692,2


In [43]:
# View Classification Report for Training Predictions
print(classification_report(y_train, training_predictions))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     10869
         1.0       0.67      0.00      0.01       694

    accuracy                           0.94     11563
   macro avg       0.80      0.50      0.49     11563
weighted avg       0.92      0.94      0.91     11563



In [44]:
# Predict for testing values
testing_predictions = lr_model.predict(X_test_scaled)

# Review the predictions
testing_predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [45]:
# Put Testing Predictions into Dataframe
y_test_predictions = pd.DataFrame({'Predictions':testing_predictions, 'Actual': y_test})
y_test_predictions.head()

Unnamed: 0,Predictions,Actual
15235,0.0,0.0
11504,0.0,0.0
13248,0.0,0.0
8850,0.0,0.0
9655,0.0,0.0


In [46]:
# View Confusion Matrix for Model compared to test Data
test_conf_matrix = confusion_matrix(y_test, testing_predictions)
test_conf_matrix

array([[3627,    0],
       [ 227,    1]], dtype=int64)

In [47]:
# Put Confusion matrix into Dataframe
test_matrix = pd.DataFrame(test_conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
test_matrix

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3627,0
Actual 1,227,1


In [48]:
# View Classification Report for Testing Predictions
print(classification_report(y_test, testing_predictions))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      3627
         1.0       1.00      0.00      0.01       228

    accuracy                           0.94      3855
   macro avg       0.97      0.50      0.49      3855
weighted avg       0.94      0.94      0.91      3855



In [49]:
# View Model Accuracy Scores
print(f'Training Data Accuracy Score: {lr_model.score(X_train_scaled, y_train)}')
print(f'Testing Data Accuracy Score: {lr_model.score(X_test_scaled, y_test)}')

Training Data Accuracy Score: 0.9400674565424197
Testing Data Accuracy Score: 0.9411154345006485


## k-Nearest Neighbors Algorithm
* This algorithm attempt to predict a test sample class by looking at the k training samples that are nearest (in distance) to the test sample
* The default value for the hyperparameter (k) is 5
* If necessary, we'll use hyperparameter tuning, that is, experimenting with a few values of k, to choose the value that improves the performance of  algorithm

In [50]:
# Creating the KNN Model
# Instantiate the model with k = 2 neighbors
# We first ran with its default k value, which is n_neighbors=5 neighbors, and did not like the result
knn = KNeighborsClassifier(n_neighbors=2)
# Train the model
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=2)

In [51]:
# Create predictions
y_pred = knn.predict(X_test_scaled)

In [52]:
# Print the prediction accuracy
print(f'{knn.score(X_test_scaled, y_test):.2%}')

93.77%


In [53]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[3612,  225],
       [  15,    3]], dtype=int64)

In [54]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      3837
         1.0       0.01      0.17      0.02        18

    accuracy                           0.94      3855
   macro avg       0.50      0.55      0.50      3855
weighted avg       0.99      0.94      0.96      3855



## K-Fold Cross-Validation
* This enables us to use all our data for both training and testing
* As such, it gives a better sense of how well our model will make predictions for new data
* It splits the dataset into k equal-size folds (this k is unrelated to k in knn)
* It then repeatedly train the model with k-1 folds, and test the model with the remaining fold
* The cycle continues until each fold has been used to test the model

In [55]:
# K-Fold Cross-Validation

kfold = KFold(n_splits=10, random_state=11, shuffle=True)
scores = cross_val_score(estimator=knn, X=X_test_scaled, y=y_test, cv=kfold)
scores

array([0.94041451, 0.91968912, 0.95336788, 0.93523316, 0.9507772 ,
       0.91948052, 0.93766234, 0.95064935, 0.93766234, 0.95064935])

In [56]:
print(f'Mean accuracy: {scores.mean():.2%}')
print(f'Accuracy standard deviation: {scores.std():.2%}')

Mean accuracy: 93.96%
Accuracy standard deviation: 1.18%


### KNN Hyperparameter Tuning
* To determine the best k parameter, we'll try different values of k, then compare the estimator performance with each

In [68]:
for k in range(1, 20, 1):
    kfold = KFold(n_splits=5, random_state=11, shuffle=True)
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn, X=X_test_scaled, y=y_test, cv=kfold)
    print(f'k={k:<2}; mean accuracy={scores.mean():.2%}; ' +
         f'standard deviation={scores.std():.2%}')

k=1 ; mean accuracy=89.16%; standard deviation=0.81%
k=2 ; mean accuracy=93.90%; standard deviation=0.69%
k=3 ; mean accuracy=93.28%; standard deviation=0.64%
k=4 ; mean accuracy=93.98%; standard deviation=0.57%
k=5 ; mean accuracy=93.83%; standard deviation=0.57%
k=6 ; mean accuracy=94.06%; standard deviation=0.65%
k=7 ; mean accuracy=94.01%; standard deviation=0.62%
k=8 ; mean accuracy=94.09%; standard deviation=0.66%
k=9 ; mean accuracy=94.09%; standard deviation=0.66%
k=10; mean accuracy=94.09%; standard deviation=0.66%
k=11; mean accuracy=94.09%; standard deviation=0.66%
k=12; mean accuracy=94.09%; standard deviation=0.66%
k=13; mean accuracy=94.09%; standard deviation=0.66%
k=14; mean accuracy=94.09%; standard deviation=0.66%
k=15; mean accuracy=94.09%; standard deviation=0.66%
k=16; mean accuracy=94.09%; standard deviation=0.66%
k=17; mean accuracy=94.09%; standard deviation=0.66%
k=18; mean accuracy=94.09%; standard deviation=0.66%
k=19; mean accuracy=94.09%; standard deviation

## Support Vector Machine (SVM)
* SVM is a kernel machine, whose behavior can be changed using different kernel function
* The linear kernel is the most commonly used
* However, when the relation between class labels and attributes is nonlinear, the RBF kernel is recommended
* We'll use RBF kernel for our analysis
* The RBF has two hyperparameters: C (defaut=1.0) and gamma (default='scale')
* We'll start with the default hyperparameters, then use the “grid-search” approach to find the best C and gamma

In [76]:
# Create an SVM classifier with an RBF kernel and set values of C and gamma to the default values
svc = SVC(kernel='rbf', C=10, gamma=0.001)
#svc = SVC(gamma='scale')
  
# Fit the model to the training data
svc.fit(X_train_scaled, y_train)

SVC(C=10, gamma=0.001)

In [77]:
# Calculate the accuracy of the model on the test data
y_pred = svc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9411154345006485


In [78]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      3854
         1.0       0.00      1.00      0.01         1

    accuracy                           0.94      3855
   macro avg       0.50      0.97      0.49      3855
weighted avg       1.00      0.94      0.97      3855



In [72]:
# Finding the best values for C and gamma using GridSearch  
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
  
# Create a grid search object
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
  
# Fit the grid search object to the training data
grid.fit(X_train_scaled, y_train)
  
# Get the best parameters
print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .....................................C=0.1, gamma=1; total time=   8.3s
[CV] END .....................................C=0.1, gamma=1; total time=   8.3s
[CV] END .....................................C=0.1, gamma=1; total time=   7.8s
[CV] END .....................................C=0.1, gamma=1; total time=  10.0s
[CV] END .....................................C=0.1, gamma=1; total time=   8.2s
[CV] END ...................................C=0.1, gamma=0.1; total time=   3.5s
[CV] END ...................................C=0.1, gamma=0.1; total time=   3.5s
[CV] END ...................................C=0.1, gamma=0.1; total time=   3.6s
[CV] END ...................................C=0.1, gamma=0.1; total time=   3.5s
[CV] END ...................................C=0.1, gamma=0.1; total time=   3.5s
[CV] END ..................................C=0.1, gamma=0.01; total time=   1.8s
[CV] END ..................................C=0.1

## Comparing the Performance of Multiple Models
* It's difficult to know in advance which machine learning model(s) will perform best
* In this section, we'll use the previously discussed individual techniques to compare several classification estimators

In [80]:
# Create the estimators using dictionary key-value pairs
estimators = {
    'KNeighborsClassifier': knn,
    'SVC': SVC(kernel='rbf', C=10, gamma=0.001)}

# Executing the models
for estimator_name, estimator_object in estimators.items():
    kfold = KFold(n_splits=10, random_state=11, shuffle=True)
    scores = cross_val_score(estimator=estimator_object, X=X_test_scaled, y=y_test, cv=kfold)
    print(f'{estimator_name:>20}: ' + 
         f'mean accuracy={scores.mean():.2%}: ' +
         f'standard deviation={scores.std():.2%}')

KNeighborsClassifier: mean accuracy=94.09%: standard deviation=1.14%
                 SVC: mean accuracy=94.09%: standard deviation=1.14%
