## Machine Learning Model

In [1]:
# Initial Library Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Data Loading And Visualization

In [2]:
# Read dataset

df = pd.read_csv('dataset/fraud_oracle.csv')
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


## Data Preparation

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [4]:
df.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy'],
      dtype='object')

In [5]:
# Drop columns not needed

fraud_data = df.drop(columns = ['PolicyNumber', 'RepNumber', 'Days_Policy_Claim', 'Days_Policy_Accident', 'Make', 'NumberOfSuppliments', 'DriverRating', 'PolicyType', 'Age', 'Month', 'WeekOfMonth', 'DayOfWeek'])

fraud_data.head()

Unnamed: 0,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Fault,VehicleCategory,VehiclePrice,FraudFound_P,...,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Urban,Tuesday,Jan,1,Female,Single,Policy Holder,Sport,more than 69000,0,...,none,3 years,26 to 30,No,No,External,1 year,3 to 4,1994,Liability
1,Urban,Monday,Jan,4,Male,Single,Policy Holder,Sport,more than 69000,0,...,none,6 years,31 to 35,Yes,No,External,no change,1 vehicle,1994,Collision
2,Urban,Thursday,Nov,2,Male,Married,Policy Holder,Sport,more than 69000,0,...,1,7 years,41 to 50,No,No,External,no change,1 vehicle,1994,Collision
3,Rural,Friday,Jul,1,Male,Married,Third Party,Sport,20000 to 29000,0,...,1,more than 7,51 to 65,Yes,No,External,no change,1 vehicle,1994,Liability
4,Urban,Tuesday,Feb,2,Female,Single,Third Party,Sport,more than 69000,0,...,none,5 years,31 to 35,No,No,External,no change,1 vehicle,1994,Collision


In [43]:
df['DriverRating'].value_counts()

1    3944
3    3884
2    3801
4    3791
Name: DriverRating, dtype: int64

In [7]:
df['Days_Policy_Claim'].value_counts()

more than 30    15342
15 to 30           56
8 to 15            21
none                1
Name: Days_Policy_Claim, dtype: int64

In [8]:
df['AgentType'].value_counts()

External    15179
Internal      241
Name: AgentType, dtype: int64

In [9]:
# Make Age of Vehicle column numerical
df.loc[df['AgeOfVehicle'] == 'new', 'AgeOfVehicle'] = '1'
df.loc[df['AgeOfVehicle'] == '2 years', 'AgeOfVehicle'] = '2'
df.loc[df['AgeOfVehicle'] == '3 years', 'AgeOfVehicle'] = '3'
df.loc[df['AgeOfVehicle'] == '4 years', 'AgeOfVehicle'] = '4'
df.loc[df['AgeOfVehicle'] == '5 years', 'AgeOfVehicle'] = '5'
df.loc[df['AgeOfVehicle'] == '6 years', 'AgeOfVehicle'] = '6'
df.loc[df['AgeOfVehicle'] == '7 years', 'AgeOfVehicle'] = '7'
df.loc[df['AgeOfVehicle'] == 'more than 7', 'AgeOfVehicle'] = '8'

df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,8,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [10]:
# View the 'MonthClaimed' column
df['MonthClaimed'].value_counts()


Jan    1446
May    1411
Mar    1348
Oct    1339
Jun    1293
Feb    1287
Nov    1285
Apr    1271
Sep    1242
Jul    1225
Dec    1146
Aug    1126
0         1
Name: MonthClaimed, dtype: int64

In [11]:
# Make MonthClaimed column numerical
df.loc[df['MonthClaimed'] == 'Jan', 'MonthClaimed'] = '1'
df.loc[df['MonthClaimed'] == 'Feb', 'MonthClaimed'] = '2'
df.loc[df['MonthClaimed'] == 'Mar', 'MonthClaimed'] = '3'
df.loc[df['MonthClaimed'] == 'Apr', 'MonthClaimed'] = '4'
df.loc[df['MonthClaimed'] == 'May', 'MonthClaimed'] = '5'
df.loc[df['MonthClaimed'] == 'Jun', 'MonthClaimed'] = '6'
df.loc[df['MonthClaimed'] == 'Jul', 'MonthClaimed'] = '7'
df.loc[df['MonthClaimed'] == 'Aug', 'MonthClaimed'] = '8'
df.loc[df['MonthClaimed'] == 'Sep', 'MonthClaimed'] = '9'
df.loc[df['MonthClaimed'] == 'Oct', 'MonthClaimed'] = '10'
df.loc[df['MonthClaimed'] == 'Nov', 'MonthClaimed'] = '11'
df.loc[df['MonthClaimed'] == 'Dec', 'MonthClaimed'] = '12'

df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,1,1,Female,Single,...,3,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,1,4,Male,Single,...,6,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,11,2,Male,Married,...,7,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,7,1,Male,Married,...,8,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,2,2,Female,Single,...,5,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [12]:
# Make DayofWeekClaimed column numerical
df.loc[df['DayOfWeekClaimed'] == 'Monday', 'DayOfWeekClaimed'] = '1'
df.loc[df['DayOfWeekClaimed'] == 'Tuesday', 'DayOfWeekClaimed'] = '2'
df.loc[df['DayOfWeekClaimed'] == 'Wednesday', 'DayOfWeekClaimed'] = '3'
df.loc[df['DayOfWeekClaimed'] == 'Thursday', 'DayOfWeekClaimed'] = '4'
df.loc[df['DayOfWeekClaimed'] == 'Friday', 'DayOfWeekClaimed'] = '5'
df.loc[df['DayOfWeekClaimed'] == 'Saturday', 'DayOfWeekClaimed'] = '6'
df.loc[df['DayOfWeekClaimed'] == 'Sunday', 'DayOfWeekClaimed'] = '7'

df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,2,1,1,Female,Single,...,3,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,1,1,4,Male,Single,...,6,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,4,11,2,Male,Married,...,7,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,5,7,1,Male,Married,...,8,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,2,2,2,Female,Single,...,5,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [13]:
#View Column types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [14]:
# Convert newly numerical columns to integers
df['DayOfWeekClaimed'] = df['DayOfWeekClaimed'].astype('int')
df['MonthClaimed'] = df['MonthClaimed'].astype('int')
df['AgeOfVehicle'] = df['AgeOfVehicle'].astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  int64 
 6   MonthClaimed          15420 non-null  int64 
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [15]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [16]:
# Create a list of the columns with categorical variables
categorical_variables = ['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'VehicleCategory', 'VehiclePrice', 
                        'PastNumberOfClaims', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 
                        'AgentType', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy']

# Use the fit_transform method from the OneHotEncoder to encode the categorical variables
encoded_data = enc.fit_transform(df[categorical_variables])


In [17]:
# Create a DataFrame with the encoded variables
encoded_variables = pd.DataFrame(encoded_data,
    columns = enc.get_feature_names(categorical_variables))

encoded_variables.head()



Unnamed: 0,AccidentArea_Rural,AccidentArea_Urban,Sex_Female,Sex_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,Fault_Third Party,...,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_1 vehicle,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Seperate the numerical columns and concat with encoded features
numerical_variables = df[['DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 
                                  'Deductible', 'AgeOfVehicle', 'FraudFound_P']]
encoded_df = pd.concat([encoded_variables, numerical_variables], axis=1)

encoded_df.head()

Unnamed: 0,AccidentArea_Rural,AccidentArea_Urban,Sex_Female,Sex_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,Fault_Third Party,...,NumberOfCars_more than 8,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Deductible,AgeOfVehicle,FraudFound_P
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,2,1,1,300,3,0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1,1,4,400,6,0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,4,11,2,400,7,0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,5,7,1,400,8,0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,2,2,2,400,5,0


In [82]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 57 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   AccidentArea_Rural                  15420 non-null  float64
 1   AccidentArea_Urban                  15420 non-null  float64
 2   Sex_Female                          15420 non-null  float64
 3   Sex_Male                            15420 non-null  float64
 4   MaritalStatus_Divorced              15420 non-null  float64
 5   MaritalStatus_Married               15420 non-null  float64
 6   MaritalStatus_Single                15420 non-null  float64
 7   MaritalStatus_Widow                 15420 non-null  float64
 8   Fault_Policy Holder                 15420 non-null  float64
 9   Fault_Third Party                   15420 non-null  float64
 10  VehicleCategory_Sedan               15420 non-null  float64
 11  VehicleCategory_Sport               15420

## Principal Component Analysis

In [90]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import hvplot.pandas
from sklearn.cluster import KMeans

In [91]:
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,2,1,1,Female,Single,...,3,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,1,1,4,Male,Single,...,6,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,4,11,2,Male,Married,...,7,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,5,7,1,Male,Married,...,8,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,2,2,2,Female,Single,...,5,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  int64 
 6   MonthClaimed          15420 non-null  int64 
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [93]:
pca_data = df[['Age','DayOfWeek','FraudFound_P', 'Deductible', 'DriverRating', 'BasePolicy']]

pca_data.head()

Unnamed: 0,Age,DayOfWeek,FraudFound_P,Deductible,DriverRating,BasePolicy
0,21,Wednesday,0,300,1,Liability
1,34,Wednesday,0,400,4,Collision
2,47,Friday,0,400,3,Collision
3,65,Saturday,0,400,2,Liability
4,27,Monday,0,400,1,Collision


In [94]:
# Plot the clusters using the "Age" and "Deductiblee" columns
pca_data.hvplot.scatter(
    x="Age",
    y="Deductible",
    by="FraudFound_P"
)

In [95]:
# Plot the clusters using the "limit_bal" and "age" columns
pca_data.hvplot.scatter(
    x="Age",
    y="DriverRating",
    by="FraudFound_P"
)

In [96]:
# Plot the clusters using the "limit_bal" and "age" columns
# Plot the clusters using the "limit_bal" and "age" columns
pca_data.hvplot.scatter(
    x="BasePolicy",
    y="Age",
    by="FraudFound_P"
)

In [106]:
pca_data_new = df[['FraudFound_P', 'Deductible', 'Age', 'WeekOfMonth', 'MonthClaimed']]

pca_data_new.head()

Unnamed: 0,FraudFound_P,Deductible,Age,WeekOfMonth,MonthClaimed
0,0,300,21,5,1
1,0,400,34,3,1
2,0,400,47,5,11
3,0,400,65,2,7
4,0,400,27,5,2


In [107]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [156]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 57 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   AccidentArea_Rural                  15420 non-null  float64
 1   AccidentArea_Urban                  15420 non-null  float64
 2   Sex_Female                          15420 non-null  float64
 3   Sex_Male                            15420 non-null  float64
 4   MaritalStatus_Divorced              15420 non-null  float64
 5   MaritalStatus_Married               15420 non-null  float64
 6   MaritalStatus_Single                15420 non-null  float64
 7   MaritalStatus_Widow                 15420 non-null  float64
 8   Fault_Policy Holder                 15420 non-null  float64
 9   Fault_Third Party                   15420 non-null  float64
 10  VehicleCategory_Sedan               15420 non-null  float64
 11  VehicleCategory_Sport               15420

In [165]:
encoded_new = encoded_df.drop(columns = 'FraudFound_P')
encoded_new.head()

Unnamed: 0,AccidentArea_Rural,AccidentArea_Urban,Sex_Female,Sex_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,Fault_Third Party,...,NumberOfCars_5 to 8,NumberOfCars_more than 8,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Deductible,AgeOfVehicle
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2,1,1,300,3
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1,1,4,400,6
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4,11,2,400,7
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,5,7,1,400,8
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2,2,2,400,5


In [166]:
# Fit the PCA model on the transformed credit card DataFrame
pca_df = pca.fit_transform(encoded_new)

# Review the first 5 rows of list data
pca_df[:5]

array([[-107.71371628,    5.49379137],
       [  -7.70612755,    5.34018775],
       [  -7.70336825,   -4.63538254],
       [  -7.69932636,   -0.64747901],
       [  -7.70984869,    4.41017331]])

In [167]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.988455  , 0.00615183])

In [168]:
# Create the PCA DataFrame
pca_df_new = pd.DataFrame(
    pca_df,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
pca_df_new.head()


Unnamed: 0,PCA1,PCA2
0,-107.713716,5.493791
1,-7.706128,5.340188
2,-7.703368,-4.635383
3,-7.699326,-0.647479
4,-7.709849,4.410173


In [171]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=2, random_state=0)

# Fit the model
model.fit(pca_df_new)

# Make predictions
k_2 = model.predict(pca_df_new)

# Create a copy of the customers_pca_df DataFrame
fraud_pca_predictions = pca_df_new.copy()

# Add a class column with the labels
fraud_pca_predictions["FraudFound_P"] = k_2

In [172]:
# Plot the clusters
fraud_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="FraudFound_P"
)

## PCA Observations

# blasdfweijnokmfelwkmfklewm

## Neural Network

In [151]:
# Import packages 

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout

In [152]:
# dataframe

encoded_df.head()

Unnamed: 0,AccidentArea_Rural,AccidentArea_Urban,Sex_Female,Sex_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widow,Fault_Policy Holder,Fault_Third Party,...,NumberOfCars_more than 8,BasePolicy_All Perils,BasePolicy_Collision,BasePolicy_Liability,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Deductible,AgeOfVehicle,FraudFound_P
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,2,1,1,300,3,0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1,1,4,400,6,0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,4,11,2,400,7,0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,5,7,1,400,8,0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,2,2,2,400,5,0


In [175]:
# Assign X and y variables
y = encoded_df['FraudFound_P']
X = encoded_df.copy()
X.drop(columns='FraudFound_P', inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   AccidentArea_Rural                  15420 non-null  float64
 1   AccidentArea_Urban                  15420 non-null  float64
 2   Sex_Female                          15420 non-null  float64
 3   Sex_Male                            15420 non-null  float64
 4   MaritalStatus_Divorced              15420 non-null  float64
 5   MaritalStatus_Married               15420 non-null  float64
 6   MaritalStatus_Single                15420 non-null  float64
 7   MaritalStatus_Widow                 15420 non-null  float64
 8   Fault_Policy Holder                 15420 non-null  float64
 9   Fault_Third Party                   15420 non-null  float64
 10  VehicleCategory_Sedan               15420 non-null  float64
 11  VehicleCategory_Sport               15420

In [176]:
# Split into training and testing windows
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [177]:
# Create the StandardScaler instance
X_scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [178]:
# Define the the number of inputs to the model
number_inputs = 56

# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = 64

# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = 32

# Create the Sequential model instance
nn = Sequential()

# Add the first Dense layer specifying the number of inputs, the number of hidden nodes, and the activation function
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_inputs, activation="relu"))

# Add the second Dense layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

nn.add(Dropout(.2,input_shape=(hidden_nodes_layer2,)))

# Add the second Dense layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=24, activation="relu"))

nn.add(Dropout(.2,input_shape=(24,)))

# Add the second Dense layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=12, activation="relu"))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(1))

In [179]:
# Summarise the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 64)                3648      
                                                                 
 dense_11 (Dense)            (None, 32)                2080      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_12 (Dense)            (None, 24)                792       
                                                                 
 dropout_5 (Dropout)         (None, 24)                0         
                                                                 
 dense_13 (Dense)            (None, 12)                300       
                                                                 
 dense_14 (Dense)            (None, 1)                

In [180]:
# Compile the Sequential model
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [181]:
# Fit the model
nn.fit(X_train_scaled,y_train, 
                    epochs=20,
                    batch_size=100,
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb3ba9ece10>

In [183]:
#evaluating model

nn.evaluate(X_test_scaled, y_test, verbose=2)

121/121 - 2s - loss: 0.0600 - accuracy: 0.9344 - 2s/epoch - 13ms/step


[0.05997014418244362, 0.9343709349632263]

In [None]:
 # Saving prediction for the test data