In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

from sklearn.svm import SVC

In [None]:
# Raise pandas' display limits so long/wide frames are rendered without truncation
for option_name, limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, limit)

In [None]:
# Load the raw insurance-claims table from the working directory
df = pd.read_csv(filepath_or_buffer='insurance_claims.csv')

In [None]:
# Drop the columns that are not used as model features
df = df.drop(columns=[
    'months_as_customer', 'policy_number', 'policy_bind_date',
    'policy_csl', 'insured_zip', 'insured_occupation', 'insured_hobbies',
    'incident_date', 'incident_state', 'incident_city', 'incident_location',
    'auto_make', 'auto_model', '_c39', 'property_damage',
    'police_report_available', 'collision_type', 'authorities_contacted',
])

# Encode the target "fraud_reported" as an integer: 'Y' -> 1, 'N' -> 0.
# Series.map + astype gives a genuine int64 column on every pandas version;
# DataFrame.replace only produced integers through silent object-dtype
# downcasting, which is deprecated since pandas 2.2. astype also raises if a
# value other than 'Y'/'N' is present instead of letting it pass through.
df = df.assign(
    fraud_reported=df['fraud_reported'].map({'Y': 1, 'N': 0}).astype('int64')
)

df.head()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string-valued categorical columns here, and DataFrame.corr() raises on those
# from pandas 2.0 onward (numeric_only now defaults to False).
# Observation: total_claim_amount, injury_claim, property_claim and
# vehicle_claim are strongly correlated with one another (roughly 0.56 to 1).
df.select_dtypes(include='number').corr()

In [None]:
# Sanity check: list the distinct values present in the target column
pd.unique(df['fraud_reported'])

In [None]:
# One-hot encoding for the categorical variables.
# Category values, listed in the same order as `variables` below:
#   ['OH', 'IN', 'IL']
#   ['MALE', 'FEMALE']
#   ['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College', 'JD']
#   ['husband', 'other-relative', 'own-child', 'unmarried', 'wife', 'not-in-family']
#   ['Single Vehicle Collision', 'Vehicle Theft', 'Multi-vehicle Collision', 'Parked Car']
#   ['Major Damage', 'Minor Damage', 'Total Loss', 'Trivial Damage']

variables = ['policy_state', 'insured_sex', 'insured_education_level', 'insured_relationship',
             'incident_type', 'incident_severity']

# Cast all categorical columns in one step ...
df = df.astype(dict.fromkeys(variables, 'category'))

# ... then encode exactly once (previously get_dummies ran inside the loop, so
# the whole frame was re-encoded on every iteration and only the last result
# was kept). get_dummies expands every object/category column; drop_first
# removes one level per variable to avoid perfectly collinear dummy columns.
df_num = pd.get_dummies(df, prefix_sep='_', drop_first=True)

In [None]:
# Separate the features from the target, then hold out 20% of the rows for
# validation (fixed random_state so the split is reproducible)
x = df_num.drop(columns='fraud_reported')
y = df_num.loc[:, 'fraud_reported']
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=1,
)

In [None]:
# It is usually a good idea to scale the features before training an SVM.
# The scaler is fitted on the training split only (fitting it on the
# validation rows as well would leak information), and the same fitted
# transform is then applied to both splits.
#
# MinMaxScaler computes, per feature:
#   X_std    = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#   X_scaled = X_std * (max - min) + min
# where min, max = feature_range.

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# SVM with a linear kernel; every other hyperparameter is left at its default
svc_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred_l = svc_linear.predict(X_val)

# Report accuracy and F1 on the validation split
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_l), sep='\n')
print('F1 Score:', metrics.f1_score(y_val, y_pred_l), sep='\n')

In [None]:
# SVM with a polynomial kernel of degree 2
svc_poly = SVC(kernel='poly', degree=2).fit(X_train, y_train)
y_pred_p = svc_poly.predict(X_val)

# Report accuracy and F1 on the validation split
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_p), sep='\n')
print('F1 Score:', metrics.f1_score(y_val, y_pred_p), sep='\n')

In [None]:
# SVM with an RBF kernel; gamma is not set explicitly, so SVC's default is used.
# Background on how gamma and C shape the RBF decision boundary:
# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html#:~:text=Intuitively%2C%20the%20gamma%20parameter%20defines,the%20model%20as%20support%20vectors.
svc_r = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_r = svc_r.predict(X_val)

# Report accuracy and F1 on the validation split
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_r), sep='\n')
print('F1 Score:', metrics.f1_score(y_val, y_pred_r), sep='\n')

In [None]:
# TODO (next steps):
# - tune the SVM hyperparameters with a grid search
# - visualise the final result with t-SNE