In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Remove the column-count display limit so wide frames are shown in full.
pd.set_option("display.max_columns",None)

In [None]:
# Load the automobile insurance fraud CSV directly from its raw GitHub URL.
df=pd.read_csv(r"https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/Automobile_insurance_fraud.csv")


In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtypes; later cells split columns into numeric / categorical
# depending on whether the dtype is object ('O').
df.dtypes

In [None]:

# Remove three columns that are not used as features:
# '_c39', the policy identifier and the incident location.
df.drop(columns=['_c39', 'policy_number', 'incident_location'], inplace=True)

In [None]:
# Names of all columns whose dtype is not object, i.e. the numeric features.
num_vars = [name for name, dtype in df.dtypes.items() if dtype != 'O']
df[num_vars]

In [None]:
# Number of NaN values per column. Placeholder strings such as '?' are
# ordinary values and are not counted here.
df.isnull().sum()

In [None]:
# Category frequencies of three categorical columns, printed one after another.
print(
    *(
        df[name].value_counts()
        for name in ('collision_type', 'property_damage', 'police_report_available')
    ),
    sep='\n',
)

In [None]:
# Replace the '?' placeholder in three categorical columns with a fixed
# category. (Categorical columns have no mean; the fill values below are
# presumably the most frequent categories -- verify against the value counts
# printed in the previous cell.)
#
# Only `to_replace` and `value` are passed: `method='pad'` has no effect when
# an explicit `value` is given, and the `method` keyword is deprecated in
# pandas 2.1 and rejected by newer releases.
placeholder_fill = {
    'collision_type': 'Rear Collision',
    'property_damage': 'No',
    'police_report_available': 'No',
}
for column, fill_value in placeholder_fill.items():
    df[column] = df[column].replace(to_replace='?', value=fill_value)

In [None]:
# Preview the data after the '?' placeholders have been replaced.
df.head()

In [None]:
import datetime

# Parse both date columns as day-month-year and keep only the four-digit year.
# NOTE: strftime returns text, so both new columns hold strings (object dtype).
df['policy_year'] = pd.to_datetime(df['policy_bind_date'], format='%d-%m-%Y').dt.strftime('%Y')
df['incident_year'] = pd.to_datetime(df['incident_date'], format='%d-%m-%Y').dt.strftime('%Y')

In [None]:
# The raw date columns are no longer needed once the year has been extracted.
df.drop(columns=['policy_bind_date', 'incident_date'], inplace=True)

# Collect every object-dtype column as a categorical variable, then take
# 'policy_year' out of that list again.
cat_vars = [name for name, dtype in df.dtypes.items() if dtype == 'O']
df[cat_vars]
cat_vars.remove('policy_year')

In [None]:
# checking for any skewness in the dataset
# Draws one 20-bin histogram per column of df (every column is plotted,
# not only the numeric ones), each in its own figure.
for i in df.columns:
    sns.histplot(df[i],bins=20)
    plt.show()

In [None]:
# check for outliers

def boxplots_custom(dataset, columns_list, rows, cols, subtitle):
    """Draw one horizontal boxplot per column on a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    dataset : DataFrame-like
        Object indexed by column name; ``dataset[name]`` is plotted.
    columns_list : sequence of str
        Column names to plot, one subplot each, filled row by row.
    rows, cols : int
        Grid dimensions. ``rows * cols`` must be at least ``len(columns_list)``.
    subtitle : str
        Text shown as the figure-level title.

    Raises
    ------
    ValueError
        If the grid has fewer cells than there are columns to plot.
    """
    columns_list = list(columns_list)
    if len(columns_list) > rows * cols:
        raise ValueError(
            f"{len(columns_list)} columns do not fit in a {rows}x{cols} grid"
        )

    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid,
    # so the flatten() below works for every grid size.
    fig, axs = plt.subplots(rows, cols, sharey=True, figsize=(16,12), squeeze=False)
    fig.suptitle(subtitle, size=14)
    axs = axs.flatten()

    for i, data in enumerate(columns_list):
        # Label only the first subplot of each grid row.
        if i % cols == 0:
            axs[i].set_ylabel('The number of entries')
        sns.boxplot(data=dataset[data], orient='h', ax=axs[i])
        axs[i].set_title(data)

    # Hide grid cells that received no plot instead of showing empty axes.
    for unused_ax in axs[len(columns_list):]:
        unused_ax.set_visible(False)

    # Leave room at the top for the figure title.
    fig.tight_layout(rect=(0, 0, 1, 0.97))

boxplots_custom(dataset=df, columns_list=num_vars, rows=6, cols=3, subtitle='Boxplots before deleting outliers')

In [None]:
# plotting countplot
# One count plot per numeric column, bars split by the fraud label.
# The column is passed by keyword: in recent seaborn releases the only
# positional argument of countplot is `data`, so a positional Series would
# not be interpreted as the x variable.
for i in num_vars:
    plt.figure(figsize=(7,5))
    sns.countplot(data=df, x=i, hue='fraud_reported', palette='GnBu')
    plt.show()

In [None]:
# Strip plot of months_as_customer (y) against age (x), coloured by fraud_reported.
sns.stripplot(x='age',y='months_as_customer',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

There are more genuine claims than fraudulent ones. Months as a customer increases gradually with the age of the insured, which suggests that older, long-standing customers are mostly genuine and less likely to make fraudulent claims.

In [None]:
# Strip plot of injury_claim (y) against months_as_customer (x), coloured by fraud_reported.
sns.stripplot(x='months_as_customer',y='injury_claim',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

In [None]:
# Strip plot of vehicle_claim (y) against months_as_customer (x), coloured by fraud_reported.
sns.stripplot(x='months_as_customer',y='vehicle_claim',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

In [None]:
# Strip plot of injury_claim (y) against total_claim_amount (x), coloured by fraud_reported.
sns.stripplot(x='total_claim_amount',y='injury_claim',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

In [None]:
# Strip plot of property_claim (y) against total_claim_amount (x), coloured by fraud_reported.
sns.stripplot(x='total_claim_amount',y='property_claim',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

In [None]:
# Strip plot of total_claim_amount (y) against vehicle_claim (x), coloured by fraud_reported.
sns.stripplot(x='vehicle_claim',y='total_claim_amount',data=df,jitter=True,hue='fraud_reported',palette='GnBu')

In [None]:
# Annotated correlation heatmap of the numeric columns.
fx, ax = plt.subplots(nrows=1, ncols=1, figsize=(25,19))
ax.set_title('Correlation Matrix', fontsize=16)

# At this point df still contains string (object) columns. Correlation is only
# defined for numeric data, and recent pandas raises instead of silently
# skipping non-numeric columns, so restrict the frame explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, vmin=-1, vmax=1, cmap='GnBu', annot=True, ax=ax)
plt.show()

### Label encoding

In [None]:
#Labelizing categorical variables
# Every column listed in cat_vars is replaced by integer codes.
# NOTE: the same LabelEncoder instance is re-fitted on each column in turn, so
# after this cell `lb` only holds the classes of the last column processed;
# the per-column mappings are not kept.

from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
df[cat_vars]=df[cat_vars].apply(lb.fit_transform)

In [None]:
# Preview the data after label encoding.
df.head()

In [None]:
# deleting outliers
# Inter-quartile range of every numeric column.
Q1, Q3 = df[num_vars].quantile(0.25), df[num_vars].quantile(0.75)
IQR = Q3 - Q1
print('Here we will get IQR for each column\n',IQR)

# Drop every row that has at least one numeric value more than 1.5 * IQR
# below the first quartile or above the third quartile.
df = df[~(df[num_vars].lt(Q1 - 1.5 * IQR) | df[num_vars].gt(Q3 + 1.5 * IQR)).any(axis=1)]
display(df.shape)

In [None]:
# Features: everything except the target and 'incident_year'. Target: the fraud label.
x = df.drop(columns=['fraud_reported', 'incident_year'])
y = df.fraud_reported

In [None]:
# Class distribution of the target before any resampling.
y.value_counts()

### Resampling to balance the classes

In [None]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
# splitting the dataset in to train and test split
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.33, random_state=42)
# combine features and target back so whole rows are resampled together
train_data = pd.concat([X_train, y_train], axis=1)
# separate the two classes
negative = train_data[train_data.fraud_reported==0]
positive = train_data[train_data.fraud_reported==1]
# Decide which class is the minority from the actual counts rather than
# assuming it. Previously the `negative` class was resampled down to
# len(positive), which shrinks the larger class instead of upsampling the
# smaller one as intended.
if len(positive) <= len(negative):
    minority, majority = positive, negative
else:
    minority, majority = negative, positive
# upsample minority
pos_upsampled = resample(minority,
 replace=True, # sample with replacement
 n_samples=len(majority), # match number in majority class
 random_state=27) # reproducible results
# combine majority and upsampled minority
upsampled = pd.concat([majority, pos_upsampled])
# NOTE(review): only the training part is resampled here; X_test / y_test from
# this split remain the untouched hold-out rows.
# check new class counts
upsampled.fraud_reported.value_counts()

In [None]:
# From here on x / y refer to the resampled training rows only.
x = upsampled.drop(columns='fraud_reported')
y = upsampled.fraud_reported

In [None]:
#Removing skewness
from sklearn.preprocessing import power_transform
# Show the skewness of the numeric features before transforming them. A bare
# expression in the middle of a cell is evaluated but never displayed, so the
# result is displayed explicitly.
display(x[num_vars].skew())
# Yeo-Johnson power transform of the numeric columns.
x[num_vars]=power_transform(x[num_vars],method='yeo-johnson')

In [None]:
#import al the necessary libraries

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

In [None]:
# split train test set
# NOTE(review): x / y at this point are the resampled rows, which were drawn
# with replacement. Copies of the same row can therefore land in both the
# train and the test part of this split, which would inflate test scores.
# NOTE(review): this also rebinds y_train / y_test from the earlier split.
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.3,random_state=42)

In [None]:
# Class distribution inside the test split.
y_test.value_counts()

### Scaling the data set

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data only.
standardized_x=scaler.fit_transform(x_train)
# Apply the *training* statistics to the test data. Calling fit_transform here
# would re-fit the scaler on the test set, putting train and test on different
# scales and leaking test-set statistics into the evaluation.
standardized_xtest=scaler.transform(x_test)
df_standardized = pd.DataFrame(data=standardized_x)
df_standardized.head()

In [None]:
# Candidate classifiers as (short name, estimator) pairs, each constructed
# without arguments.
models = [
    ('lr', LogisticRegression()),
    ('rfc', RandomForestClassifier()),
    ('etc', ExtraTreesClassifier()),
    ('bgc', BaggingClassifier()),
    ('gbc', GradientBoostingClassifier()),
    ('dtc', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('bnb', BernoulliNB()),
    ('svc', SVC()),
]


### Cross-validation

In [None]:
#function to find crossValidation score of different models

def basic_model_selection(x,y,cross_folds,model):
    """Print the mean cross-validation score of every candidate model.

    x, y        -- features and target handed to cross_val_score
    cross_folds -- value passed as ``cv``
    model       -- iterable of (name, estimator) pairs

    All models are scored first; afterwards one line ``<name> <mean score>``
    is printed per model, in the order given. Nothing is returned.
    """
    results = [
        (label, cross_val_score(estimator, x, y, cv=cross_folds, n_jobs=-1))
        for label, estimator in model
    ]
    for label, fold_scores in results:
        print(label, fold_scores.mean())

In [None]:
# 10-fold cross-validation of every candidate model on the standardized training data.
basic_model_selection(standardized_x,y_train,10,models)

In [None]:
# Grid search over the number of boosting stages (20, 30, ..., 80) for a
# gradient boosting classifier, using 5-fold cross-validation scored by accuracy.
param_test1 = {'n_estimators': range(20, 81, 10)}
gbc_search = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        subsample=0.8,
        max_depth=8,
        max_features='sqrt',
        min_samples_split=500,
        min_samples_leaf=50,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
)
gbc_search.fit(standardized_x, y_train)

In [None]:
# Best parameter combination found by the grid search and its mean CV score.
gbc_search.best_params_, gbc_search.best_score_

In [None]:
# Gradient boosting classifier with a fixed number of 20 boosting stages.
# NOTE: n_estimators is hard-coded here rather than read from
# gbc_search.best_params_.
gbc = GradientBoostingClassifier(
    n_estimators=20,
    subsample=0.8,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=500,
    min_samples_leaf=50,
    random_state=10,
)

In [None]:
# Fit on the standardized training data, then predict the standardized test data.
gbc.fit(standardized_x,y_train)
pred_gbc=gbc.predict(standardized_xtest)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped classification_report shows
# precision and recall exchanged and computes support from the predictions.
print(accuracy_score(y_test, pred_gbc))
print(classification_report(y_test, pred_gbc))
confusion_matrix(y_test, pred_gbc)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score,accuracy_score,recall_score,roc_auc_score,roc_curve
# Oversample the training data with SMOTE. `n_jobs` is not passed: the
# argument is deprecated in imbalanced-learn and rejected by recent releases,
# and it never influenced the generated samples.
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(standardized_x,y_train)

# Same gradient boosting configuration as the `gbc` model, trained on the
# SMOTE-resampled data.
smote = GradientBoostingClassifier(max_depth=8,
                                max_features='sqrt',
                                min_samples_leaf=50,
                                min_samples_split=500,
                                random_state=10,
                                subsample=0.8,n_estimators=20).fit(X_train,Y_train)

smote_pred = smote.predict(standardized_xtest)

# Checking accuracy
accuracy_score(y_test, smote_pred)

In [None]:
# Metrics expect (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(accuracy_score(y_test, smote_pred))
print(classification_report(y_test, smote_pred))
confusion_matrix(y_test, smote_pred)

### Hyper parameter tuning for Bagging Classifier

In [None]:
# Grid search for a bagging ensemble of random forests: maximum tree depth of
# the wrapped forest and the fraction of rows drawn for each bagged estimator.
bagging_rf = BaggingClassifier(RandomForestClassifier(),
                               n_estimators = 100, max_features = 0.5)

# The wrapped estimator is exposed as `estimator` in recent scikit-learn and
# as `base_estimator` in older releases. Pick whichever name this installation
# reports so the nested parameter key is valid on both.
base_param_name = ('estimator'
                   if 'estimator' in bagging_rf.get_params(deep=False)
                   else 'base_estimator')

param_grid = {
    base_param_name + '__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

clf = GridSearchCV(bagging_rf, param_grid, scoring = 'accuracy')
clf.fit(standardized_x, y_train)

In [None]:
# Bagging classifier (100 estimators, each using half of the features) fitted
# on the standardized training data.
# NOTE(review): despite the `smote_` prefix this model is trained on
# standardized_x / y_train, not on the SMOTE-resampled X_train / Y_train, and
# it does not use the result of the grid search (`clf`) -- confirm intent.
smote_bgc = BaggingClassifier(n_estimators = 100, max_features = 0.5).fit(standardized_x,y_train)

smote_pred_bgc = smote_bgc.predict(standardized_xtest)

# Checking accuracy
accuracy_score(y_test, smote_pred_bgc)

In [None]:
# Metrics expect (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(accuracy_score(y_test, smote_pred_bgc))
print(classification_report(y_test, smote_pred_bgc))
confusion_matrix(y_test, smote_pred_bgc)


In [None]:
# Side-by-side table of the bagging model's predictions and the true labels.
# The target of this notebook is `fraud_reported`; the previous "Loan_Status"
# column labels did not match the data.
predicted_df={"Predicted fraud_reported":smote_pred_bgc,"Original fraud_reported":y_test}
predicted_df=pd.DataFrame(predicted_df)
y_test.value_counts()

### Creating the pickle file

In [None]:
import pickle
filename = 'Insurance.pkl'
# Persist the fitted bagging model. The context manager guarantees the file is
# flushed and closed even if pickling fails; a bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(smote_bgc, model_file)

In [None]:
# Reload the model and check that it still scores the test data.
# SECURITY: pickle.load can execute arbitrary code -- only load files that
# were produced by this notebook or another trusted source.
with open('Insurance.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(standardized_xtest, y_test)
print(result*100)

In [None]:
# Two-row frame ("Predicted" / "Original") built from the reloaded model's
# predictions on the test data and the true test labels.
# NOTE(review): presumably yields one column per test sample -- verify the
# displayed shape.
conclusion=pd.DataFrame([loaded_model.predict(standardized_xtest)[:],y_test[:]],index=["Predicted","Original"])
conclusion