---
## Classification practice on claimants data <br> <font size=3.4> Dataset:claimants.csv<br><font size=2>Used logistic regression, various model validation techniques, and different ensemble methods to find the best model
---

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
filename= '/Users/Ketan/Datasets/claimants.csv'
df= pd.read_csv(filename)
df

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,5,0,0.0,1.0,0.0,50.0,34.940
1,3,1,1.0,0.0,0.0,18.0,0.891
2,66,1,0.0,1.0,0.0,5.0,0.330
3,70,0,0.0,1.0,1.0,31.0,0.037
4,96,1,0.0,1.0,0.0,30.0,0.038
...,...,...,...,...,...,...,...
1335,34100,1,0.0,1.0,0.0,,0.576
1336,34110,0,1.0,1.0,0.0,46.0,3.705
1337,34113,1,1.0,1.0,0.0,39.0,0.099
1338,34145,0,1.0,0.0,0.0,8.0,3.177


In [3]:
df = df.iloc[:,1:]
df

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.940
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.330
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038
...,...,...,...,...,...,...
1335,1,0.0,1.0,0.0,,0.576
1336,0,1.0,1.0,0.0,46.0,3.705
1337,1,1.0,1.0,0.0,39.0,0.099
1338,0,1.0,0.0,0.0,8.0,3.177


In [4]:
df.isnull().sum()

ATTORNEY      0
CLMSEX       12
CLMINSUR     41
SEATBELT     48
CLMAGE      189
LOSS          0
dtype: int64

In [5]:
df['CLMAGE']=df['CLMAGE'].fillna(df['CLMAGE'].mean())
df.isnull().sum()

ATTORNEY     0
CLMSEX      12
CLMINSUR    41
SEATBELT    48
CLMAGE       0
LOSS         0
dtype: int64

In [6]:
df = df.fillna(df.mode().iloc[0])

In [7]:
X= df.iloc[:,1:]
Y= df.iloc[:,0]

## Using simple logistic regression first

In [8]:
#Logistic regression and fit the model
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()
model1.fit(X,Y)
#Predict for X dataset
y_pred1 = model1.predict(X)
y_pred_df1= pd.DataFrame({'actual': Y,
                         'predicted_prob': model1.predict(X)})
y_pred_df1

Unnamed: 0,actual,predicted_prob
0,0,0
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
1335,1,1
1336,0,0
1337,1,1
1338,0,0


In [9]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(Y,y_pred1))

              precision    recall  f1-score   support

           0       0.75      0.64      0.69       685
           1       0.67      0.78      0.72       655

    accuracy                           0.70      1340
   macro avg       0.71      0.71      0.70      1340
weighted avg       0.71      0.70      0.70      1340



In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

fpr, tpr, thresholds = roc_curve(Y, model1.predict_proba (X)[:,1])

auc = roc_auc_score(Y, y_pred1)

import matplotlib.pyplot as plt
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.show()

In [11]:
auc

0.7053045077171672

## Selecting the model validation technique

### Trial 1 : Train Test split approach

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model2 = LogisticRegression()
model2.fit(X_train, Y_train)
result2 = model2.score(X_test, Y_test)
np.round(result2, 4)

0.702

### Trial 2 : Cross Validation approach

In [13]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model3 = LogisticRegression(max_iter=400)
results3 = cross_val_score(model3, X, Y, cv=kfold)
print('Result:',np.round(results3.mean(),4),'\n','\n','Standard dev:',np.round(results3.std(),4))

Result: 0.7022 
 
 Standard dev: 0.0297


### Trial 3 : Leave One Out Cross Validation approach

In [14]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
loocv = LeaveOneOut()
model4 = LogisticRegression(max_iter=400)
results4 = cross_val_score(model4, X, Y, cv=loocv)
print('Result:',np.round(results4.mean(),4),'\n','\n','Standard dev:',np.round(results4.std(),4))

Result: 0.6985 
 
 Standard dev: 0.4589


### Since the CV score for Cross Validation approach is the best, so we'll finalise this as our model validation technique,Now, let's try some Ensemble methods to see if we can further increase the accuracy of the model

### Trial-1: Bagging

In [15]:
# Bagged Decision Trees for Classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

seed = 7
cart = DecisionTreeClassifier()
num_trees = 100
model5 = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results5 = cross_val_score(model5, X, Y, cv=kfold)
print(results5.mean())

0.6626865671641792


### Trial-2: Random Forest

In [16]:
# Random Forest Classification

from sklearn.ensemble import RandomForestClassifier

num_trees = 100
max_features = 2
model6 = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results6 = cross_val_score(model6, X, Y, cv=kfold)
print(results6.mean())

0.673134328358209


### Trial-3: Boosting

In [17]:
# AdaBoost Classification

from sklearn.ensemble import AdaBoostClassifier
num_trees = 10
seed=7

model7 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results7 = cross_val_score(model7, X, Y, cv=kfold)
print(results7.mean())

0.7216417910447761


### Trial-4: Stacking

In [18]:
# Stacking Ensemble for Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []
model8_1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model8_1))
model8_2 = DecisionTreeClassifier()
estimators.append(('cart', model8_2))
model8_3 = SVC()
estimators.append(('svm', model8_3))

# create the ensemble model
ensemble1 = VotingClassifier(estimators)
results8 = cross_val_score(ensemble1, X, Y, cv=kfold)
print(results8.mean())

0.708955223880597


In [19]:
# create the sub models
estimators = []
model9_1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model9_1))
model9_2 = DecisionTreeClassifier()
estimators.append(('cart', model9_2))
model9_3 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('Boosting', model9_3))

# create the ensemble model
ensemble2 = VotingClassifier(estimators)
results9 = cross_val_score(ensemble2, X, Y, cv=kfold)
print(results9.mean())

0.7238805970149254


In [20]:
# create the sub models
estimators = []
model10_1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model10_1))
model10_2 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model10_2))
model10_3 = SVC()
estimators.append(('svm', model10_3))

# create the ensemble model
ensemble3 = VotingClassifier(estimators)
results10 = cross_val_score(ensemble2, X, Y, cv=kfold)
print(results9.mean())

0.7104477611940297


In [21]:
# create the sub models
estimators = []
model11_1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model11_1))
model11_2 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model11_2))


# create the ensemble model
ensemble4 = VotingClassifier(estimators)
results11 = cross_val_score(ensemble4, X, Y, cv=kfold)
print(results11.mean())

0.7208955223880598
