In [1]:
# the objective of this project is to apply cross validation and GridSearchCV on a data

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as st
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width',1000)

In [3]:
# Lets take a data set with classification problem

In [4]:
colnames=['buying','maint','doors','persons','lug_boot','safety','target']

In [5]:
cars=pd.read_csv('cars.csv',names=colnames)
cars.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
# lets explore the features

In [7]:
cars.shape

(1728, 7)

In [8]:
cars.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
target      0
dtype: int64

In [9]:
cars.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
target      object
dtype: object

In [10]:
cars.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target'], dtype='object')

In [11]:
cars['buying'].value_counts()

high     432
low      432
vhigh    432
med      432
Name: buying, dtype: int64

In [12]:
cars['maint'].value_counts()

high     432
low      432
vhigh    432
med      432
Name: maint, dtype: int64

In [13]:
cars['doors'].value_counts()

5more    432
2        432
3        432
4        432
Name: doors, dtype: int64

In [14]:
cars['persons'].value_counts()

2       576
more    576
4       576
Name: persons, dtype: int64

In [15]:
cars['lug_boot'].value_counts()

big      576
small    576
med      576
Name: lug_boot, dtype: int64

In [16]:
cars['safety'].value_counts()

high    576
low     576
med     576
Name: safety, dtype: int64

In [17]:
cars['target'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: target, dtype: int64

In [18]:
# the features are balanced

In [19]:
# lets encode the features and target

In [20]:
from sklearn.preprocessing import LabelEncoder
def le(df,col):
    lab=LabelEncoder()
    df[col]=pd.Series(lab.fit_transform(df[col]))

In [21]:
cars.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target'], dtype='object')

In [22]:
le(cars,'buying')
le(cars,'maint')
le(cars,'doors')
le(cars,'persons')
le(cars,'lug_boot')
le(cars,'safety')
le(cars,'target')

In [23]:
cars.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
5,3,3,0,0,1,0,2
6,3,3,0,0,0,1,2
7,3,3,0,0,0,2,2
8,3,3,0,0,0,0,2
9,3,3,0,1,2,1,2


In [24]:
# lets check the distribution and outliers

In [25]:
cols=cars.columns.values

In [26]:
cars.skew()

buying      0.000000
maint       0.000000
doors       0.000000
persons     0.000000
lug_boot    0.000000
safety      0.000000
target     -0.988383
dtype: float64

In [27]:
# there is no skewness

In [28]:
from scipy.stats import zscore
z=np.abs(zscore(cars))
np.where(z>3)

(array([], dtype=int64), array([], dtype=int64))

In [29]:
# no outliers

In [30]:
# now, lets build a DecisionTree and SVC on the dataset and apply cross validation and GrigSearchCV

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score,f1_score,recall_score

from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate

In [45]:
# Now, let's define some functions to reuse them
def fitc(m,x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    m.fit(x_train,y_train)
    
def get_scorec(m,x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    m.fit(x_train,y_train)
    train_score=m.score(x_train,y_train)
    pred=m.predict(x_test)
    test_score=accuracy_score(pred,y_test)
    return train_score,test_score

def get_predc(m,x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    return pred

def get_metricsc(m,x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    acc=accuracy_score(pred,y_test)
    conf=confusion_matrix(pred,y_test)
    clas=classification_report(pred,y_test)
    f1=f1_score(y_test,pred,average='weighted')
    recall=f1_score(pred,y_test,average='weighted')
    precision=f1_score(pred,y_test,average='weighted')
    return acc,f1,recall,precision,conf,clas

from sklearn.model_selection import GridSearchCV
def gridcvc(m,p,x,y):
    gkb=GridSearchCV(m,p,n_jobs=-1,cv=5,scoring='accuracy')
    gkb.fit(x,y)
    return gkb.best_params_


from sklearn.model_selection import RandomizedSearchCV
def randomcvc(m,p,x,y):
    from sklearn.model_selection import RandomizedSearchCV
    gkb=RandomizedSearchCV(estimator=m,param_distributions=p,
                          cv=3,n_iter=50,
                          n_jobs=-1, verbose=5,return_train_score=True,random_state=45)
    gkb.fit(x,y)
    return gkb.best_params_

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
def bfec(m,x,y,k):
    sf=sfs(m, k_features=k, forward=False, cv=5, n_jobs=-1)
    sf.fit(x,y)
    feat_names=list(sf.k_feature_names_)
    xbfe=x[feat_names]
    return xbfe

def ffsc(m,x,y,k):
    sf=sfs(m, k_features=k, forward=True, cv=5, n_jobs=-1)
    sf.fit(x,y)
    feat_names=list(sf.k_feature_names_)
    xffs=x[feat_names]
    return xffs

from sklearn.decomposition import PCA
def nforpcac(x):
    pca=PCA().fit(x)
    plt.figure()
    plt.plot(np.cumsum(pca.explained_variance_ratio_)*100)
    plt.xlabel('No of Components')
    plt.ylabel('Variance')
    plt.axhline(y=95, color='r', linestyle='--')
    plt.show()
    
def pcac(m,x,n):
    pca=PCA(n_components=n)
    xpca=pd.DataFrame(pca.fit_transform(x))
    return xpca

def crossc(m,x,y,c=5):
    score=cross_val_score(m,x,y,cv=c,n_jobs=-1)
    return (score.mean(), score.std())

from sklearn.preprocessing import StandardScaler
def stdc(x):
    scx=StandardScaler()
    xs=pd.DataFrame(scx.fit_transform(x), columns=x.columns)
    return xs

from sklearn.preprocessing import MinMaxScaler
def mmc(x):
    mm=MinMaxScaler()
    xm=pd.DataFrame(mm.fit_transform(x), columns=x.columns)
    return xm

def modified_bfec(m,x,y,k):
    sf=sfs(m, k_features=k, forward=False, cv=5, n_jobs=-1)
    sf.fit(x,y)
    feat_names=list(sf.k_feature_names_)
    xbfe=x[feat_names]
    x_train, x_test, y_train, y_test=train_test_split(xbfe,y,test_size=0.25,random_state=45,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    test_score=accuracy_score(pred,y_test)
    return test_score

def modified_ffsc(m,x,y,k):
    sf=sfs(m, k_features=k, forward=True, cv=5, n_jobs=-1)
    sf.fit(x,y)
    feat_names=list(sf.k_feature_names_)
    xffs=x[feat_names]
    x_train, x_test, y_train, y_test=train_test_split(xffs,y,test_size=0.25,random_state=45,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    test_score=accuracy_score(pred,y_test)
    return test_score

def modified_pcac(m,x,y,n):
    pca=PCA(n_components=n)
    xpca=pd.DataFrame(pca.fit_transform(x))
    x_train, x_test, y_train, y_test=train_test_split(xpca,y,test_size=0.25,random_state=45,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    test_score=accuracy_score(pred,y_test)
    return test_score

def ttsc(x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    return x_train, x_test, y_train, y_test

def classification(m,x,y,rs=45):
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=rs,stratify=y)
    m.fit(x_train,y_train)
    pred=m.predict(x_test)
    clas=classification_report(pred,y_test)
    return clas

In [33]:
dtc=DecisionTreeClassifier(random_state=45)
svc=SVC(random_state=45)

In [34]:
# lets seperate the data
x=cars.drop('target',axis=1)
y=cars['target']

In [35]:
xs=stdc(x)

In [36]:
print("DT ", get_scorec(dtc,x,y))
print("SVC ", get_scorec(svc,x,y))

DT  (1.0, 0.9861111111111112)
SVC  (0.9382716049382716, 0.9027777777777778)


In [37]:
print("DT ", get_scorec(dtc,xs,y))
print("SVC ", get_scorec(svc,xs,y))

DT  (1.0, 0.9861111111111112)
SVC  (0.9344135802469136, 0.8819444444444444)


In [88]:
# both the models are performing well with x and y
# but as the score of DTC is high, we'll go for it

# DecisionTree

In [39]:
get_scorec(dtc,x,y)

(1.0, 0.9861111111111112)

In [46]:
acc,f1,recall,precision,conf,clas=get_metricsc(dtc,x,y)
print(acc)
print(' ')
print(f1)
print(' ')
print(recall)
print(' ')
print(precision)
print(' ')
print(conf)
print(' ')
print(clas)

0.9861111111111112
 
0.9861346145437053
 
0.9860876076785167
 
0.9860876076785167
 
[[ 93   0   3   0]
 [  1  17   0   0]
 [  2   0 300   0]
 [  0   0   0  16]]
 
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        96
           1       1.00      0.94      0.97        18
           2       0.99      0.99      0.99       302
           3       1.00      1.00      1.00        16

    accuracy                           0.99       432
   macro avg       0.99      0.98      0.98       432
weighted avg       0.99      0.99      0.99       432



In [53]:
# lets apply cross validation
for i in range(2,15):
    print(i, ' ', crossc(dtc,x,y,i))

2   (0.7436881693782276, 0.04657834278863221)
3   (0.7893906041996331, 0.027845506719750033)
4   (0.7738629038693126, 0.07596755467824605)
5   (0.7859826589595376, 0.10754047481675459)
6   (0.8582753848045224, 0.04808869585801243)
7   (0.80436026302179, 0.06239309545699675)
8   (0.838060230321706, 0.07780428944635684)
9   (0.8879644005340345, 0.08197331459134659)
10   (0.8690421891015943, 0.1036335090095845)
11   (0.8874234617819027, 0.0865013422033242)
12   (0.9352746756553216, 0.03840366861707017)
13   (0.9265039926469621, 0.043623538643733674)
14   (0.9496066945909554, 0.03039862655971016)


In [51]:
# lets apply a 10 fold stratified cross validation
score=cross_val_score(dtc,x,y,cv=10,scoring='accuracy')
score

array([0.62643678, 0.79310345, 0.95402299, 0.77011494, 0.9132948 ,
       0.95930233, 0.87209302, 0.88953488, 0.95930233, 0.95321637])

In [52]:
score.mean(),score.std()

(0.8690421891015943, 0.1036335090095845)

In [54]:
# lets apply a 15 fold stratified cross validation
score=cross_val_score(dtc,x,y,cv=15,scoring='accuracy')
score

array([0.97435897, 0.91452991, 0.95726496, 0.97435897, 0.99145299,
       0.92241379, 0.92241379, 0.97413793, 0.93103448, 0.86842105,
       0.91150442, 0.90265487, 0.99115044, 0.96460177, 0.97345133])

In [55]:
score.mean(),score.std()

(0.9449166464303863, 0.035709782206286005)

In [56]:
# lets apply a 20 fold stratified cross validation
score=cross_val_score(dtc,x,y,cv=15,scoring='accuracy')
score

array([0.97435897, 0.91452991, 0.95726496, 0.97435897, 0.99145299,
       0.92241379, 0.92241379, 0.97413793, 0.93103448, 0.86842105,
       0.91150442, 0.90265487, 0.99115044, 0.96460177, 0.97345133])

In [57]:
score.mean(),score.std()

(0.9449166464303863, 0.035709782206286005)

In [58]:
# there is no problem of over fitting or under fitting

In [59]:
# lets tune the model using grid search

In [74]:
DecisionTreeClassifier?

In [61]:
param_dtc={'criterion':['gini','entropy'],'splitter':['best','random'],'max_depth':[1,2,4,6],'min_samples_split':[2,4,6,8],
          'min_samples_leaf':[1,2,4],'min_weight_fraction_leaf':[0.0,0.1,0.2]}
gridcvc(dtc,param_dtc,x,y)

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'random'}

In [62]:
# lets further converge it
param_dtc={'criterion':['gini','entropy'],'splitter':['best','random'],'max_depth':[3,4,5,6],'min_samples_split':[2,3,4,5],
          'min_samples_leaf':[1,2,3],'min_weight_fraction_leaf':[0.0,0.1,0.2]}
gridcvc(dtc,param_dtc,x,y)

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'random'}

In [63]:
dtc1=DecisionTreeClassifier(random_state=45,criterion= 'entropy',
 max_depth= 4,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 splitter= 'random')
get_scorec(dtc1,x,y)

(0.8287037037037037, 0.7870370370370371)

In [64]:
acc,f1,recall,precision,conf,clas=get_metricsc(dtc1,x,y)
print(acc)
print(' ')
print(f1)
print(' ')
print(recall)
print(' ')
print(precision)
print(' ')
print(conf)
print(' ')
print(clas)

0.7870370370370371
 
0.770607955632052
 
0.8034661184420221
 
0.8034661184420221
 
[[ 79  17  42  16]
 [  0   0   0   0]
 [ 17   0 261   0]
 [  0   0   0   0]]
 
              precision    recall  f1-score   support

           0       0.82      0.51      0.63       154
           1       0.00      0.00      0.00         0
           2       0.86      0.94      0.90       278
           3       0.00      0.00      0.00         0

    accuracy                           0.79       432
   macro avg       0.42      0.36      0.38       432
weighted avg       0.85      0.79      0.80       432



In [65]:
# the score has reduced
dtc2=DecisionTreeClassifier(random_state=45,criterion= 'gini',
 max_depth= 4,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 splitter= 'best')
get_scorec(dtc2,x,y)

(0.8294753086419753, 0.7800925925925926)

In [67]:
# lets further converge it
param_dtc1={'max_depth':[2,3,4,5,6],'min_samples_split':[2,3,4,5],
          'min_samples_leaf':[1,2,3,4],'min_weight_fraction_leaf':[0.0,0.1,0.2]}
gridcvc(dtc,param_dtc1,x,y)

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [69]:
# lets further converge it
param_dtc2={'max_depth':[6,7,8,9,10,11],'min_samples_split':[2,3,4,5],
          'min_samples_leaf':[1,2,3,4],'min_weight_fraction_leaf':[0.0,0.1,0.2]}
gridcvc(dtc,param_dtc2,x,y)

{'max_depth': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [70]:
dtc3=DecisionTreeClassifier(random_state=45,
 max_depth= 8,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0)
get_scorec(dtc3,x,y)

(0.9498456790123457, 0.9583333333333334)

In [73]:
# the score is still lesser than the untuned model
# lets do further tuning
# lets further converge it
param_dtc3={'max_depth':[6,7,8,9,10,11],'min_samples_split':[2,3,4,5],
          'min_samples_leaf':[1,2,3,4],'min_weight_fraction_leaf':[0.0,0.1,0.2],
           'min_impurity_decrease':[0.0,0.1,0.2,0.5]}
gridcvc(dtc,param_dtc3,x,y)

{'max_depth': 8,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [77]:
param_dtc4={'max_depth':[6,7,8,9,10,11],'min_samples_split':[2,3,4,5],
          'min_samples_leaf':[1,2,3,4],'min_weight_fraction_leaf':[0.0,0.1,0.2],
           'min_impurity_decrease':[0.0,0.1,0.2,0.5],
           'max_leaf_nodes':[2,5,7,9]}
gridcvc(dtc,param_dtc4,x,y)

{'max_depth': 6,
 'max_leaf_nodes': 5,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [78]:
dtc4=DecisionTreeClassifier(random_state=45,
 max_depth= 8,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,max_leaf_nodes=5)
get_scorec(dtc4,x,y)

(0.8155864197530864, 0.7662037037037037)

In [79]:
dtc3=DecisionTreeClassifier(random_state=45,
 max_depth= 8,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0)
get_scorec(dtc3,x,y)

(0.9498456790123457, 0.9583333333333334)

In [80]:
param_dtc5={'max_features':['auto','sqrt','log2'],'presort':[True,False]}
gridcvc(dtc3,param_dtc5,x,y)

{'max_features': 'auto', 'presort': True}

In [81]:
dtc3a=DecisionTreeClassifier(random_state=45,
 max_depth= 8,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,max_features='auto', presort= True)
get_scorec(dtc3a,x,y)

(0.8487654320987654, 0.7986111111111112)

In [82]:
# lets cross validate dtc3,x,y
for i in range(2,21):
    print(i, ' ', crossc(dtc3,x,y,i))

2   (0.7338495234395408, 0.04598825176324017)
3   (0.7749239831629535, 0.029186104537714246)
4   (0.7692425432010325, 0.06219946478494228)
5   (0.7906102970829412, 0.08250320008568311)
6   (0.8374439985282672, 0.02416669102281545)
7   (0.7882727121468808, 0.05162592163999623)
8   (0.8251788322972691, 0.07866989119281241)
9   (0.8756424081461718, 0.0792333099453897)
10   (0.8405803892496507, 0.09302669895246415)
11   (0.8492063338190576, 0.08549000165277448)
12   (0.899309438122272, 0.05650699311958264)
13   (0.8986322222771553, 0.05925691551072495)
14   (0.9116828470500818, 0.06501085713293689)
15   (0.9013042780159687, 0.062389090703528834)
16   (0.9141471960812504, 0.05905773240585247)
17   (0.9118731045187181, 0.06946449079822369)
18   (0.9089057181876918, 0.05960132081518445)
19   (0.9226395123366797, 0.0592770567905215)
20   (0.9145167174276004, 0.06493639733786405)


In [83]:
# the best score is at cv=19

In [84]:
# lets apply a 19 fold stratified cross validation
score=cross_val_score(dtc3,x,y,cv=19,scoring='accuracy')
score

array([0.91397849, 0.95698925, 0.95698925, 0.91397849, 0.95652174,
       0.97826087, 0.97826087, 0.92391304, 0.94505495, 0.96703297,
       0.91208791, 0.96703297, 0.75555556, 0.79775281, 0.84269663,
       0.96629213, 0.93258427, 0.92134831, 0.94382022])

In [85]:
score.mean(),score.std()

(0.9226395123366797, 0.0592770567905215)

In [86]:
acc,f1,recall,precision,conf,clas=get_metricsc(dtc3,x,y)
print(acc)
print(' ')
print(f1)
print(' ')
print(recall)
print(' ')
print(precision)
print(' ')
print(conf)
print(' ')
print(clas)

0.9583333333333334
 
0.9588766694102431
 
0.9577899972564238
 
0.9577899972564238
 
[[ 91   3   9   0]
 [  4  13   0   0]
 [  1   1 294   0]
 [  0   0   0  16]]
 
              precision    recall  f1-score   support

           0       0.95      0.88      0.91       103
           1       0.76      0.76      0.76        17
           2       0.97      0.99      0.98       296
           3       1.00      1.00      1.00        16

    accuracy                           0.96       432
   macro avg       0.92      0.91      0.92       432
weighted avg       0.96      0.96      0.96       432



In [87]:
# the best model is dtc
# lets save it

In [89]:
pred=get_predc(dtc,x,y)

In [90]:
predic=pd.DataFrame(pred)
predic.to_csv('pred_cars.csv')

In [91]:
import joblib
joblib.dump(dtc,'dtc_cars.pkl')

['dtc_cars.pkl']