In [1]:
#Cross Validation
#Cross-validation is a technique for evaluating ML models by training several ML models on subsets of the available input data and evaluating them on the complementary subset of the data. Use cross-validation to detect overfitting, ie, failing to generalize a pattern.

#The three steps involved in cross-validation are as follows :

 # 1. Reserve some portion of sample data-set.
  #2. Using the rest data-set train the model.
  #3. Test the model using the reserve portion of the data-set.

In [2]:
# Stratified k-fold cross validation

# StratifiedKFold is a variation of KFold. First, StratifiedKFold shuffles your data, after that splits the data into n_splits parts and Done. Now, it will use each part as a test set. Note that it only and always shuffles data one time before splitting.

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,auc
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedGroupKFold
import warnings
warnings.simplefilter('ignore')

In [4]:
cancer = load_breast_cancer()

In [5]:
df = pd.DataFrame(cancer.data,columns=cancer.feature_names)
df['target'] = pd.Series(cancer.target)


In [6]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [7]:
x = df.drop('target',axis=1)

In [8]:
y = df['target'].astype('category')

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [11]:
lr = LogisticRegression()

In [12]:
lr.fit(x_train,y_train)

In [13]:
pred = lr.predict(x_test)

In [14]:
confusion_matrix(pred,y_test)

array([[38,  2],
       [ 2, 72]], dtype=int64)

In [15]:
roc_auc_score(pred,y_test)

0.9614864864864865

In [17]:
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=45)
pred_test_full =0
cv_score =[]
i=1
for train_index,test_index in kf.split(x,y):
    print('{} of KFold {}'.format(i,kf.n_splits))
    
    ### Training Set
    xtr,xvl = x.iloc[train_index],x.iloc[test_index]
    
    ### Validation Set
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    
    #model
    lr = LogisticRegression(C=2)
    lr.fit(xtr,ytr)
    score = roc_auc_score(yvl,lr.predict(xvl))
    print('ROC AUC score:',score)
    cv_score.append(score)
    
#     pred_test = lr.predict_proba(x_test)[:,1]
#     pred_test_full +=pred_test
    i+=1

<IPython.core.display.Javascript object>

1 of KFold 5
ROC AUC score: 0.9415329184408779
2 of KFold 5
ROC AUC score: 0.932361611529643
3 of KFold 5
ROC AUC score: 0.9384920634920635
4 of KFold 5
ROC AUC score: 0.9811507936507937
5 of KFold 5
ROC AUC score: 0.9312541918175722


In [18]:
print('Confusion matrix\n',confusion_matrix(yvl,lr.predict(xvl)))
print('Cv',cv_score,'\nMean cv Score',np.mean(cv_score))

Confusion matrix
 [[38  4]
 [ 3 68]]
Cv [0.9415329184408779, 0.932361611529643, 0.9384920634920635, 0.9811507936507937, 0.9312541918175722] 
Mean cv Score 0.9449583157861902
