# Using Cross Validation in Sklearn

There are three ways to use **cross validation** in Sklearn.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the iris data as a (feature matrix, label vector) pair rather than a Bunch object.
X, y = datasets.load_iris(return_X_y=True)
# Show both shapes up front so the sample count is visible before any splitting.
print(np.shape(X), np.shape(y))

In [5]:
# method 1: using cross_val_score and StratifiedKFold

# cross_val_score picks its splitter from the cv argument: when cv is an int or None and
# the estimator is a classifier with a binary or multiclass y, StratifiedKFold is used
# (but without shuffling the records). In all other cases, KFold is used.
# If the records need to be shuffled before the data is split, create a StratifiedKFold
# object yourself and pass it to cross_val_score as cv.

from sklearn.model_selection import StratifiedKFold, cross_val_score

cv_fold = 10      # number of folds
random_seed = 10  # seed for the shuffled splitter

# CASE 1
# cross_val_score with an integer cv, i.e. without shuffling the records

# Note: y should be an array
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(clf, X, y, cv=cv_fold)

# scores is an array of per-fold results, so it is printed as-is and then summarised
print(scores)
accuracy_mean = scores.mean()
accuracy_spread = scores.std() * 2  # reported as +/- two standard deviations
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy_mean, accuracy_spread))

# CASE 2:
# cross_val_score driven by an explicit StratifiedKFold splitter, which enables shuffling of records
# Only pass random_state together with shuffle=True; leave it unset when shuffle is False
skf = StratifiedKFold(
    n_splits=cv_fold,
    shuffle=True,
    random_state=random_seed,
)
print("Cross validation splitter: ", skf)

clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(clf, X, y, cv=skf)

cv_mean = scores.mean()
cv_spread = scores.std() * 2  # reported as +/- two standard deviations
print("Cross validation score: %0.4f (+/-%0.4f)" % (cv_mean, cv_spread))

[1.         0.93333333 1.         1.         0.86666667 1.
 0.93333333 1.         1.         1.        ]
Accuracy: 0.97 (+/- 0.09)
Cross validation splitter:  StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
Cross validation score: 0.9800 (+/-0.0611)


In [11]:
# method 2: pre-processing the training/testing data before each cross validation iteration
# Note: X and y should be arrays rather than a DataFrame or Series, because each fold is
# selected below with positional index arrays (X[train_index], y[train_index]).

from sklearn.model_selection import StratifiedKFold, cross_val_score

cv_fold = 10
random_seed = 100

skf = StratifiedKFold(n_splits=cv_fold, shuffle=True, random_state=random_seed)
print(skf)

# Per-fold accuracy on the training part and on the held-out part.
list_train_accuracy = []
list_test_accuracy = []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # insert processing work on train/test data here

    # A fresh classifier is built and fitted for every fold.
    clf = svm.SVC(C=1, kernel='linear').fit(X_train, y_train)

    # append the accuracy to the lists
    list_train_accuracy.append(clf.score(X_train, y_train))
    list_test_accuracy.append(clf.score(X_test, y_test))

# Summarise the held-out accuracies as mean +/- two standard deviations.
test_accuracy = np.asarray(list_test_accuracy)
print("Cross validation score: %0.4f (+/-%0.4f)" % (test_accuracy.mean(), test_accuracy.std() * 2))

StratifiedKFold(n_splits=10, random_state=100, shuffle=True)
Cross validation score: 0.9733 (+/-0.0653)


In [13]:
# method 3: using Pipeline to do data transformation with held out data

# This is similar to method 2 but more concise. It is suited for simple preprocessing.

# Just as a predictor must be tested on data held out from training, preprocessing
# (standardization, feature selection, etc.) and similar data transformations should be
# learnt from the training set only and then applied to the held-out data for prediction.
# Example: learn the standardization from a training split and apply it to the held-out split.

from sklearn import preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
# Fit the scaler on the training rows only so no held-out statistics leak into it.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = svm.SVC(C=1).fit(X_train_transformed, y_train)
# Reuse the already-fitted scaler on the held-out rows (transform only, no refit).
X_test_transformed = scaler.transform(X_test)
# Print the hold-out accuracy explicitly: a bare expression that is not the last
# statement of a notebook cell is evaluated but never displayed.
print("Hold-out accuracy: %0.4f" % clf.score(X_test_transformed, y_test))

# combining with cross validation
# A Pipeline makes it easier to compose estimators: the scaler and the classifier are
# bundled into one estimator, so cross_val_score applies the fit-on-train /
# apply-to-held-out behavior within each cross-validation split.

from sklearn.pipeline import make_pipeline

scaling_step = preprocessing.StandardScaler()
svc_step = svm.SVC(C=1)
clf = make_pipeline(scaling_step, svc_step)

scores = cross_val_score(clf, X, y, cv=cv_fold)
pipeline_mean = scores.mean()
pipeline_spread = scores.std() * 2  # reported as +/- two standard deviations
print("Cross validation score: %0.4f (+/-%0.4f)" % (pipeline_mean, pipeline_spread))


Cross validation score: 0.9667 (+/-0.0894)
