# Cross Validation

In [2]:
import pandas as pd

# Read the candy dataset from the project-relative datasets/ folder.
candy = pd.read_csv(filepath_or_buffer='datasets/candy-data.csv')

# Preview the first five rows (the default for head()).
candy.head(n=5)

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [3]:
# Feature matrix: every column except the candy name (an identifier) and the
# target. 'Unnamed: 0' just counts 0, 1, 2, ... in candy.head(), i.e. it appears
# to be a row index that was written into the CSV; it describes the row order,
# not the candy, so it must not be fed to the models as a predictor.
# errors='ignore' keeps this cell working if the data is loaded without that
# column, while the first drop still fails loudly on a missing real column.
X = (
    candy
    .drop(columns=['competitorname', 'winpercent'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Target: the winpercent column.
y = candy['winpercent']

In [4]:
from sklearn.model_selection import KFold

# Five shuffled folds; the fixed random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# kf.split(X) yields one (training indices, validation indices) pair per fold.
splits = kf.split(X)

# Report how many rows land on each side of every fold.
for train_index, val_index in splits:
    n_train, n_val = len(train_index), len(val_index)
    print("Number of training indices: %s" % n_train)
    print("Number of validation indices: %s" % n_val)

Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17
Number of training indices: 68
Number of validation indices: 17


In [5]:
from sklearn.model_selection import KFold

# Five shuffled folds; the fixed random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# Create splits -- generate the row indices of the training and validation sets.
splits = kf.split(X)

# Print the indices themselves (not their count) for each fold, so the labels
# describe what is actually shown.
for train_index, val_index in splits:
    # One-element tuples make it explicit that the whole array is a single %s argument.
    print("Training indices: %s" % (train_index,))
    print("Validation indices: %s" % (val_index,))

Number of training indices: [ 0  2  3  4  5  7  8  9 10 11 12 13 14 17 18 20 21 22 23 24 25 27 28 29
 31 32 33 34 35 36 37 39 40 42 44 45 46 47 48 49 50 51 52 53 54 55 56 57
 60 61 62 63 64 67 68 70 71 73 74 75 76 77 78 79 80 81 82 83]
Number of validation indices: [ 1  6 15 16 19 26 30 38 41 43 58 59 65 66 69 72 84]
Number of training indices: [ 0  1  2  4  5  6  8  9 11 12 14 15 16 17 18 19 20 21 22 23 24 25 26 28
 29 30 31 32 33 34 36 38 40 41 42 43 44 45 48 50 51 52 53 54 55 56 58 59
 60 61 62 63 64 65 66 67 68 69 70 71 72 73 75 78 80 81 82 84]
Number of validation indices: [ 3  7 10 13 27 35 37 39 46 47 49 57 74 76 77 79 83]
Number of training indices: [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 26 27 28 30 32 34 35 36 37 38 39 40 41 42 43 45 46 47 49 50 51 52 54 55
 57 58 59 61 62 64 65 66 69 71 72 73 74 76 77 79 81 82 83 84]
Number of validation indices: [ 2 25 29 31 33 44 48 53 56 60 63 67 68 70 75 78 80]
Number of training indices: [ 1  2  3  6  7

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# A 25-tree random forest regressor with a fixed seed for repeatable fits.
rfc = RandomForestRegressor(n_estimators=25, random_state=1111)

# Fresh fold iterator from the KFold splitter (kf) created in an earlier cell.
splits = kf.split(X)

# Manual cross-validation: fit on each fold's training rows, score on its validation rows.
for train_index, val_index in splits:
    # Positional row selection for features and target
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit on this fold's training rows
    rfc.fit(X_train, y_train)

    # Predict the held-out rows and report the mean squared error (an error
    # metric for regression, not an accuracy).
    predictions = rfc.predict(X_val)
    split_mse = mean_squared_error(y_val, predictions)
    print("Split MSE: " + str(split_mse))

Split MSE: 150.99298148707666
Split MSE: 171.22206240542593
Split MSE: 131.72569156195593
Split MSE: 80.61940183841385
Split MSE: 221.63020627476214


# Sklearn's cross_val_score()

In [7]:
# Instruction 1: Load the cross-validation method
from sklearn.model_selection import cross_val_score

# Instruction 2: Load the random forest regression model
from sklearn.ensemble import RandomForestRegressor

# Instruction 3: Load the mean squared error method
# Instruction 4: Load the function for creating a scorer
from sklearn.metrics import mean_squared_error, make_scorer

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the cross-validation below only uses the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model to evaluate, plus a scorer object wrapping mean_squared_error.
rfc = RandomForestRegressor(n_estimators=25, random_state=1111)
mse = make_scorer(mean_squared_error)

# 10-fold cross-validation on the training portion; returns one score per fold.
cv = cross_val_score(rfc, X_train, y_train, scoring=mse, cv=10)

# Average of the per-fold scores
print(cv.mean())

125.08498113726314


In [9]:
# Display the individual scores returned by cross_val_score, one per fold.
cv

array([114.95580647, 156.06194645, 148.50119665,  95.84285862,
       126.72936664,  59.27083503, 109.66617657, 102.84197154,
       137.89116868, 199.08848472])

# Leave-one-out cross-validation (LOOCV)

![image-7](image-7.png)

- The name says it all. Leave-one-out cross-validation (LOOCV) is KFold cross-validation where k is equal to n, the number of observations in the data. This means that every single point will be used as a validation set, completely by itself. For the first model, we use all of the data for training except the first point, which is used for validation. In the second model we leave out only the second data point, in the third model the third data point, and so on. In total we create n models, one for each of the n observations in the data.

In [10]:
import numpy as np
from sklearn.metrics import mean_absolute_error, make_scorer

# Scorer object wrapping mean absolute error
mae_scorer = make_scorer(mean_absolute_error)

# 15-tree random forest regressor, seeded for repeatable fits
rfr = RandomForestRegressor(n_estimators=15, random_state=1111)

# LOOCV: request as many folds as there are rows, so each validation fold
# holds exactly one observation.
n_observations = X.shape[0]
scores = cross_val_score(rfr, X=X, y=y, cv=n_observations, scoring=mae_scorer)

# Summarise the per-observation errors
mean_error = np.mean(scores)
std_error = np.std(scores)
print("The mean of the errors is: %s." % mean_error)
print("The standard deviation of the errors is: %s." % std_error)

The mean of the errors is: 9.52044832324183.
The standard deviation of the errors is: 7.349020637882744.
