# K-fold Cross Validation
This notebook demonstrates how to do a k-fold cross validation evaluation with a linear regression model.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [3]:
df = pd.DataFrame(df = pd.read_csv('data/spotify_daily_charts_tracks.csv')
df.head())
df['target'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### Test validation splits

Simple holdout split (with shuffle)

In [5]:
X=np.arange(10)
Y=np.power(X,2)
print(X,Y)

[0 1 2 3 4 5 6 7 8 9] [ 0  1  4  9 16 25 36 49 64 81]


In [6]:
x_train, y_train, x_test, y_test = train_test_split(X,Y,test_size = 0.2)
print(x_train, y_train, x_test, y_test)

[1 5 9 4 7 2 0 8] [6 3] [ 1 25 81 16 49  4  0 64] [36  9]


K-fold

In [7]:
kf = KFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    print(x_train, y_train, x_test, y_test)

TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1]
[2 3 4 5 6 7 8 9] [ 4  9 16 25 36 49 64 81] [0 1] [0 1]
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3]
[0 1 4 5 6 7 8 9] [ 0  1 16 25 36 49 64 81] [2 3] [4 9]
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5]
[0 1 2 3 6 7 8 9] [ 0  1  4  9 36 49 64 81] [4 5] [16 25]
TRAIN: [0 1 2 3 4 5 8 9] TEST: [6 7]
[0 1 2 3 4 5 8 9] [ 0  1  4  9 16 25 64 81] [6 7] [36 49]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]
[0 1 2 3 4 5 6 7] [ 0  1  4  9 16 25 36 49] [8 9] [64 81]


### Linear regression with model validation

Simple holdout(with shuffle)

In [8]:
X = boston.data
Y = boston.target

In [16]:
np.shape(X)

(506, 13)

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2)

In [10]:
np.shape(X_train)

(404, 13)

In [11]:
def evaluate(model, X, y):
    predicted = model.predict(X)
    rmse = (np.sqrt(mean_squared_error(y, predicted)))
    r2 = r2_score(y, predicted)
    return rmse, r2


In [12]:
#fit using training data
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
#print(lin_model.coef_)
print('train: R2=%0.2f '% lin_model.score(X_train, Y_train))

#evaluate using test data
print('test: RMSE=%0.2f, R2=%0.2f' % evaluate(lin_model, X_test, Y_test))

train: R2=0.75 
test: RMSE=5.00, R2=0.69


KFold

In [3]:
idx = np.arange(0, len(y))

for j in np.random.randint(0, high=10000, size=10):
    np.random.shuffle(idx)
    kf = KFold(n_splits=5, random_state=None, shuffle=False)

    for train_index, test_index in kf.split(X):
        model = LogisticRegression().fit(X[idx][train], y[idx][train])
        y_score = model.predict_proba(X[idx][test])
        fpr, tpr, _ = roc_curve(y[idx][test], y_score[:, 1])

        plt.plot(fpr, tpr, 'b', alpha=0.05)
        tpr = interp(base_fpr, fpr, tpr)
        tpr[0] = 0.0
        tprs.append(tpr)

NameError: name 'y' is not defined

In [13]:
kf = KFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    #print(np.shape(X_test), np.shape(Y_test))
    
    #fit using training data
    lin_model = LinearRegression()
    lin_model.fit(X_train, Y_train)
    
    #evaluate fit of train data
    print('train: R2=%0.2f '% lin_model.score(X_train, Y_train))

    #evaluate using test data
    print('test: RMSE=%0.2f, R2=%0.2f' % evaluate(lin_model, X_test, Y_test))

train: R2=0.75 
test: RMSE=3.53, R2=0.64
train: R2=0.73 
test: RMSE=5.10, R2=0.71
train: R2=0.69 
test: RMSE=5.75, R2=0.59
train: R2=0.84 
test: RMSE=8.99, R2=0.08
train: R2=0.74 
test: RMSE=5.77, R2=-0.25
