# Entry 19 notebook - Implementing Cross-validation

In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import category_encoders as ce

### Custom functions

In [10]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

In [13]:
df_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', header=None, sep='\t')

In [14]:
cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_yr', 'origin']
df_raw = df_raw[0].str.split(expand=True)
df_raw.columns = cols
df_raw.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_yr,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


In [15]:
X_train, X_test, y_train, y_test = split_data(df_raw, 'mpg', 0.8)

In [27]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=12)
linreg = LinearRegression()
cv_results = cross_validate(linreg, X_train, y_train, cv=kfolds)
cv_r2_results = cross_validate(linreg, X_train, y_train, cv=kfolds, scoring='r2')

In [28]:
cv_results

{'fit_time': array([0.00379491, 0.00253606, 0.00414109, 0.00340629, 0.00509095]),
 'score_time': array([0.00229406, 0.00261688, 0.00254798, 0.00268078, 0.00295305]),
 'test_score': array([0.69084141, 0.81115171, 0.92215763, 0.71742051, 0.84694778])}

In [29]:
cv_r2_results

{'fit_time': array([0.00241303, 0.002599  , 0.00226474, 0.00228596, 0.00258422]),
 'score_time': array([0.00114703, 0.00218892, 0.00115824, 0.00121617, 0.00095105]),
 'test_score': array([0.69084141, 0.81115171, 0.92215763, 0.71742051, 0.84694778])}