# Cross Validation

In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [3]:
df = pd.read_csv('./tgz/used_cars.csv.gz').set_index('Id')
df.head()

Unnamed: 0_level_0,Price,Year,Mileage,City,State,Vin,Make,Model
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297899 entries, 1 to 297899
Data columns (total 8 columns):
Price      297899 non-null int64
Year       297899 non-null int64
Mileage    297899 non-null int64
City       297899 non-null object
State      297899 non-null object
Vin        297899 non-null object
Make       297899 non-null object
Model      297899 non-null object
dtypes: int64(3), object(5)
memory usage: 20.5+ MB


In [4]:
train, test = train_test_split(df)

In [5]:
X, y = train[['Year', 'Mileage']], train.Price

## Basic Cross-Validation

- gives us the score for every split

In [6]:
lr = LinearRegression()
cross_val_score(lr, X, y, cv=4) # r^2 by default

array([0.20751247, 0.21583148, 0.22548613, 0.22162317])

In [7]:
tree = DecisionTreeRegressor(max_depth=4)
cross_val_score(tree, X, y, cv=4).mean()

0.2604244225869469

In [8]:
tree = DecisionTreeRegressor(max_depth=2)
cross_val_score(tree, X, y, cv=4).mean()

0.22705324323403636

In [9]:
# to use mean squared error instead...
from sklearn.metrics import mean_squared_error, make_scorer

cross_val_score(lr, X, y, cv=4, scoring=make_scorer(mean_squared_error))

array([1.62482997e+08, 1.55911708e+08, 1.44789862e+08, 1.49293054e+08])

## Grid Search

- an algorithm
- a "grid" of params to search through

In [22]:
tree = DecisionTreeRegressor()
params = {
    'max_depth': range(1, 15),
    'splitter': ['best', 'random']
}

In [23]:
grid = GridSearchCV(tree, params, cv=4)
# grid = GridSearchCV(tree, params, cv=4, scoring=make_scorer(mean_squared_error))
grid.fit(X, y)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': range(1, 15),
                         'splitter': ['best', 'random']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None,

In [24]:
results = grid.cv_results_
results

{'mean_fit_time': array([0.04189688, 0.01050991, 0.05853271, 0.01511997, 0.0870319 ,
        0.01603061, 0.11395401, 0.01903129, 0.13717932, 0.02224344,
        0.15122372, 0.02694702, 0.16785097, 0.02650726, 0.18501192,
        0.02945262, 0.20130765, 0.03160107, 0.21934074, 0.03559512,
        0.23049343, 0.03740174, 0.24664968, 0.03795892, 0.26367325,
        0.04044104, 0.28259891, 0.04197341]),
 'std_fit_time': array([0.00697935, 0.00048194, 0.00140922, 0.00135897, 0.0047033 ,
        0.00120929, 0.01950715, 0.00073587, 0.01233946, 0.00109938,
        0.00702504, 0.00353632, 0.00442943, 0.00132452, 0.00671152,
        0.00154139, 0.00474861, 0.00362929, 0.00643131, 0.00232031,
        0.00189641, 0.00385487, 0.00190502, 0.00090947, 0.003815  ,
        0.00241173, 0.00241974, 0.00127898]),
 'mean_score_time': array([0.00328231, 0.00238502, 0.00265676, 0.00275332, 0.00325882,
        0.00294465, 0.00350428, 0.00302428, 0.00386047, 0.0034017 ,
        0.00445706, 0.00375408, 0.004622

In [26]:
for params, score in zip(results['params'], results['mean_test_score']):
    params['score'] = score
    
results_df = pd.DataFrame(results['params'])
results_df['rank_test_score'] = results['rank_test_score']

In [27]:
results_df

Unnamed: 0,max_depth,score,splitter,rank_test_score
0,1,0.153751,best,25
1,1,0.022152,random,28
2,2,0.227053,best,15
3,2,0.16425,random,24
4,3,0.249025,best,8
5,3,0.132794,random,27
6,4,0.260424,best,4
7,4,0.187557,random,22
8,5,0.265601,best,2
9,5,0.209758,random,20
