# Cross Validation

Exercises

Within your codeup-data-science directory, create a new repo named advanced-topics. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

Save this work in your advanced-topics repo. Then add, commit, and push your changes.

Do your work for this exercise in a jupyter notebook or python script named cross_validation.

Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

In [22]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Acquire

In [3]:
#load mpg data
df = data('mpg')

In [4]:
df

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [5]:
df.trans.value_counts()

auto(l4)      83
manual(m5)    58
auto(l5)      39
manual(m6)    19
auto(s6)      16
auto(l6)       6
auto(av)       5
auto(s5)       3
auto(s4)       3
auto(l3)       2
Name: trans, dtype: int64

## Prepare

In [6]:
#change trans column to auto and manual
df.trans = np.where(df.trans.str.startswith('auto'), 'auto', 'manual')

In [7]:
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


## Model

In [8]:
#split into X and y
X = df[['displ', 'year', 'cyl', 'cty', 'hwy']]

In [9]:
y = df.trans

In [10]:
#split into trains and tests -- > no need to stratify because already have X, y split and input in func
X_train, X_test, y_train,y_test =  train_test_split(X, y, random_state = 514)

In [13]:
X_train.shape, X_test.shape

((175, 5), (59, 5))

## Grid Search

In [11]:
#import
from sklearn.model_selection import GridSearchCV

# KNN 

In [12]:
knn = KNeighborsClassifier()

Setting up grid search 
- keys are the names of hyperparameters
- values are the range of values to search through

In [13]:
params = {'n_neighbors': range(1,21),
        'p':range(1,4),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [14]:
grid = GridSearchCV(knn, params, cv=5)

In [15]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': range(1, 21), 'p': range(1, 4)})

In [17]:
for params, score in zip (grid.cv_results_['params'], grid.cv_results_['mean_test_score']): 
    params['score'] = score

In [18]:
pd.DataFrame(grid.cv_results_['params']).sort_values('score')

Unnamed: 0,algorithm,n_neighbors,p,score
222,brute,15,1,0.600000
216,brute,13,1,0.605714
60,ball_tree,1,1,0.605714
61,ball_tree,1,2,0.605714
62,ball_tree,1,3,0.605714
...,...,...,...,...
16,auto,6,2,0.685714
125,kd_tree,2,3,0.685714
123,kd_tree,2,1,0.685714
173,kd_tree,18,3,0.685714


In [19]:
knn_model = grid.best_estimator_

In [20]:
knn_model.score(X_test, y_test)

0.6779661016949152

## Decision Tree Classifier

In [46]:
dt = DecisionTreeClassifier(random_state=514)

In [47]:
params = {'max_depth':range(1,21), 'min_samples_leaf': range(1,11), 'criterion': ['gini', 'entropy']}

In [49]:
grid = GridSearchCV(dt, params, cv=5)

In [50]:

grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=514),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [51]:

grid.cv_results_['mean_test_score'][:5]

array([0.65555556, 0.65555556, 0.65555556, 0.65555556, 0.65555556])

In [52]:
for params, score in zip (grid.cv_results_['params'], grid.cv_results_['mean_test_score']): 
    params['score'] = score

In [54]:
pd.DataFrame(grid.cv_results_['params']).sort_values('score')

Unnamed: 0,criterion,max_depth,min_samples_leaf,score
231,entropy,4,2,0.610541
26,gini,3,7,0.618234
230,entropy,4,1,0.618234
27,gini,3,8,0.618234
228,entropy,3,9,0.625356
...,...,...,...,...
52,gini,6,3,0.702564
110,gini,12,1,0.702564
260,entropy,7,1,0.702849
43,gini,5,4,0.710256


In [55]:
dt_model = grid.best_estimator_

In [56]:
dt_model.score(X_test, y_test)

0.6779661016949152

## Random Forest Classifier

In [25]:
rf = RandomForestClassifier(random_state=514)
params = {'max_depth':range(1,11),
          'min_samples_leaf': range(1,5),
          'criterion': ['gini', 'entropy', 'log_loss'],
          'min_samples_split':range(2,5),
          'n_estimators': range(100, 130, 5)}

grid = GridSearchCV(rf, params, cv=5)
grid.fit(X_train, y_train)

grid.cv_results_['mean_test_score'][:5]

for params, score in zip (grid.cv_results_['params'], grid.cv_results_['mean_test_score']): 
    params['score'] = score

pd.DataFrame(grid.cv_results_['params']).sort_values('score')

rf_model = grid.best_estimator_
rf_model.score(X_test, y_test)

3600 fits failed out of a total of 10800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3600 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._di

0.6610169491525424