# Cross-Validation Exercises

__1) Use the cross validation techniques described in the lesson to find the best model for predicting transmission type with the mpg dataset.__

In [1]:
import numpy as np
import pandas as pd
from pydataset import data
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Get the data
mpg = data('mpg')
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [4]:
#Change the transmission column to only have the values 'auto' and 'manual'
mpg.trans = np.where(mpg.trans.str.startswith('auto'), 'auto', 'manual')
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


In [10]:
#Split into X and y groups
X, y = mpg[['displ', 'cyl', 'cty', 'hwy']], mpg.trans

In [11]:
#Create train and test data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

### Decision Tree Grid Search

In [12]:
#Grid search with the decision tree and show the best mean score and best params
clf = DecisionTreeClassifier(random_state = 123)

grid = GridSearchCV(clf, {'max_depth': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=123),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [14]:
#What was the best score and best parameters?
grid.best_score_, grid.best_params_

(0.6733997155049787, {'max_depth': 7, 'min_samples_leaf': 2})

### Random Forest Grid Search

In [15]:
clf = RandomForestClassifier(random_state = 123)

grid = GridSearchCV(clf, {'max_depth': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [16]:
#What was the best score and best parameters
grid.best_score_, grid.best_params_

(0.6577524893314367, {'max_depth': 10, 'min_samples_leaf': 1})

### KNeighbors Grid Search

In [19]:
clf = KNeighborsClassifier()

grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)})

In [20]:
#What was the best score and best parameter?
grid.best_score_, grid.best_params_

(0.6786628733997155, {'n_neighbors': 16})

Based on just the few models I created, I would say that the KNeighborsClassifier with n_neighbors = 16 is the best choice.
***

__2) Use cross validation techniques to determine the best model for predicting survival with the titanic dataset.__

In [21]:
#Get the data
titanic = data('titanic')

In [22]:
titanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [24]:
#Convert with pd.get_dummies
titanic = pd.get_dummies(titanic, drop_first = True)
titanic.head()

Unnamed: 0,class_2nd class,class_3rd class,age_child,sex_women,survived_yes
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
5,0,0,0,0,1


In [25]:
#Split into X and y groups
X, y = titanic.drop('survived_yes', axis = 1), titanic.survived_yes

In [26]:
#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2)

### Decision Tree Grid Search

In [28]:
clf = DecisionTreeClassifier(random_state = 123)

grid = GridSearchCV(clf, {'max_depth': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=123),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [30]:
#What was the best score and best params
grid.best_score_, grid.best_params_

(0.7956307831189348, {'max_depth': 3, 'min_samples_leaf': 1})

### Random Forest Grid Search

In [31]:
clf = RandomForestClassifier(random_state = 123)

grid = GridSearchCV(clf, {'max_depth': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [32]:
#What was the best score and best params
grid.best_score_, grid.best_params_

(0.7984788986684721, {'max_depth': 4, 'min_samples_leaf': 6})

### KNeighbors Grid Search

In [33]:
clf = KNeighborsClassifier()

grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)})

In [34]:
#What was the best score and best params
grid.best_score_, grid.best_params_

(0.7984924396298803, {'n_neighbors': 15})

Based on the few models I tried, I'd say I could use either my best KNeighborsClassifier or best RandomForestClassifier. They had about the same performance.
***

__3) Use cross validation techniques to determine the best model for predicting tip amount with the tips dataset.__

In [36]:
#Get the data
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [37]:
#Create dummies for the cat vars
tips = pd.get_dummies(tips, drop_first = True)
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0,0,0,1,0,0
2,10.34,1.66,3,1,0,0,1,0,0
3,21.01,3.5,3,1,0,0,1,0,0
4,23.68,3.31,2,1,0,0,1,0,0
5,24.59,3.61,4,0,0,0,1,0,0


In [38]:
#Split into X and y groups
X, y = tips.drop('tip', axis = 1), tips.tip

In [39]:
#Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2)

In [40]:
#Scale the X non-binary vars
from sklearn.preprocessing import MinMaxScaler

#Create the scaler
scaler = MinMaxScaler()

#Fit and transform on the X_train vars 'total_bill' and 'size'
X_train[['total_bill', 'size']] = scaler.fit_transform(X_train[['total_bill', 'size']])

#Transform the same columns in the X_test seet
X_test[['total_bill', 'size']] = scaler.transform(X_test[['total_bill', 'size']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


### LinearRegression Cross-Validation

In [41]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

#Since there are no parameters I want to test through, GridSearch is not necessary
#The resulting score will be the average R^2 value
cross_val_score(model, X_train, y_train, cv = 5).mean()

0.23729097801746873

### LassoLars Grid Search

In [42]:
from sklearn.linear_model import LassoLars

model = LassoLars()

grid = GridSearchCV(model, {'alpha': range(1, 21)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LassoLars(), param_grid={'alpha': range(1, 21)})

In [43]:
#What is the best score and best params
grid.best_score_, grid.best_params_

(-0.15600318891272202, {'alpha': 1})

### RandomForestRegressor Grid Search

In [44]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state = 123)

grid = GridSearchCV(model, {'max_depth': range(1,11), 'min_samples_leaf': range(1,11)}, cv = 5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=123),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_leaf': range(1, 11)})

In [45]:
#What is the best score and best params
grid.best_score_, grid.best_params_

(0.26093455992182796, {'max_depth': 3, 'min_samples_leaf': 1})

Based on the models above, the best one was the RandomForestRegressor with max_depth = 3 and min_samples_leaf = 1.