<a href="https://colab.research.google.com/github/james-monahan/Code-school-notebooks/blob/main/Week-12-ml-auto-pca/Grid_Search_Cross_Validation_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic Data

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report

# Titanic data set, read straight from its raw GitHub URL.
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/titanic.csv"
df = pd.read_csv(link)

# Replace the numeric survival flag with a readable label:
# exactly 1 -> "Survived", any other value -> "Dead".
is_survivor = df['Survived'] == 1
df['Survived'] = np.where(is_survivor, "Survived", "Dead")
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,Dead,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,Survived,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,Survived,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,Survived,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,Dead,3,Mr. William Henry Allen,male,35.0,0,0,8.05


# Data preparation

What is the type (dtype) of each column? Are there any non-numeric columns?

In [None]:
# Lookup tables that turn the two text columns into 0/1 codes.
sexes = dict(male=0, female=1)
survival = dict(Dead=0, Survived=1)

# Series.map yields NaN for any value that is not a key of the lookup table.
df['Sex'] = df['Sex'].map(sexes)
df['Survived'] = df['Survived'].map(survival)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Feature matrix: every column except the target and the free-text passenger name.
# NOTE(review): 'Unnamed: 0' looks like the row index that was saved into the CSV
# (it reads 0, 1, 2, ... in df.head()) — confirm. It is dropped here so the model
# cannot use row order as a predictor; errors='ignore' keeps the cell working when
# the column is absent.
X = df.drop(columns=['Survived', 'Name']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['Survived']

# Hold out a third of the rows, stratified on the target. The fixed random_state
# makes the split, and therefore the scores printed below, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

LM = LogisticRegression()

# fit() returns the estimator itself, so `model` and `LM` are the same fitted object.
model = LM.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the rows used for fitting vs. accuracy on the held-out rows.
print("train data:",model.score(X_train, y_train))
print("test data:",model.score(X_test, y_test))

train data: 0.8097643097643098
test data: 0.8088737201365188


# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Score a fresh default logistic regression with 6-fold cross-validation on the
# full data set; `cvs` holds one score per fold.
cvs = cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=6)
cvs

array([0.77702703, 0.80405405, 0.79054054, 0.78378378, 0.7972973 ,
       0.81632653])

In [None]:
# Spread of the fold scores: mean, variance and standard deviation
# (both computed with numpy's default ddof=0).
cvs.mean(), cvs.var(), cvs.std()

(0.7948382055524913, 0.00016843922589135775, 0.01297841384343086)

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Base estimator shared by the hyperparameter searches below.
# A fixed random_state is needed for repeatable results: the tree permutes the
# candidate features at every split, so equally good splits can otherwise be
# resolved differently from one run to the next.
DTF = DecisionTreeClassifier(random_state=42)

In [None]:
# Call the method (note the parentheses) to display the hyperparameter dict;
# without the call the cell only shows the bound-method repr.
DTF.get_params()

<bound method BaseEstimator.get_params of DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')>

In [None]:
# Grid-search space: only the tree depth and the minimum leaf size are tuned.
# Hyperparameters that are not listed here are left as configured on the estimator.
params = dict(
    max_depth=[2, 3, 5, 10],
    min_samples_leaf=[1, 3, 5, 10],
)

In [None]:
# Cross-validate every max_depth / min_samples_leaf pair from `params` on the full data set.
# fit() returns the search object itself, so `clf` is the fitted GridSearchCV.
clf = GridSearchCV(DTF, params).fit(X, y)
clf

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [2, 3, 5, 10],
                      

With this search via GridSearch, what are the best values of the hyperparameters if we want to maximize the accuracy score?  

In [None]:
# Best mean cross-validated score found by the grid search, then the
# hyperparameter combination that produced it (one per line).
print(clf.best_score_, clf.best_params_, sep="\n")

0.829772106900273
{'max_depth': 5, 'min_samples_leaf': 3}



With this same classifier, run a randomized search (`RandomizedSearchCV`) over the `max_depth` and `min_samples_leaf` parameters.  
With this randomized search, what are the best values of the hyperparameters if you want to maximize the accuracy score?

In [None]:
# Randomized-search space: both hyperparameters are drawn from the integers 1..99.
r_params = dict(
    max_depth=range(1, 100),
    min_samples_leaf=range(1, 100),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search over `r_params`: only a random sample of the candidate
# combinations is evaluated (n_iter is left at its default), so the outcome depends
# on which candidates are drawn. The fixed random_state pins that draw, making the
# reported best score and parameters repeatable across runs.
r_clf = RandomizedSearchCV(
    estimator=DTF,
    param_distributions=r_params,
    random_state=42,
)
r_clf.fit(X,y)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
       

In [None]:
# Report the randomized search's best mean cross-validated score and the
# hyperparameters that achieved it, one labelled line each.
for label, value in (
    ("best score:", r_clf.best_score_),
    ("best parameters:", r_clf.best_params_),
):
    print(label, value)

best score: 0.8208087348441568
best parameters: {'min_samples_leaf': 10, 'max_depth': 21}
