# Lecture 26 – Data 100, Summer 2020

Guest appearance by Josh Hug

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

# Notebook-wide matplotlib defaults: figure size, resolution, and line width.
plt.rcParams.update({
    'figure.figsize': (4, 4),
    'figure.dpi': 150,
    'lines.linewidth': 3,
})
# Then apply seaborn's default styling.
sns.set()

In [None]:
# Load the Titanic passenger data and keep only the columns used below:
# four features plus the `survived` target.
df = sns.load_dataset("titanic")
df = df[["age", "fare", "pclass", "sex", "survived"]]
# Drop every row that is missing a value in any of the selected columns.
df = df.dropna()
# Encode sex numerically (male -> 0, female -> 1) in one pass. A dict `map`
# avoids the intermediate mixed str/int column that two chained `replace`
# calls produce, and does not depend on pandas silently downcasting the
# resulting object column to integers.
df['sex'] = df['sex'].map({"male": 0, "female": 1})
# Seed NumPy's global RNG. Nothing in this cell draws random numbers (the rows
# are not shuffled); presumably this is for reproducibility of later cells.
np.random.seed(23)
# First 600 rows are the training set, the remainder is the test set.
# Positional slicing yields the same split as `np.split(df, [600])` without
# routing a DataFrame through NumPy's array-splitting machinery.
df_train, df_test = df.iloc[:600], df.iloc[600:]

In [None]:
# Display the training split.
df_train

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C = 0.01, fit on the four passenger features to
# predict `survived`.
lr_model = LogisticRegression(C=0.01)
lr_model.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the same rows the model was trained on and report the fraction
# of correct predictions (training accuracy).
y_hat = lr_model.predict(df_train.loc[:, ["age", "fare", "pclass", "sex"]])
accuracy_score(y_true=df_train.loc[:, "survived"], y_pred=y_hat)

In [None]:
from sklearn.linear_model import LogisticRegression

# Repeat the experiment with C = 10: refit, predict on the training rows, and
# report training accuracy for comparison with the C = 0.01 model.
lr_model = LogisticRegression(C=10)
lr_model.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
y_hat = lr_model.predict(df_train.loc[:, ["age", "fare", "pclass", "sex"]])
accuracy_score(y_true=df_train.loc[:, "survived"], y_pred=y_hat)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Choose C for a plain logistic regression by grid search with cv=10.
lr_model = LogisticRegression()
parameters = {
    'C': [0.01, 0.1, 1, 10, 100],
}

lr_model_finder = GridSearchCV(estimator=lr_model, param_grid=parameters, cv=10)
lr_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
# Winning parameter setting, then its cross-validation score, one per line.
print(lr_model_finder.best_params_, lr_model_finder.best_score_, sep="\n")

We can also see all of the cross-validation scores with `cv_results_`.

In [None]:
# Display the full cross-validation results recorded by the grid search above.
lr_model_finder.cv_results_

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
# Three-step pipeline: standardize the features, expand them into polynomial
# terms, then fit a logistic regression on the expanded features.
scaled_polynomial_logistic_regression_model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('model', LogisticRegression()),
])

# Keys of the form `<step>__<param>` address a parameter of the named step.
parameters = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'poly__degree': [1, 2],
}

lr_scaled_poly_model_finder = GridSearchCV(
    estimator=scaled_polynomial_logistic_regression_model,
    param_grid=parameters,
    cv=10,
)
lr_scaled_poly_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
print(lr_scaled_poly_model_finder.best_params_)
print(f"best accuracy: {lr_scaled_poly_model_finder.best_score_}")

In [None]:
# Same scale -> polynomial -> logistic-regression pipeline, but the final
# model uses an L1 penalty with the liblinear solver.
scaled_polynomial_logistic_regression_model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('model', LogisticRegression(penalty='l1', solver='liblinear')),
])

# Keys of the form `<step>__<param>` address a parameter of the named step.
parameters = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'poly__degree': [1, 2],
}

lr_scaled_poly_model_finder = GridSearchCV(
    estimator=scaled_polynomial_logistic_regression_model,
    param_grid=parameters,
    cv=10,
)
lr_scaled_poly_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
print(lr_scaled_poly_model_finder.best_params_)
print(f"best accuracy: {lr_scaled_poly_model_finder.best_score_}")

In [None]:
# Scale -> polynomial -> logistic-regression pipeline again, this time with
# candidates ranked by precision rather than the default score.
scaled_polynomial_logistic_regression_model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('model', LogisticRegression()),
])

# Keys of the form `<step>__<param>` address a parameter of the named step.
parameters = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'poly__degree': [1, 2],
}

lr_scaled_poly_model_finder = GridSearchCV(
    estimator=scaled_polynomial_logistic_regression_model,
    param_grid=parameters,
    cv=10,
    scoring="precision",
)
lr_scaled_poly_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
print(lr_scaled_poly_model_finder.best_params_)
print(f"precision: {lr_scaled_poly_model_finder.best_score_}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Tune a decision tree's `min_impurity_decrease` by grid search with cv=10.
decision_tree = DecisionTreeClassifier()
parameters = {
    'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1],
}

dt_model_finder = GridSearchCV(estimator=decision_tree, param_grid=parameters, cv=10)
dt_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
print(dt_model_finder.best_params_)
print(f"accuracy: {dt_model_finder.best_score_}")

In [None]:
from sklearn.svm import SVC

# Tune a support vector classifier over kernel type and C (cv=10).
svm_model = SVC()
parameters = {
    'kernel': ["linear", "rbf"],
    'C': [0.1, 1],
}

svm_model_finder = GridSearchCV(estimator=svm_model, param_grid=parameters, cv=10)
svm_model_finder.fit(
    df_train.loc[:, ["age", "fare", "pclass", "sex"]],
    df_train.loc[:, "survived"],
)
print(svm_model_finder.best_params_)
print(f"accuracy: {svm_model_finder.best_score_}")

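We can even compare multiple model types by using two dictionaries keyed by model name: one mapping each name to a model, and one mapping each name to a dictionary of the hyperparameter values to search.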

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate models, keyed by a display name. Logistic regression is wrapped in
# a pipeline so its inputs are standardized first.
all_models = {
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': Pipeline(steps=[
        ('scale', StandardScaler()),
        ('model', LogisticRegression()),
    ]),
    'KNearestNeighbors': KNeighborsClassifier(),
}

# Parameter grid for each candidate, keyed by the same names as `all_models`.
all_params = {
    'RandomForest': {
        "n_estimators": [50, 100, 200],
        'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1],
    },
    'DecisionTree': {
        'min_impurity_decrease': [0, 0.01, 0.02, 0.05, 0.1],
    },
    'LogisticRegression': {
        # 100 values of C spaced evenly in log10 from 1e-7 to 1e5.
        'model__C': 10 ** np.linspace(-7, 5, 100),
    },
    'KNearestNeighbors': {
        'n_neighbors': [3, 4, 5, 8, 10],
    },
}

In [None]:
# Grid-search every candidate model over its own parameter grid (cv=10,
# default scoring) and print the winning estimator and its score.
for name, model in all_models.items():
    params = all_params[name]
    gscv = GridSearchCV(model, params, cv=10)
    gscv.fit(df_train[["fare", "age", "pclass", "sex"]], df_train["survived"])
    print(f"best parameters are: {gscv.best_estimator_}")
    print(f"accuracy is: {gscv.best_score_}")

Above, we see the Random Forest model does slightly better than a decision tree or logistic regression model, with average cross-validation accuracy of 0.805.

In [None]:
# Grid-search every candidate model over its own parameter grid (cv=10),
# this time ranking candidates by precision, and print each winner.
for name, model in all_models.items():
    params = all_params[name]
    gscv = GridSearchCV(model, params, cv=10, scoring="precision")
    gscv.fit(df_train[["fare", "age", "pclass", "sex"]], df_train["survived"])
    print(f"best parameters are: {gscv.best_estimator_}")
    print(f"precision is: {gscv.best_score_}")