## The purpose of this exercise is to learn and practice the standard machine learning process
- The features used are the ones chosen in titanic_select_feautres.ipynb
- GridSearchCV is used to test which classifier performs best

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import zipfile
from pathlib import Path
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomTreesEmbedding, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, precision_recall_curve, auc, make_scorer, average_precision_score, roc_auc_score, f1_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Load the raw CSV files from the zip archive and combine them into one frame
def _read_csv_member(archive, member):
    """Read one CSV member of an open zip archive and close its handle afterwards."""
    with archive.open(member) as handle:
        return pd.read_csv(handle)


root = Path(".")
with zipfile.ZipFile(root/"titanic.zip") as z:
    names = z.namelist()
    # NOTE(review): members are selected by their position in the archive listing;
    # presumably names[2] is the training set, names[1] the test features and
    # names[0] the test labels -- confirm against the archive contents.
    df_train = _read_csv_member(z, names[2])
    x_df_test = _read_csv_member(z, names[1])
    y_df_test = _read_csv_member(z, names[0])
    # PassengerId is already present in x_df_test; drop it to avoid a duplicate column
    y_df_test = y_df_test.drop(columns=['PassengerId'])

# Attach the label column(s) to the test features
df_test = pd.concat([x_df_test, y_df_test], axis=1)
# Stack train and test rows; ignore_index gives every row a unique label instead of
# repeating 0..n for both parts, so label-based operations stay unambiguous.
df_data = pd.concat([df_train, df_test], axis=0, ignore_index=True)


### Remove the unused variables

In [4]:
# Drop the columns whose values vary too much to provide useful information
drop_var = ['PassengerId', "Name", "Ticket", "Cabin"]
df_data = df_data.drop(columns=drop_var)

### Fill the missing values

In [5]:
# Mean age per sex, used as the replacement value for missing ages
dict_fill_age = df_data.groupby("Sex")["Age"].mean().to_dict()

# Rows whose age is missing; filling one sex never touches rows of another sex,
# so the mask can be computed once up front.
age_is_missing = df_data["Age"].isna()

for gender in dict_fill_age:
    rows_to_fill = df_data["Sex"].eq(gender) & age_is_missing
    df_data.loc[rows_to_fill, "Age"] = dict_fill_age[gender]


In [6]:
# Replace missing fares with the median fare
fare_median = df_data["Fare"].median()
df_data["Fare"] = df_data["Fare"].fillna(fare_median)

In [7]:
# Replace missing embarkation ports with the most frequent port
SI = SimpleImputer(strategy="most_frequent")
# fit_transform returns a 2-D array with a single column; flatten it so that a plain
# 1-D column is assigned instead of relying on pandas accepting a column matrix.
df_data["Embarked"] = SI.fit_transform(df_data[["Embarked"]]).ravel()

### Encode the categorical variables to numeric variables

In [8]:
# Binary-encode sex: male -> 0, female -> 1
sex_codes = {"male": 0, "female": 1}
df_data["Sex_encoded"] = df_data["Sex"].map(sex_codes).astype('int8')


In [9]:
# Integer-encode the embarkation port
label_encoder = LabelEncoder()
embarked_codes = label_encoder.fit_transform(df_data["Embarked"])
df_data["Embarked_encoded"] = embarked_codes.astype('int8')

### Try different classifiers and find the best set of parameters for the classifier

In [19]:
# Preprocessing step shared by every pipeline, plus experiment-wide settings
scaler = StandardScaler()

use_cols = ['Pclass', 'Age', 'Sex_encoded']
random_seed = 42

# Display name of each candidate classifier.
# est_names, est_list and est_params are parallel lists: entry i of each one
# describes the same classifier.
est_names = [
    'Nearest Neighbours',
    'SVM',
    'Guassian Process',
    'Decision Tree',
    'Random Forest',
    'Neural Net',
    'AdaBoost',
    'QDA',
]

# Candidate classifiers with their starting configuration
est_list = [
    KNeighborsClassifier(n_neighbors=10, weights='distance'),
    SVC(gamma=1, C=1, random_state=random_seed, kernel='linear'),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=random_seed),
    DecisionTreeClassifier(max_depth=5, random_state=random_seed),
    RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1, random_state=random_seed),
    MLPClassifier(alpha=1, max_iter=1000, random_state=random_seed),
    AdaBoostClassifier(algorithm="SAMME", random_state=random_seed),
    QuadraticDiscriminantAnalysis(),
]

# Hyper-parameter grid for each classifier; the 'est__' prefix targets the
# pipeline step named 'est'.
est_params = [
    # Nearest Neighbours
    {
        'est__n_neighbors': list(range(5, 26)),
        'est__weights': ['uniform', 'distance'],
    },
    # SVM
    {
        'est__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'est__gamma': ['scale', 'auto', 1, 2, 3],
        'est__C': list(range(1, 11)),
    },
    # Gaussian process
    {
        'est__n_restarts_optimizer': list(range(1, 4)),
        'est__multi_class': ['one_vs_rest', 'one_vs_one'],
    },
    # Decision tree
    {
        'est__criterion': ['gini', 'entropy', 'log_loss'],
        'est__splitter': ['best', 'random'],
        'est__max_depth': list(range(5, 11)),
    },
    # Random forest
    {
        'est__criterion': ['gini', 'entropy', 'log_loss'],
        'est__max_depth': list(range(5, 11)),
        'est__bootstrap': [True, False],
        'est__class_weight': ['balanced', 'balanced_subsample'],
    },
    # Neural net
    {
        'est__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'est__solver': ['lbfgs', 'sgd', 'adam'],
        'est__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    # AdaBoost
    {
        'est__algorithm': ['SAMME', 'SAMME.R'],
    },
    # QDA
    {
        'est__store_covariance': [True, False],
    },
]

# Metrics evaluated for every parameter combination (higher is better for all of them)
scoring = {
    "recall": make_scorer(recall_score),
    "precision": make_scorer(precision_score),
    "accuracy": make_scorer(accuracy_score),
    "F1 score": make_scorer(f1_score),
}


In [20]:
# Split the preprocessed frame back into its training rows and its test rows
n_train_rows = df_train.shape[0]
df_train = df_data.iloc[:n_train_rows]
df_test = df_data.iloc[n_train_rows:]

# Containers for the grid-search results, one entry per estimator
est_grid, est_best_index, est_best_score, est_best_params, est_best_est = [], [], [], [], []

In [21]:

# Run a 10-fold grid search for every candidate classifier and collect the results.

# Start from empty containers so that re-running this cell does not append a second
# copy of every result to lists already filled by an earlier run.
est_grid = []
est_best_index = []
est_best_score = []
est_best_params = []
est_best_est = []

# zip() silently stops at the shortest input, so make a length mismatch loud
assert len(est_names) == len(est_list) == len(est_params), \
    "est_names, est_list and est_params must have the same length"

for name, est, params in zip(est_names, est_list, est_params):
    # Scale the features, then fit the candidate classifier
    pipe = Pipeline(steps=[('scaler', scaler), ('est', est)])

    # All metrics in `scoring` are computed; the best parameter set is chosen by
    # (and the final model refit on) the 'F1 score' metric.
    grid = GridSearchCV(pipe,
                        cv=10,
                        param_grid=params,
                        scoring=scoring,
                        refit='F1 score',
                        n_jobs=-1
                        )
    print(name, est, params, sep=',')

    grid.fit(df_train[use_cols], df_train["Survived"])

    est_grid.append(grid)
    est_best_index.append(grid.best_index_)
    est_best_score.append(grid.best_score_)
    est_best_params.append(grid.best_params_)
    est_best_est.append(grid.best_estimator_)

Nearest Neighbours,KNeighborsClassifier(n_neighbors=10, weights='distance'),{'est__n_neighbors': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], 'est__weights': ['uniform', 'distance']}
SVM,SVC(C=1, gamma=1, kernel='linear', random_state=42),{'est__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'est__gamma': ['scale', 'auto', 1, 2, 3], 'est__C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
Guassian Process,GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1), random_state=42),{'est__n_restarts_optimizer': [1, 2, 3], 'est__multi_class': ['one_vs_rest', 'one_vs_one']}
Decision Tree,DecisionTreeClassifier(max_depth=5, random_state=42),{'est__criterion': ['gini', 'entropy', 'log_loss'], 'est__splitter': ['best', 'random'], 'est__max_depth': [5, 6, 7, 8, 9, 10]}
Random Forest,RandomForestClassifier(max_depth=5, max_features=1, random_state=42),{'est__criterion': ['gini', 'entropy', 'log_loss'], 'est__max_depth': [5, 6, 7, 8, 9, 10], 'est__bootstrap': [True, False

In [31]:
# Best hyper-parameter combination found for each estimator, in est_names order
est_best_params

[{'est__n_neighbors': 11, 'est__weights': 'distance'},
 {'est__C': 1, 'est__gamma': 'scale', 'est__kernel': 'poly'},
 {'est__multi_class': 'one_vs_rest', 'est__n_restarts_optimizer': 1},
 {'est__criterion': 'gini', 'est__max_depth': 10, 'est__splitter': 'random'},
 {'est__bootstrap': True,
  'est__class_weight': 'balanced_subsample',
  'est__criterion': 'entropy',
  'est__max_depth': 7},
 {'est__activation': 'identity',
  'est__learning_rate': 'constant',
  'est__solver': 'lbfgs'},
 {'est__algorithm': 'SAMME.R'},
 {'est__store_covariance': True}]

In [30]:
# Best pipeline (scaler + classifier) found for each estimator, in est_names order
est_best_est

[Pipeline(steps=[('scaler', StandardScaler()),
                 ('est',
                  KNeighborsClassifier(n_neighbors=11, weights='distance'))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('est', SVC(C=1, kernel='poly', random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('est',
                  GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1),
                                            n_restarts_optimizer=1,
                                            random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('est',
                  DecisionTreeClassifier(max_depth=10, random_state=42,
                                         splitter='random'))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('est',
                  RandomForestClassifier(class_weight='balanced_subsample',
                                         criterion='entropy', max_depth=7,
                 

In [32]:
# best_score_ of each grid search, in est_names order; the searches refit on
# 'F1 score', so this is the mean cross-validated F1 of the best parameter set
est_best_score

[0.7555308182670435,
 0.7322884180113082,
 0.7086026208820875,
 0.7409504993718342,
 0.7484759816823743,
 0.7242560435674927,
 0.7463696476954451,
 0.7291603927392865]