# Imports

In [31]:
import pandas as pd
import numpy as np

import itertools

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

style.use('fivethirtyeight')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier

%matplotlib inline

# Data

In [2]:
# Load the raw training table from train.csv in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Normalise every column header to lower case so later cells can use
# attribute access (train.sex, train.embarked, ...) with lower-case names.
train.columns = train.columns.map(str.lower)

# Encode Columns

In [4]:
# Encode sex as a binary indicator (male -> 1, female -> 0).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the attribute-accessed Series: that chained
# in-place form is not guaranteed to write through to the DataFrame (it warns
# on recent pandas and is a no-op under Copy-on-Write).
train['sex'] = train['sex'].replace({
    'male' : 1,
    'female' : 0
})

# Number of missing fields per row, counted over the columns present at this
# point (i.e. before the dummy columns below are added).
train['missing_cnt'] = train.isnull().sum(axis=1)

# Boolean flag: True when the row has at least one missing field.
train['missing_any'] = train['missing_cnt'] > 0

# One-hot encode the port of embarkation as embarked_* columns.
train = train.join(pd.get_dummies(train['embarked'], prefix='embarked'))

# One-hot encode the passenger class as pclass_* columns.
train = train.join(pd.get_dummies(train['pclass'], prefix='pclass'))

# Train-Test Split

In [5]:
# Columns left out of the feature matrix: the target itself plus the raw
# columns that are not fed to the models in this notebook.
dropped_columns = [
    'passengerid',
    'survived',
    'pclass',
    'name',
    'age',
    'ticket',
    'cabin',
    'embarked',
]

features = train.drop(columns=dropped_columns)
target = train['survived']

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable
# between runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    features, target, test_size=0.1, random_state=0
)

# Scale

In [6]:
# Learn the min-max ranges from the training features only, then apply those
# same fitted ranges to both splits so the held-out rows do not influence
# the scaling.
scaler = MinMaxScaler().fit(train_X)

train_X, test_X = scaler.transform(train_X), scaler.transform(test_X)

# Logistic Regression

In [25]:
# Hyper-parameter grid for logistic regression; every combination is scored
# with 10-fold cross-validation on the scaled training split.
log_param_grid = {
    # NOTE(review): 'none' is the spelling this notebook was written against;
    # newer scikit-learn releases expect None instead -- confirm against the
    # installed version before re-running.
    'penalty' : ['l2', 'none'],
    'fit_intercept' : [True, False],
    'class_weight' : ['balanced', None],
    'solver' : ['lbfgs'],
    'max_iter' : np.arange(100, 500, 100)  # 100, 200, 300, 400
}

# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24,
# so supplying it raises a TypeError on current releases.
log_gscv = GridSearchCV(estimator=LogisticRegression(), param_grid=log_param_grid, cv=10)

log_gscv.fit(train_X, train_Y)

print(f"LogRegression Best Score:\n{log_gscv.best_score_}")
print(f"LogRegression Best Params:\n{log_gscv.best_params_}")



LogRegression Best Score:
0.7954049460853259
LogRegression Best Params:
{'class_weight': None, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


# SGD

In [66]:
# Hyper-parameter grid for the SGD linear classifier; every combination is
# scored with 10-fold cross-validation on the scaled training split.
sgd_params_grid = {
    # NOTE(review): several of these loss names ('log', 'squared_loss') and
    # the 'none' penalty string are the spellings this notebook was written
    # against; newer scikit-learn releases rename them -- confirm against the
    # installed version before re-running.
    'loss' : ['hinge', 'log', 'modified_huber',
              'squared_hinge', 'perceptron',
              'squared_loss', 'huber', 'epsilon_insensitive',
              'squared_epsilon_insensitive'],
    'penalty' : ['none', 'l2', 'l1', 'elasticnet'],
    'alpha' : np.arange(1e-4, 0.9, 0.05),
    'fit_intercept' : [True, False],
    'max_iter' : np.arange(1100, 1500, 50)
}

# random_state is fixed (matching the seed used for the train/test split) so
# that the best score/params printed below are reproducible across runs.
# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24,
# so supplying it raises a TypeError on current releases.
sgd_gscv = GridSearchCV(estimator=SGDClassifier(random_state=0), param_grid=sgd_params_grid, cv=10)

sgd_gscv.fit(train_X, train_Y)

print(f"SGD Best Score:\n{sgd_gscv.best_score_}")
print(f"SGD Best Params:\n{sgd_gscv.best_params_}")

















































































































































































































































































































































































































SGD Best Score:
0.8066248632598845
SGD Best Params:
{'alpha': 0.1501, 'fit_intercept': True, 'loss': 'log', 'max_iter': 1250, 'penalty': 'none'}


# KNN

In [93]:
# Hyper-parameter grid for k-nearest neighbours; every combination is scored
# with 10-fold cross-validation on the scaled training split.
# NOTE(review): 'algorithm' and 'leaf_size' presumably only affect how the
# neighbour search is carried out, not which neighbours are found; if so they
# multiply the search cost without changing scores -- confirm and consider
# trimming.
knn_param_grid = {
    'n_neighbors' : np.arange(1, 25),
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : np.arange(1, 45),
    'p' : [1, 2]
}

# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24,
# so supplying it raises a TypeError on current releases.
knn_gscv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_param_grid, cv=10)

knn_gscv.fit(train_X, train_Y)

# Labels corrected: these results belong to the KNN search, not the SVC one.
print(f"KNN Best Score:\n{knn_gscv.best_score_}")

print(f"KNN Best Params:\n{knn_gscv.best_params_}")

SVC Best Score:
0.8164794007490637
SCV Best Params:
{'algorithm': 'auto', 'leaf_size': 2, 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}


# SVC

In [27]:
# Hyper-parameter grid for the support vector classifier; every combination
# is scored with 10-fold cross-validation on the scaled training split.
# NOTE(review): 'degree' is presumably only used by the 'poly' kernel, and
# 'decision_function_shape' presumably has no effect on a two-class target --
# confirm and consider trimming the grid.
svc_param_grid = {
    'kernel' : ['rbf', 'poly', 'sigmoid'],
    'decision_function_shape' : ['ovo', 'ovr'],
    'degree' : [1, 2, 3, 4, 5],
    'gamma' : ['auto', 'scale']
}

# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24,
# so supplying it raises a TypeError on current releases.
svc_gscv = GridSearchCV(estimator=SVC(), param_grid=svc_param_grid, cv=10)

svc_gscv.fit(train_X, train_Y)

print(f"SVC Best Score:\n{svc_gscv.best_score_}")

# Label typo fixed ("SCV" -> "SVC").
print(f"SVC Best Params:\n{svc_gscv.best_params_}")

SVC Best Score:
0.809077394905454
SCV Best Params:
{'decision_function_shape': 'ovo', 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}


# Ada Boost

In [30]:
# Hyper-parameter grid for AdaBoost; every combination is scored with
# 10-fold cross-validation on the scaled training split.
ada_param_grid = {
    'n_estimators' : np.arange(25, 75, 5),
    'learning_rate' : np.arange(0.75, 1.25, 0.05),
    # NOTE(review): 'SAMME.R' is deprecated in newer scikit-learn releases --
    # confirm against the installed version before re-running.
    'algorithm' : ['SAMME', 'SAMME.R']
}

# random_state is fixed (matching the seed used for the train/test split) so
# that the printed best score/params are reproducible across runs.
# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 (where False became the default) and removed in 0.24,
# so supplying it raises a TypeError on current releases.
ada_gscv = GridSearchCV(estimator=AdaBoostClassifier(random_state=0), param_grid=ada_param_grid, cv=10)

ada_gscv.fit(train_X, train_Y)

print(f"Ada Boost Score:\n{ada_gscv.best_score_}")

print(f"Ada Boost Best Params:\n{ada_gscv.best_params_}")

Ada Boost Score:
0.8017340600093764
Ada Boost Best Params:
{'algorithm': 'SAMME.R', 'learning_rate': 0.8, 'n_estimators': 25}
