# Imports

In [62]:
import itertools

import pandas as pd
import numpy as np

from scipy import stats
from scipy.signal import argrelextrema

import matplotlib.pyplot as plt
# from matplotlib import style
import seaborn as sns; sns.set()

# style.use('fivethirtyeight')

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier,
    BaggingClassifier, VotingClassifier, StackingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)

%matplotlib inline

# Data

In [3]:
# Load the training data, lower-case the column labels and index the rows
# by passenger id (the id column is moved into the index, not kept as a column).
train = (
    pd.read_csv("train.csv")
    .rename(columns=str.lower)
    .set_index('passengerid')
)

- Separate `X` and `y`
- Create train/test sets

In [72]:
# Target is the `survived` column; the features are everything else.
y = train['survived']
X = train.drop(columns=['survived'])

# Hold out 30% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# Encode Columns

In [74]:
# convert dtype to "category"
# Convert the low-cardinality columns to the "category" dtype.
# `astype` with a per-column mapping returns a new frame, so nothing is
# written into the slice produced by train_test_split (no
# SettingWithCopyWarning) and the dtype change is guaranteed to stick.
X_train = X_train.astype({
    'pclass': 'category',
    'cabin': 'category',
    'embarked': 'category',
})

In [100]:
# Binary bag-of-words over passenger names, keeping only tokens that occur
# in at least 10% of the training names.
cvect = CountVectorizer(min_df=0.1, binary=True)

# `.toarray()` yields a plain ndarray (`.todense()` returns np.matrix).
cvect_data = cvect.fit_transform(X_train.name).toarray()

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever the installed version provides.
if hasattr(cvect, 'get_feature_names_out'):
    cvect_cols = list(cvect.get_feature_names_out())
else:
    cvect_cols = cvect.get_feature_names()

# One indicator column per retained token, aligned with X_train's index.
cvect_df = pd.DataFrame(cvect_data, columns=cvect_cols, index=X_train.index)

In [111]:
# Show each ticket string split on whitespace into its tokens.
X_train['ticket'].str.split()

passengerid
232                 [347067]
837                 [315097]
640                 [376564]
390               [SC, 1748]
598                   [LINE]
               ...          
132    [SOTON/O.Q., 3101307]
491                  [65304]
839                   [1601]
49                    [2662]
81                  [345767]
Name: ticket, Length: 623, dtype: object

In [112]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
232,3,"Larsson, Mr. Bengt Edvin",male,29.0,0,0,347067,7.775,,S
837,3,"Pasic, Mr. Jakob",male,21.0,0,0,315097,8.6625,,S
640,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S
390,2,"Lehmann, Miss. Bertha",female,17.0,0,0,SC 1748,12.0,,C
598,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S


In [None]:
# Encode sex as 1 (male) / 0 (female).  Assign the result back explicitly:
# an in-place replace on the attribute-accessed column is a chained
# operation that is not guaranteed to modify `train`.
train['sex'] = train['sex'].replace({
    'male' : 1,
    'female' : 0
})

# Per-row count of missing values, plus a flag for "anything missing".
train['missing_cnt'] = train.isnull().sum(axis=1)

train['missing_any'] = train.missing_cnt > 0

# One-hot encode port of embarkation and passenger class.
train = train.join(pd.get_dummies(train.embarked, prefix='embarked'))

train = train.join(pd.get_dummies(train.pclass, prefix='pclass'))

# The title is the text between ", " and the first "." in the name.
titles = train.name.str.split(', ').str[-1].str.split('.').str[0]

train = train.join(pd.get_dummies(titles, prefix='title'))

# Some titles contain spaces; normalise the resulting column labels.
train.columns = train.columns.str.replace(' ', '_')

# Combined indicator for the "Master" and "Miss" titles.
train['miss_master'] = train.loc[:, [
    'title_Master',
    'title_Miss',
]].sum(axis=1)

# Keep as many principal components as needed to explain 99% of the variance.
pca = PCA(n_components=.99, random_state=0)

pca_input = train.loc[:, [
    'miss_master',
    'title_Miss',
    'title_Master'
]]

# The component frame must carry `train`'s index: `join` aligns on index
# labels, and a default 0..n-1 RangeIndex would attach every component to
# the wrong passenger (and leave NaN for ids without a matching position).
pca_components = pd.DataFrame(
    pca.fit_transform(pca_input),
    index=train.index
).add_prefix('pca_')

train = train.join(pca_components)

## Age

In [None]:
# Rows with a known age, keeping the columns from `pclass` onwards.
# Take an explicit copy: later cells add columns to `age`, which would
# otherwise write into a slice of `train` (SettingWithCopyWarning).
age = train.loc[
    train['age'].notnull(),
    'pclass':
].copy()

In [None]:
def poi(s, n_points=10000):
    """
    Fit a Gaussian KDE to the values and locate its extrema and inflection points.

    The density is evaluated on an evenly spaced grid from 0 to ``max(s) + 1``.
    Missing values (NaN) are ignored.

    Parameters
    ----------
    s : array-like
        the values to be used for computing the Gaussian KDE
    n_points : int, optional
        number of grid points on which the density is evaluated
        (default 10000)

    Returns
    -------
    rel_max : array
        x positions of the relative maxima of the density
    rel_min : array
        x positions of the relative minima of the density
    poi_max : array
        x positions where the slope of the density has a relative maximum
    poi_min : array
        x positions where the slope of the density has a relative minimum

    Raises
    ------
    ValueError
        if fewer than two non-missing values are supplied
    """

    # accept any array-like (list, ndarray, Series) and drop missing values,
    # which would otherwise poison the KDE covariance estimate
    values = np.asarray(s, dtype=float).ravel()
    values = values[~np.isnan(values)]

    if values.size < 2:
        raise ValueError("poi requires at least two non-missing values")

    # compute the gaussian kde
    gkde = stats.gaussian_kde(values)

    # evenly spaced x points between 0 and the maximum of the values (+1)
    x = np.linspace(0, values.max() + 1, n_points)

    # estimate y points based on x points
    y = gkde.pdf(x)

    # indices of all relative maxima / minima of the density
    rel_max = argrelextrema(y, np.greater)[0]
    rel_min = argrelextrema(y, np.less)[0]

    # grid spacing, used to turn finite differences into a slope
    dx = x[1] - x[0]

    # first derivative of the density
    dydx = np.gradient(y, dx)

    # extrema of the slope are the inflection points of the density
    poi_max = argrelextrema(dydx, np.greater)[0]
    poi_min = argrelextrema(dydx, np.less)[0]

    return x[rel_max], x[rel_min], x[poi_max], x[poi_min]

In [None]:
# Extrema and slope-extrema of the age density, in the order returned by poi().
POI = poi(age.age)

rmax, rmin, pmax, pmin = POI

fig, ax = plt.subplots(figsize=(16, 8))

age.age.plot.kde(ax=ax, label='age', alpha=.5)

ax.set_xlim(0, age.age.max())

# Density maxima/minima in thin black; slope maxima in green and slope
# minima in red, both drawn thicker.
marker_styles = (
    (rmax, {'color': 'k'}),
    (rmin, {'color': 'k'}),
    (pmax, {'color': 'g', 'lw': 2}),
    (pmin, {'color': 'r', 'lw': 2}),
)

for positions, line_kw in marker_styles:
    for x_pos in positions:
        ax.axvline(x_pos, ls=':', **line_kw)

In [None]:
# All extrema and inflection points merged into one ascending array of
# cut points.  Pure NumPy, so no dependency on `itertools`.
p = np.sort(np.concatenate(POI))

In [None]:
# Bin each age by the sorted cut points in `p`:
#   group 0 for age <= p[0], group k for p[k-1] < age <= p[k],
#   and group len(p) for ages above the last cut point.
# `searchsorted(..., side='left')` gives exactly that index and works for
# any number of cut points, where a fixed chain of ten `np.where` calls
# raises IndexError whenever `p` has fewer than ten entries.
age['age_grp'] = np.searchsorted(p, age['age'].to_numpy(), side='left')

In [None]:
# Bar chart of how many passengers fall in each age group, in group order.
age_grp_counts = age['age_grp'].value_counts().sort_index()
age_grp_counts.plot.bar();

In [None]:
# Binary age target: True when the age lies strictly below the fourth cut point.
age['age_grp2'] = age['age'].lt(p[3])

In [None]:
# Keep only the columns that have no missing values at all.
age = age.dropna(axis='columns', how='any')

In [None]:
# Absolute correlation of every numeric/boolean feature with the age group.
# Text columns (e.g. name, ticket) are excluded explicitly because recent
# pandas versions no longer drop non-numeric columns silently in `corrwith`.
age_numeric = age.drop(columns=[
    'age',
]).select_dtypes(include=['number', 'bool'])

age_numeric.corrwith(age['age_grp']).abs().sort_values()

In [None]:
# Columns excluded from the age model: the age-derived columns, the raw
# text columns, and every title indicator.
title_cols = [col for col in age.columns if 'title' in col]

age_drop_cols = ['age', 'age_grp', 'age_grp2', 'ticket', 'name'] + title_cols

age_features = age.drop(columns=age_drop_cols)

# 90/10 split with a fixed seed; the target is the binary age group.
train_age_X, test_age_X, train_age_Y, test_age_Y = train_test_split(
    age_features,
    age['age_grp2'],
    test_size=0.1,
    random_state=0,
)

In [None]:
# Scale every feature using ranges learned on the training split only.
scaler = MinMaxScaler()
scaler.fit(train_age_X)

train_age_X = scaler.transform(train_age_X)
test_age_X = scaler.transform(test_age_X)

# Keep the top 30% of features, scored against the training target only.
selector = SelectPercentile(percentile=30)
selector.fit(train_age_X, train_age_Y)

train_age_X = selector.transform(train_age_X)
test_age_X = selector.transform(test_age_X)

In [None]:
# k-nearest-neighbours search space for the age-group model.
pg = {
    'n_neighbors' : np.arange(1, 30),
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
gs = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=pg, cv=10)

gs.fit(train_age_X, train_age_Y)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Multinomial naive Bayes search space for the age-group model.
# NOTE(review): the alpha range starts at 0 (no smoothing) — confirm that is intended.
pg = {
    'alpha' : np.arange(0, 1, 0.05),
    'fit_prior' : [True, False]
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
gs = GridSearchCV(estimator=MultinomialNB(), param_grid=pg, cv=10)

gs.fit(train_age_X, train_age_Y)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Random-forest search space for the age-group model.
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (already in the grid) and newer scikit-learn releases reject it.
pg = {
    'n_estimators' : np.arange(10, 100, 5),
    'criterion' : ['gini', 'entropy'],
    'max_features' : ['sqrt', 'log2', None],
    'bootstrap' : [True, False]
}

# Seed the forest so the search is reproducible, matching the fixed seeds
# used elsewhere in this notebook.  `iid` is not passed: the parameter was
# removed from GridSearchCV and omitting it matches `iid=False`.
gs = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=pg, cv=10)

gs.fit(train_age_X, train_age_Y)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Fit the chosen forest.  max_features='sqrt' is what 'auto' meant for
# classifiers; the explicit spelling also works on releases that dropped 'auto'.
rfc = RandomForestClassifier(bootstrap=True, criterion='gini', max_features='sqrt', n_estimators=25, random_state=0)
rfc.fit(train_age_X, train_age_Y)

# rfc.feature_importances_

age_pred = rfc.predict(test_age_X)

# Use one label ordering for the matrix itself and for both axes.
age_labels = test_age_Y.unique()

# Confusion matrix, transposed so that rows are predictions and columns are actuals.
pd.DataFrame(
    confusion_matrix(test_age_Y, age_pred, labels=age_labels),
    columns=pd.MultiIndex.from_product([['Pred'], age_labels]),
    index=pd.MultiIndex.from_product([['Actual'], age_labels])
).T

In [None]:
# Share of held-out age-group labels predicted correctly.
accuracy_score(y_true=test_age_Y, y_pred=age_pred)

In [None]:
# Precision of the age-group predictions on the held-out rows.
precision_score(y_true=test_age_Y, y_pred=age_pred)

In [None]:
# Recall of the age-group predictions on the held-out rows.
recall_score(y_true=test_age_Y, y_pred=age_pred)

In [None]:
# F1 score of the age-group predictions on the held-out rows.
f1_score(y_true=test_age_Y, y_pred=age_pred)

In [None]:
# Frequency of each sibsp value.
train['sibsp'].value_counts()

In [None]:
# Frequency of each parch value.
train['parch'].value_counts()

In [None]:
# Bar chart of the per-passenger sum sibsp + parch, ordered by that sum.
family_size = train[['sibsp', 'parch']].sum(axis=1)

family_size.value_counts().sort_index().plot.bar()

In [None]:
# Kernel density estimates of sibsp and parch on shared axes, starting at x = 0.
fig, ax = plt.subplots(figsize=(16, 8))

train[['sibsp', 'parch']].plot.kde(ax=ax)

ax.set_xlim(left=0);

In [None]:
# Age density with dotted reference lines at ages 14 and 13.
fig, ax = plt.subplots(figsize=(16, 8))

age['age'].plot.kde(ax=ax)

ax.set_xlim(0, age['age'].max())

for cutoff in (14, 13):
    ax.axvline(cutoff, lw=2, ls=':', color='k')

# Train-Test Split

In [None]:
train_X, test_X, train_Y, test_Y = train_test_split(
    train.drop(columns=[
        'passengerid',
        'survived',
        'pclass',
        'name',
        'age',
        'ticket',
        'cabin',
        'embarked',
    ]),
    train.survived,
    test_size=0.1,
    random_state=0
)

# Scale

In [None]:
# Learn the feature ranges on the training split, then apply them to both splits.
scaler = MinMaxScaler().fit(train_X)

train_X = scaler.transform(train_X)

test_X = scaler.transform(test_X)

# Logistic Regression

In [None]:
# Logistic-regression search space (lbfgs solver only).
# NOTE(review): the string 'none' for `penalty` is believed to be rejected by
# recent scikit-learn releases in favour of None — confirm against the
# installed version before running.
log_param_grid = {
    'penalty' : ['l2', 'none'],
    'fit_intercept' : [True, False],
    'class_weight' : ['balanced', None],
    'solver' : ['lbfgs'],
    'max_iter' : np.arange(100, 500, 100)
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
log_gscv = GridSearchCV(estimator=LogisticRegression(), param_grid=log_param_grid, cv=10)

log_gscv.fit(train_X, train_Y)

print(f"LogRegression Best Score:\n{log_gscv.best_score_}")
print(f"LogRegression Best Params:\n{log_gscv.best_params_}")

# SGD

In [None]:
# SGD classifier search space.
# NOTE(review): some loss/penalty spellings here ('log', 'squared_loss',
# 'none') are believed to have been renamed or removed in recent
# scikit-learn releases — confirm against the installed version.
sgd_params_grid = {
    'loss' : ['hinge', 'log', 'modified_huber',
              'squared_hinge', 'perceptron',
              'squared_loss', 'huber', 'epsilon_insensitive',
              'squared_epsilon_insensitive'],
    'penalty' : ['none', 'l2', 'l1', 'elasticnet'],
    'alpha' : np.arange(1e-4, 0.9, 0.05),
    'fit_intercept' : [True, False],
    'max_iter' : np.arange(1100, 1500, 50)
}

# SGD shuffles the data while fitting, so seed the estimator to make the
# search reproducible (same fixed seed as the rest of the notebook).
# `iid` is not passed: the parameter was removed from GridSearchCV and
# omitting it matches `iid=False`.
sgd_gscv = GridSearchCV(estimator=SGDClassifier(random_state=0), param_grid=sgd_params_grid, cv=10)

sgd_gscv.fit(train_X, train_Y)

print(f"SGD Best Score:\n{sgd_gscv.best_score_}")
print(f"SGD Best Params:\n{sgd_gscv.best_params_}")

# KNN

In [None]:
# k-nearest-neighbours search space for the survival model.
knn_param_grid = {
    'n_neighbors' : np.arange(1, 25),
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : np.arange(1, 45),
    'p' : [1, 2]
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
knn_gscv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_param_grid, cv=10)

knn_gscv.fit(train_X, train_Y)

# Labels name the model actually being reported (they previously said SVC).
print(f"KNN Best Score:\n{knn_gscv.best_score_}")

print(f"KNN Best Params:\n{knn_gscv.best_params_}")

# SVC

In [None]:
# Support-vector classifier search space for the survival model.
svc_param_grid = {
    'kernel' : ['rbf', 'poly', 'sigmoid'],
    'decision_function_shape' : ['ovo', 'ovr'],
    'degree' : [1, 2, 3, 4, 5],
    'gamma' : ['auto', 'scale']
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
svc_gscv = GridSearchCV(estimator=SVC(), param_grid=svc_param_grid, cv=10)

svc_gscv.fit(train_X, train_Y)

print(f"SVC Best Score:\n{svc_gscv.best_score_}")

# Label corrected from the "SCV" typo.
print(f"SVC Best Params:\n{svc_gscv.best_params_}")

# Ada Boost

In [None]:
# AdaBoost search space for the survival model.
# NOTE(review): the 'SAMME.R' algorithm option is believed to be deprecated
# in recent scikit-learn releases — confirm against the installed version.
ada_param_grid = {
    'n_estimators' : np.arange(25, 75, 5),
    'learning_rate' : np.arange(0.75, 1.25, 0.05),
    'algorithm' : ['SAMME', 'SAMME.R']
}

# `iid` is not passed: the parameter was removed from GridSearchCV, and
# leaving it out gives the same per-fold averaging that `iid=False` requested.
ada_gscv = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=ada_param_grid, cv=10)

ada_gscv.fit(train_X, train_Y)

print(f"Ada Boost Score:\n{ada_gscv.best_score_}")

print(f"Ada Boost Best Params:\n{ada_gscv.best_params_}")