In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import numpy as np

titanic_df = pd.read_csv('../data/titanic.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
titanic_df.drop(['Name', 'Cabin', 'PassengerId', 'Ticket', 'Embarked'], axis=1, inplace=True)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [18]:
# Binarize Sex
titanic_df['Sex'] = (titanic_df['Sex'] == 'male').astype(int)

In [66]:
X = titanic_df.iloc[:, 1:]
y = titanic_df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [25]:
age_fill_series = X_train.groupby(['Sex','Pclass']).Age.median().reset_index()
age_fill_series

Unnamed: 0,Sex,Pclass,Age
0,0,1,35.0
1,0,2,28.0
2,0,3,22.0
3,1,1,42.0
4,1,2,31.0
5,1,3,25.0


In [36]:
def impute_age(X):
    merged_frame = pd.merge(X, age_fill_series, how='left', 
                             left_on=['Sex', 'Pclass'], 
                             right_on=['Sex', 'Pclass'])

    merged_frame.rename(columns={'Age_x': 'Age', 'Age_y': 'median_age'}, inplace=True)
    merged_frame.Age.fillna(merged_frame.median_age, inplace=True)
    merged_frame.drop(['median_age'], axis=1, inplace=True)
    return merged_frame

X_train_imputed = impute_age(X_train)
X_test_imputed = impute_age(X_test)

X_train_imputed.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,1,54.0,0,1,77.2875
1,3,1,7.0,4,1,39.6875
2,3,1,25.0,0,0,56.4958
3,3,1,22.0,0,0,8.05
4,3,1,40.0,0,0,7.225


In [40]:
ssX = StandardScaler()
X_train_imputed_scaled = ssX.fit_transform(X_train_imputed)
X_test_imputed_scaled = ssX.fit_transform(X_test_imputed)

## Load the models

In [58]:
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

### kNN

In [59]:
models = [('knn', KNN), 
          ('logistic', LogisticRegression),
          ('tree', DecisionTreeClassifier),
          ('forest', RandomForestClassifier)
         ]

param_choices = [
    {
        'n_neighbors': range(1, 12)
    },
    {
        'C': np.logspace(-3,6, 12),
        'penalty': ['l1', 'l2']
    },
    {
        'max_depth': [1,2,3,4,5],
        'min_samples_leaf': [3,6,10]
    },
    {
        'n_estimators': [50, 100, 200],
        'max_depth': [1,2,3,4,5],
        'min_samples_leaf': [3,6,10]
    }
]

grids = {}
for model_info, params in zip(models, param_choices):
    name, model = model_info
    grid = GridSearchCV(model(), params)
    grid.fit(X_train_imputed_scaled, y_train)
    s = f"{name}: best score: {grid.best_score_}"
    print(s)
    grids[name] = grid

knn: best score: 0.8173652694610778
logistic: best score: 0.812874251497006
tree: best score: 0.8233532934131736
forest: best score: 0.8218562874251497


## Let's use logisitic!

Trees scored better, but let's use the Logistic Regression data just to get practice.

In [62]:
# Let's get the testing score
grids['logistic'].best_estimator_.score(X_test_imputed_scaled, y_test)

0.7668161434977578

### Let's retrain on all the data!

In [97]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('logistic', LogisticRegression(**grids['logistic'].best_params_))])

# Do the impuation step
X.Age.fillna(X.groupby(['Sex', 'Pclass']).Age.transform(np.median), inplace=True)

pipeline.fit(X,y)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logistic', LogisticRegression(C=0.2848035868435802, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [98]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [99]:
import pickle

pickle.dump(pipeline, open('model.pkl', 'wb'))

## make model for prediction

In [100]:
X.iloc[0]

Pclass     3.00
Sex        1.00
Age       22.00
SibSp      1.00
Parch      0.00
Fare       7.25
Name: 0, dtype: float64

In [108]:
example = {
  'Pclass': 3,  # int
  'Sex': 'M',    # M or F
  'Age': 22,    # int
  'SibSp': 1,  # int
  'Parch': 0,  # int
  'Fare': 7.25    # float
}

def make_prediction(features):
    X = np.array([features['Pclass'], int(features['Sex'] == 'M'), features['Age'], 
                  features['SibSp'], features['Parch'], features['Fare']]).reshape(1,-1)
    prob_survived = pipeline.predict_proba(X)[0, 1]
    
    result = {
        'prediction': int(prob_survived > 0.5),
        'prob_survived': prob_survived
    }
    return result

In [109]:
make_prediction(example)

{'prediction': 0, 'prob_survived': 0.09804188983748195}

In [110]:
pipeline.predict_proba(X.iloc[0].values.reshape(1,-1))

array([[0.90195811, 0.09804189]])