# Setup

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

# neural net
from sklearn.neural_network import MLPClassifier 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # use this for consistency
from sklearn.feature_extraction import DictVectorizer # if needed
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ParameterGrid

# tree
from sklearn import preprocessing, tree
from sklearn import ensemble

# Preprocessing functions, in order

In [2]:
def prefix_encoding(df, drop=True):
    """Split ``Ticket`` into ``Ticket Prefix`` and ``Ticket Number`` columns.

    Both columns are written onto ``df`` itself. Tickets the pattern does not
    match (it requires a whitespace after the prefix) get ``'_'`` in both
    columns. The prefix is then stripped of ``.``, spaces and ``/`` and
    upper-cased.

    Parameters
    ----------
    df : DataFrame with a string ``Ticket`` column.
    drop : when True, rows whose prefix occurs at most once are removed from
        the returned frame.

    Returns
    -------
    ``df`` itself when ``drop`` is False, otherwise a filtered frame.
    """
    parsed = df['Ticket'].str.extract(r"(.+\s|\w+\s)(\d*)*").fillna('_')
    df[['Ticket Prefix','Ticket Number']] = parsed

    without_punctuation = df['Ticket Prefix'].str.replace(r"\.| |\/",'', regex=True)
    df['Ticket Prefix'] = without_punctuation.str.upper()

    if not drop:
        return df

    # Prefixes seen only once carry no group information, so drop those rows.
    prefix_counts = df['Ticket Prefix'].value_counts()
    rare_prefixes = prefix_counts[prefix_counts <= 1].index
    return df[~df['Ticket Prefix'].isin(rare_prefixes)]

In [3]:
def prefix_prob(df):
    """Add ``Prefix Probability``: each row's ticket-prefix share of all rows.

    Requires a ``Ticket Prefix`` column. The share is the prefix count divided
    by the total number of rows in ``df``. The column is written onto ``df``,
    which is also returned.
    """
    row_total = len(df)
    prefix_share = df['Ticket Prefix'].value_counts().div(row_total)
    df['Prefix Probability'] = df['Ticket Prefix'].map(prefix_share)
    return df

In [4]:
def impute_decks(df):
    """Give every passenger a ``Deck`` letter and return the rows sorted by id.

    Requires ``Cabin``, ``Ticket Prefix`` (strings) and ``PassengerId``
    columns. A ``Deck`` column holding the leading letters of ``Cabin`` is
    written onto ``df``; the returned frame is a new one in which missing
    decks are filled per ticket-prefix group:

    1. if the prefix starts with a valid deck letter (A-G), that letter is
       used;
    2. otherwise, if the group has some known decks, missing ones are drawn
       to follow the group's deck distribution;
    3. groups without any known deck are pooled and filled from the deck
       distribution of the whole frame.

    If the frame has no known deck at all, step 3 leaves those rows missing.

    Returns
    -------
    DataFrame sorted by ``PassengerId``.
    """
    valid_decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

    df['Deck'] = df['Cabin'].str.extract(r"([A-Za-z]+)", expand=False)
    known = df[df['Deck'].notna()]

    # Collect the pieces and concatenate once instead of growing a frame
    # inside the loop.
    pieces = [known]
    orphans = []  # rows whose prefix group has no known deck

    for prefix in df['Ticket Prefix'].unique():
        group = df[df['Ticket Prefix'] == prefix]
        # Explicit copy: the rows are modified below and must not alias df.
        group_missing = group[group['Deck'].isna()].copy()
        if group_missing.empty:
            continue
        group_known = group[group['Deck'].notna()]

        # prefix[:1] (not prefix[0]) so an empty prefix string is tolerated.
        if prefix[:1] in valid_decks:
            group_missing['Deck'] = prefix[0]
            pieces.append(group_missing)
        elif group_known.empty:
            orphans.append(group_missing)
        else:
            pieces.append(impute_decks_helper(group_missing, group_known, 'Deck'))

    if orphans:
        pooled = pd.concat(orphans)
        pieces.append(impute_decks_helper(pooled, known, 'Deck'))

    return pd.concat(pieces).sort_values('PassengerId')


def impute_decks_helper(df_na, df_notna, col, random_state=42):
    """Fill ``df_na[col]`` so it mirrors the distribution of ``df_notna[col]``.

    Each value of ``df_notna[col]`` receives ``int(share * len(df_na))`` rows;
    the rows left over by the truncation all go to the last (least frequent)
    value. Which rows receive which value is decided by a seeded random
    permutation, so the result is random with respect to row order but
    reproducible.

    Parameters
    ----------
    df_na : frame whose ``col`` is to be filled; modified in place.
    df_notna : frame supplying the reference distribution of ``col``.
    col : column name; it may sit at any position in ``df_na``.
    random_state : seed for the row permutation.

    Returns
    -------
    ``df_na`` itself. It is returned unchanged when it is empty or when
    ``df_notna[col]`` has no non-null values to draw from.
    """
    counts = df_notna[col].value_counts()
    n_missing = len(df_na)
    if n_missing == 0 or counts.empty:
        return df_na

    col_pos = df_na.columns.get_loc(col)
    quotas = (counts / counts.sum() * n_missing).astype(int)
    order = np.random.RandomState(random_state).permutation(n_missing)

    last_label = quotas.index[-1]
    start = 0
    for label, quota in quotas.items():
        # The last label absorbs whatever the truncated quotas left over.
        stop = n_missing if label == last_label else start + int(quota)
        if stop > start:
            df_na.iloc[order[start:stop], col_pos] = label
        start = stop
    return df_na



In [5]:
def impute_decks_alt(df):
    """Attach a ``Deck`` column, filling unknown decks from the global distribution.

    Requires a ``Cabin`` column. Rows with a cabin get the cabin's leading
    letters as deck. Rows without a cabin are shuffled with a fixed seed and
    given decks in proportion to the known ones: each deck receives
    ``int(share * number_of_missing_rows)`` rows and the rows left over by
    the truncation go to the last (least frequent) deck.

    The deck values are assembled in an object array and written back by
    position. This avoids writing strings into a float placeholder column
    and does not depend on the index being unique.

    Returns
    -------
    ``df`` itself, with the ``Deck`` column added. When no cabin is known at
    all, ``Deck`` is left entirely missing.
    """
    has_cabin = df['Cabin'].notna().to_numpy()
    if not has_cabin.any():
        df['Deck'] = np.nan
        return df

    known_decks = df.loc[has_cabin, 'Cabin'].str.extract(r"([A-Za-z]+)", expand=False)
    deck_values = np.full(len(df), np.nan, dtype=object)
    deck_values[has_cabin] = known_decks.to_numpy()

    missing_pos = np.flatnonzero(~has_cabin)
    counts = known_decks.value_counts()
    if len(missing_pos) and len(counts):
        # Shuffle the positions of the rows to fill (fixed seed, reproducible).
        shuffled = pd.Series(missing_pos).sample(frac=1, random_state=42).to_numpy()
        quotas = (counts / has_cabin.sum() * len(missing_pos)).astype(int)

        start = 0
        for label in quotas.index[:-1]:
            stop = start + int(quotas.loc[label])
            deck_values[shuffled[start:stop]] = label
            start = stop
        # Remainder from the integer truncation goes to the last deck.
        deck_values[shuffled[start:]] = quotas.index[-1]

    # attaching deck to the caller's frame
    df['Deck'] = deck_values
    return df

In [6]:
def deck_prob_encoding(df):
    """Add ``Deck Probability``: each row's deck share of all rows.

    Requires a ``Deck`` column. The share is the deck's count divided by the
    total number of rows in ``df``. The column is written onto ``df``, which
    is also returned.
    """
    row_total = len(df)
    deck_share = df['Deck'].value_counts().div(row_total)
    df['Deck Probability'] = df['Deck'].map(deck_share)
    return df

In [7]:
def title_extract(df):
    """Replace/add a ``Title`` column holding a numeric weight per name title.

    The first honorific found in ``Name`` (``Mr.``, ``Mrs.``, ``Master`` ...)
    is mapped to the integer weight below; names without a recognised title
    get 0. The column is written onto ``df``, which is also returned.

    Titles are matched as whole tokens. A title must start at a word boundary
    and must not be followed by another word character, so a surname such as
    "Sirayanian" is not mistaken for the title "Sir".
    """
    # for weighting if desired
    title_weights = {
        'Mr.': 1, 'Master': 2, 'Sir': 5, 'Rev.': 5, 'Col.': 5, 'Don.': 5,
        'Major': 5, 'Capt.': 5, 'Jonkheer.': 5, 'Dr.': 5,
        'Ms.': 3, 'Mrs.': 4, 'Miss.': 3, 'Madam': 4, 'Mme.': 4, 'Mlle.': 3,
        'Countess': 5, 'Lady': 5, 'Dona.': 5,
    }

    # Build the alternation from the mapping keys so pattern and weights
    # cannot drift apart; literal dots are escaped for the regex.
    alternatives = '|'.join(title.replace('.', r'\.') for title in title_weights)
    pattern = rf'\b({alternatives})(?!\w)'

    titles = df['Name'].str.extract(pattern, expand=False).fillna('Unknown')
    df['Title'] = titles.map(dict(title_weights, Unknown=0))

    return df

def age_title_impute(df):
    """Build ``Best Age`` by filling missing ages with the per-title mean age.

    Requires ``Title`` and ``Age`` columns. Missing ages take the rounded mean
    age of rows sharing the same ``Title``; anything still missing afterwards
    takes the mean of the ages filled so far.

    ``Age Estimate`` and ``Best Age`` are written onto ``df``; the returned
    frame is a new one without ``Age`` and ``Age Estimate``.
    """
    mean_age_by_title = df.groupby("Title")["Age"].mean()
    df["Age Estimate"] = df["Title"].map(mean_age_by_title).round()

    title_filled = df["Age"].fillna(df["Age Estimate"])
    # Titles whose ages are all missing fall back to the overall mean.
    df["Best Age"] = title_filled.fillna(title_filled.mean())

    return df.drop(columns=["Age", "Age Estimate"])

In [8]:
def log_column(df, column):
    """Replace ``df[column]`` with its natural log and return ``df``.

    A tiny offset is added before taking the log so that zeros map to a large
    negative number instead of ``-inf``.
    """
    offset = 1e-8 # prevents infinity
    shifted = df[column] + offset
    df[column] = np.log(shifted)
    return df

In [9]:
def family_size(df):
    """Add ``Family Size`` = siblings/spouses + parents/children + the passenger.

    Requires ``SibSp`` and ``Parch`` columns. The column is written onto
    ``df``, which is also returned.
    """
    relatives_aboard = df['SibSp'] + df['Parch']
    df['Family Size'] = relatives_aboard + 1
    return df

In [10]:
def normalize(df, column):
    """Min-max scale ``df[column]`` and return ``df``.

    A fresh ``MinMaxScaler`` is fitted on this frame's own values on every
    call, so two frames scaled separately do not share the same scale.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[[column]])
    df[column] = scaled_values
    return df

In [11]:
def encode(df, cols):
    """One-hot encode the columns listed in ``cols``.

    Returns a new frame in which each listed column is replaced by integer
    indicator columns; ``df`` itself is not modified.
    """
    encoded = pd.get_dummies(df, columns=cols, dtype=int)
    return encoded

In [12]:
def cleanup(df, cols=[]):
    """Fill leftover nulls and drop the columns the model should not see.

    Meant to run AFTER the feature functions above. Missing ``Embarked``
    values are set to ``'S'`` on ``df`` itself; the returned frame is a new
    one without the columns named in ``cols``.

    Assumes the training set is representative of the true population.
    Mean/median fills for ``Age`` and ``Fare`` used to live here and are
    currently disabled.
    """
    embarked_filled = df['Embarked'].fillna('S')
    df['Embarked'] = embarked_filled

    # excluded columns, modify accordingly at the call site
    return df.drop(columns=cols)


# Data loading

In [13]:
# loading data, filepath based on deepnote directory
df_train = pd.read_csv('/work/train.csv')
df_test = pd.read_csv('/work/test.csv')

# Encode 'Sex' with ONE encoder fitted on the training data and reused for
# the test data, so a label is guaranteed to map to the same integer in both
# frames. Two independently fitted encoders only agree when both frames
# happen to contain exactly the same set of labels.
sex_encoder = LabelEncoder().fit(df_train['Sex'])
df_train['Sex'] = sex_encoder.transform(df_train['Sex'])
df_test['Sex'] = sex_encoder.transform(df_test['Sex'])

## Applying functions

In [14]:
# Preprocess the TRAINING set into the model-ready frame `df_copy`.
# 'Survived' is deliberately KEPT here; it is dropped later when the
# feature matrices are built.

# Columns used by the current experiment ('Name' is only needed to derive
# the title feature and is dropped again below).
df_copy = df_train.loc[:, ['Sex','Pclass','Age','Embarked','Name','Survived']]
# Disabled ticket-prefix / deck features. They read 'Ticket', 'Cabin' and
# 'PassengerId', which are not in the column selection above, so re-enabling
# them also requires widening that selection.
# df_copy = prefix_encoding(df_copy) 
# df_copy = prefix_prob(df_copy)
# df_copy = impute_decks(df_copy)
# df_copy = deck_prob_encoding(df_copy)
# Title weight first, because the age imputation groups by 'Title'.
df_copy = title_extract(df_copy)
df_copy = age_title_impute(df_copy)
# df_copy = family_size(df_copy)

# Scaling: min-max normalise the numeric features. The log transform
# (intended to allow use of ReLU activation) is currently disabled.
# df_copy = log_column(df_copy, 'Age')
df_copy = normalize(df_copy, 'Best Age')
# df_copy = normalize(df_copy, 'Fare')
df_copy = normalize(df_copy, 'Title')

# encoding and cleaning, DON'T DROP 'Survived'
# encode_cols / drop_cols are reused by the test-set cell further down.
encode_cols = ['Embarked','Pclass']
# drop_cols = ['Cabin','Name','Ticket','Ticket Number','Ticket Prefix']
drop_cols = ['Name']

# cleanup fills missing 'Embarked' values before they are one-hot encoded.
df_copy = cleanup(df_copy, drop_cols)
df_copy = encode(df_copy, encode_cols)
df_copy

Unnamed: 0,Sex,Survived,Title,Best Age,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,0,0.00,0.271174,0,0,1,0,0,1
1,0,1,0.75,0.472229,1,0,0,1,0,0
2,0,1,0.50,0.321438,0,0,1,0,0,1
3,0,1,0.75,0.434531,0,0,1,1,0,0
4,1,0,0.00,0.434531,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,1,0,1.00,0.334004,0,0,1,0,1,0
887,0,1,0.50,0.233476,0,0,1,1,0,0
888,0,0,0.50,0.271174,0,0,1,0,0,1
889,1,1,0.00,0.321438,1,0,0,1,0,0


In [15]:
# Preprocess the TESTING set into `df_copytest`, mirroring the training cell
# (same steps, same order; there is no 'Survived' column here).
# NOTE: the per-title age means and the min-max scalers below are computed
# from the test frame itself, not taken from the training frame, so the two
# frames are not on exactly the same scale.
df_copytest = df_test.loc[:, ['Sex','Pclass','Age','Embarked','Name']]
# Disabled ticket-prefix / deck features (see the training cell).
# df_copytest = prefix_encoding(df_test, False)
# df_copytest = prefix_prob(df_copytest)
# df_copytest = impute_decks(df_copytest)
# df_copytest = deck_prob_encoding(df_copytest)
df_copytest = title_extract(df_copytest)
df_copytest = age_title_impute(df_copytest)
# df_copytest = family_size(df_copytest)

# Scaling: min-max normalise the numeric features. The log transform
# (intended to allow use of ReLU activation) is currently disabled.
# df_test = log_column(df_test, 'Age')
df_copytest = normalize(df_copytest, 'Best Age')
# df_copytest = normalize(df_copytest, 'Fare')
df_copytest = normalize(df_copytest, 'Title')

# encoding and cleaning; drop_cols and encode_cols are defined in the
# training cell, which therefore has to run first.
df_copytest = cleanup(df_copytest, drop_cols)
df_copytest = encode(df_copytest, encode_cols)
df_copytest

Unnamed: 0,Sex,Title,Best Age,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,0.00,0.452723,0,1,0,0,0,1
1,0,0.75,0.617566,0,0,1,0,0,1
2,1,0.00,0.815377,0,1,0,0,1,0
3,1,0.00,0.353818,0,0,1,0,0,1
4,0,0.75,0.287881,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,1,0.00,0.419755,0,0,1,0,0,1
414,0,1.00,0.512066,1,0,0,1,0,0
415,1,0.00,0.505473,0,0,1,0,0,1
416,1,0.00,0.419755,0,0,1,0,0,1


# Train/Test splits

In [16]:
# Dropping 'PassengerId' is disabled: it was originally kept so it could be
# used for the submission, and it is not part of df_copy with the current
# column selection.
# df_copy = df_copy.drop(columns=['PassengerId'])

# Split the preprocessed training frame into an 80% fitting part (s_train)
# and a 20% held-out part (s_test): shuffle all rows with a fixed seed, then
# cut at the 80% position.
df_rand = df_copy.sample(frac=1, random_state=42)
split_i = int(len(df_rand) * 0.8)

s_train = df_rand.iloc[:split_i]
s_test = df_rand.iloc[split_i:]

In [17]:
# Feature matrices: everything except the 'Survived' target. The target is
# read directly from s_train / s_test where it is needed.
X_train = s_train.drop(columns=['Survived'])
X_test = s_test.drop(columns=['Survived'])

In [18]:
# bespoke adjustments based on unseen values
# X_train = X_train.drop(columns=['Deck_T'])
# X_test = X_test.drop(columns=['Deck_T'])

# Neural Net

In [19]:
# Multi-layer perceptron with three hidden layers of 20 units each.
# The trailing comments list the alternatives tried for each option.
clf = MLPClassifier(hidden_layer_sizes=(20,20,20,),
                    activation='relu', # relu, tanh, identity, logistic
                    max_iter=200,
                    solver= 'lbfgs', # 'lbfgs', 'sgd', 'adam'
                    verbose=1, # prints the optimiser log seen in the outputs below
                    random_state=42) # may revert to default for test submission

### Prediction

In [20]:
# Fit on the 80% split, then print the score on that split followed by the
# score on the held-out 20% split.
clf.fit(X_train, s_train['Survived'])
print(clf.score(X_train, s_train['Survived']))
print(clf.score(X_test, s_test['Survived']))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1061     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.02838D-01    |proj g|=  1.18589D-01

At iterate    1    f=  6.38967D-01    |proj g|=  4.45731D-02

At iterate    2    f=  5.31582D-01    |proj g|=  4.27836D-02

At iterate    3    f=  4.83501D-01    |proj g|=  3.99848D-02

At iterate    4    f=  4.79144D-01    |proj g|=  2.22100D-02

At iterate    5    f=  4.76335D-01    |proj g|=  1.99292D-02
 This problem is unconstrained.

At iterate    6    f=  4.67001D-01    |proj g|=  2.74813D-02

At iterate    7    f=  4.49594D-01    |proj g|=  3.18638D-02

At iterate    8    f=  4.46518D-01    |proj g|=  2.81953D-02

At iterate    9    f=  4.43967D-01    |proj g|=  2.16115D-02

At iterate   10    f=  4.35033D-01    |proj g|=  3.13194D-02

At iterate   11    f=  4.34891D-01    |proj g|=  1.00440D-01

At iterate   12    f=  4.30624D-01    |proj g|=  1.80827

In [21]:
# Cross-validate the network on the full preprocessed training frame.
# NOTE(review): age imputation and scaling were computed on the whole frame
# before the folds are split, so the folds are not fully independent.
cv = cross_validate(clf, df_copy.drop(columns=['Survived']), df_copy['Survived'])
cv

At iterate  148    f=  3.80622D-01    |proj g|=  7.79280D-03

At iterate  149    f=  3.80573D-01    |proj g|=  1.66086D-02

At iterate  150    f=  3.80526D-01    |proj g|=  1.66232D-02

At iterate  151    f=  3.80474D-01    |proj g|=  1.04793D-02

At iterate  152    f=  3.80456D-01    |proj g|=  1.57763D-02

At iterate  153    f=  3.80432D-01    |proj g|=  1.31465D-02

At iterate  154    f=  3.80404D-01    |proj g|=  1.25164D-02

At iterate  155    f=  3.80354D-01    |proj g|=  1.10168D-02

At iterate  156    f=  3.80264D-01    |proj g|=  1.16188D-02

At iterate  157    f=  3.80190D-01    |proj g|=  1.12421D-02

At iterate  158    f=  3.80125D-01    |proj g|=  1.08633D-02

At iterate  159    f=  3.80052D-01    |proj g|=  1.74478D-02

At iterate  160    f=  3.79383D-01    |proj g|=  1.29367D-02

At iterate  161    f=  3.79245D-01    |proj g|=  1.79304D-02

At iterate  162    f=  3.79119D-01    |proj g|=  2.10732D-02

At iterate  163    f=  3.78944D-01    |proj g|=  2.22550D-02

At itera

{'fit_time': array([6.10937476, 5.66798615, 5.00646424, 3.59839559, 3.39993024]),
 'score_time': array([0.00342083, 0.0030551 , 0.0030427 , 0.00820374, 0.08142447]),
 'test_score': array([0.78212291, 0.8258427 , 0.82022472, 0.79775281, 0.8258427 ])}

In [22]:
def mean(x):
    """Arithmetic mean of a non-empty sequence of numbers."""
    total = sum(x)
    count = len(x)
    return total / count

In [23]:
# Average accuracy over the cross-validation folds of the network.
mean(cv['test_score'])

0.8103571652752495

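### Score-yard

Each entry lists the accuracy on the training split, the accuracy on the held-out split, and, in parentheses, the approximate mean cross-validation accuracy in percent.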

Run as-is, with 10-node layers: 0.838, 0.791 (~80 cv scores)

With alt-impute: 0.843, 0.785 (~79 cv scores)

Run as-is, with 17-node layers: 0.851, 0.780 (~77 cv scores)

With Adam: 0.833, 0.763 (~78 cv scores)

With log transform on Age and Fare: 0.906, 0.757 (~76 cv scores)

With 17-node layers, log transform Age and Fare, ReLU: 0.878, 0.791 (~80 cv scores)

Same as above, include "Deck_G": 0.858, 0.763 (~80 cv scores) — current best submission

Same as above, 500 iterations: 0.881, 0.779 (~78 cv scores)

Age impute by title: 0.893, 0.768 (~79 cv scores)

Age impute by title, no deck prob: 0.893, 0.785 (~78 cv scores)

Title weight, no deck prob: 0.894, 0.768 (~76 cv scores)

20-node layers, title weight: 0.927, 0.780 (~79 cv scores)

20-node layer, log title weight: 0.908, 0.774 (~80 cv scores)

MinMax: 0.912, 0.757 (~78 cv scores)

With family size, forest: 0.994, 0.802 (~81 cv scores) 

With top columns only, forest: 0.916, 0.834 (~81 cv scores)

With top columns only, neural: 0.840, 0.821 (~81 cv scores)

# Forest

In [24]:
# Random forest used as the alternative model; seeded for reproducibility.
forest = ensemble.RandomForestClassifier(n_estimators=100, # number of trees
                                         criterion='gini', # "gini", "entropy", "log_loss"
                                         random_state=42)

### Prediction

In [25]:
# Fit the forest on the 80% split, then print the score on that split
# followed by the score on the held-out 20% split.
forest.fit(X_train, s_train['Survived'])
print(forest.score(X_train, s_train['Survived']))
print(forest.score(X_test, s_test['Survived']))

0.9157303370786517
0.8324022346368715


In [26]:
# Cross-validate the forest on the full preprocessed training frame.
# This overwrites the `cv` result of the neural-net section.
cv = cross_validate(forest, df_copy.drop(columns=['Survived']), df_copy['Survived'])
cv

{'fit_time': array([0.40556908, 0.37497449, 0.3722744 , 0.4069705 , 0.33265948]),
 'score_time': array([0.01500154, 0.01895452, 0.01975298, 0.06778765, 0.07056117]),
 'test_score': array([0.76536313, 0.83146067, 0.80337079, 0.80898876, 0.84269663])}

In [27]:
# Average accuracy over the cross-validation folds of the forest.
mean(cv['test_score'])

0.8103759964848409

# Compile

In [28]:
# Frame handed to the model for the submission predictions.
# 'PassengerId' is not a feature in the current setup, so the preprocessed
# test frame is used as is. If 'PassengerId' is carried through
# preprocessing, enable the drop below instead.
# df_predict_test = df_copytest.drop(columns=['PassengerId'])
df_predict_test = df_copytest

In [29]:
# Predict the test set with the chosen model and pair each prediction with
# its PassengerId from the raw test frame.
# `forest` was fitted on the 80% split (X_train) above, not on the full
# training frame.
# NOTE(review): this relies on df_predict_test having the same columns, in
# the same order, as X_train, and on its rows still being aligned with
# df_test. Confirm both whenever preprocessing steps are toggled.
best_model = forest
y_test_pred = best_model.predict(df_predict_test)
final_prediction = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test_pred})
final_prediction

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [30]:
# Write the submission file (currently enabled; comment out to avoid
# overwriting an existing 'my_prediction.csv' while experimenting).
final_prediction.to_csv('my_prediction.csv',index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=60e7bd97-0b1a-4e83-8250-92f6e406fd20' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>