In [46]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import random
from itertools import combinations

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [47]:
# Turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [48]:
# Read in resources
train = pd.read_csv(Path("./Resources/train.csv"))
test = pd.read_csv(Path("./Resources/test.csv"))
test_survived = pd.read_csv(Path("./Resources/gender_submission.csv"))

In [49]:
# add back survived column to test dataset
joined = pd.merge(test, test_survived, how='outer', on="PassengerId")
joined.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1


In [50]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [51]:
# Observe missing values for Age, Cabin, Embarked
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [52]:
# few missing values for Embarked; no significant loss to drop
train = train[train['Embarked'].notna()]
test = joined[joined['Embarked'].notna()]

# str astypes, just in case
for df in [train, joined]:
  df['Ticket'] = df['Ticket'].astype(str)
  df['Embarked'] = df['Embarked'].astype(str)
  df['Name'] = df['Name'].astype(str)
  df['Sex'] = df['Sex'].astype(str)

In [53]:
# fill missing age values
for df in [train, joined]:
    ages = df[df['Age'].notna()]['Age'].to_list()
    # print(np.mean(ages), np.std(ages))
    missing_age_idxs = df[df['Age'].isnull()].index.to_list()
    for idx in missing_age_idxs:
        df['Age'][idx] = random.choice(ages)

    filled = df['Age'].loc[missing_age_idxs].to_list()
    # print(np.mean(filled), np.std(filled))

# print(' ')
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.6+ KB


In [54]:
# convert SibSp and Parch to two new columns: num_related and has_related
for df in [train, joined]:
    df['num_related'] = df['SibSp'] + df['Parch']
    df['has_related'] = df['num_related'] > 0

# check results
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,num_related,has_related
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,True


In [55]:
# convert Ticket to has_special_ticket; check if the Ticket value has any non-number characters: true if yes, false if no
for df in [train, joined]:
    df['has_special_ticket'] = df['Ticket'].str.extract(r"([A-Za-z])")
    df['has_special_ticket'].fillna(False, inplace=True)
    df['has_special_ticket'] = df['has_special_ticket'].astype(bool)
    df.drop('Ticket', inplace=True, axis=1)

# check results
train['has_special_ticket'].value_counts()

has_special_ticket
False    659
True     230
Name: count, dtype: int64

In [56]:
# merge premade train/test sets to see if we
# can get distinct results with our own split
full_set = pd.concat([train, joined])
full_set = full_set[full_set['Fare'].notna()]
full_set = full_set.sample(frac=1).reset_index(drop=True)
full_set.to_csv("./Resources/Titanic_full_dataset.csv", index=False)
full_set.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,num_related,has_related,has_special_ticket
0,190,0,3,"Turcin, Mr. Stjepan",male,36.0,0,0,7.8958,,S,0,False,False
1,633,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,30.5,B50,C,0,False,False
2,588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60.0,1,1,79.2,B41,C,2,True,False


In [57]:
full_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PassengerId         1306 non-null   int64  
 1   Survived            1306 non-null   int64  
 2   Pclass              1306 non-null   int64  
 3   Name                1306 non-null   object 
 4   Sex                 1306 non-null   object 
 5   Age                 1306 non-null   float64
 6   SibSp               1306 non-null   int64  
 7   Parch               1306 non-null   int64  
 8   Fare                1306 non-null   float64
 9   Cabin               293 non-null    object 
 10  Embarked            1306 non-null   object 
 11  num_related         1306 non-null   int64  
 12  has_related         1306 non-null   bool   
 13  has_special_ticket  1306 non-null   bool   
dtypes: bool(2), float64(2), int64(6), object(4)
memory usage: 125.1+ KB


**K Nearest Neighbors Classifier**

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
# create a K Nearest Neighbors Classifier Model
knn = KNeighborsClassifier(n_neighbors=5)

In [59]:
# generic model evaluation method
def ModelEvaluation(model, X_train, y_train, X_test, y_test, verbose=False):
    # fit the model and predict on test and train datasets
    fitted = model.fit(X_train, y_train)
    train_predict = fitted.predict(X_train)
    test_predict = fitted.predict(X_test)

    # allow users to obtain classification report data
    rep1 = classification_report(y_train, train_predict, output_dict=True)
    rep2 = classification_report(y_test, test_predict, output_dict=True)

    if verbose == True:
      # output performance reports
      print("Training Classification Report:")
      print(classification_report(y_train, train_predict))

      print("Testing Classification Report:")
      print(classification_report(y_test, test_predict))

    return rep1, rep2

In [60]:
# general cleanup method to work with new format of full_set
def FinishCleanup(data_frame):
    data_frame.drop('Name', inplace=True, axis=1)
    data_frame.drop('PassengerId', inplace=True, axis=1)
    data_frame.drop('Cabin', inplace=True, axis=1)

In [61]:
# this will take a single full titanic-related dataset and
# create scaled train and test sets; a specific subset of columns
# can be specified to reduce the features; an alternate target column
# can be specified for further
def FormatTitanicData(data_frame, dummies, target='Survived', columns = []):
    # if not the default, get a subset of all columns
    if columns != []:
        curr_frame = data_frame[columns]
        # if a column is not in the passed set, do not dummy it
        dummies = [c for c in dummies if c in columns]
    else:
        curr_frame = data_frame

    # split target out from data
    X = curr_frame.drop(target, axis=1)
    y = curr_frame[target]

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # dummy categorical columns
    dummy_train = pd.get_dummies(X_train, columns=dummies)
    dummy_test = pd.get_dummies(X_test, columns=dummies)

    # scale columns
    scaler = StandardScaler()
    train_scaler = scaler.fit(dummy_train)
    train_scaled = train_scaler.transform(dummy_train)
    test_scaler = scaler.fit(dummy_test)
    test_scaled = test_scaler.transform(dummy_test)

    return train_scaled, y_train, test_scaled, y_test

In [64]:
full_set2 = pd.read_csv(Path("./Resources/Titanic_full_dataset.csv"))
FinishCleanup(full_set2)
full_set2.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,num_related,has_related,has_special_ticket
0,0,3,male,36.0,0,0,7.8958,S,0,False,False
1,1,1,male,32.0,0,0,30.5,C,0,False,False


In [65]:
dummies = ['Pclass', 'Embarked', 'Sex']
a, b, c, d = FormatTitanicData(full_set2, dummies)
evals = ModelEvaluation(knn, a, b, c, d)

In [67]:
perf_dict = {}
col_set = []
non_target_cols = full_set2.columns.to_list()[1:]

# get all sets of columns that arent 1 column or all columns
for i in range(2, len(non_target_cols) - 1):
    # documentation found https://docs.python.org/3/library/itertools.html#itertools.combinations
    combs = combinations(non_target_cols, i)
    lists = [list(c) for c in combs]
    for l in lists:
      l.append('Survived') # always add back the target
      col_set.append(l)

# for each set of columns obtained above, evaluate its performance
for cols in col_set:
    dummies = ['Pclass', 'Embarked', 'Sex']
    # print(cols)
    X_train, y_train, X_test, y_test = FormatTitanicData(full_set2, dummies, columns=cols)
    eval1, eval2 = ModelEvaluation(knn, X_train, y_train, X_test, y_test)
    avg_acc = (eval1['accuracy'] + eval2['accuracy']) / 2
    perf = round(avg_acc * 100, 3)

    perf_dict[str(cols)] = perf

# check the top 5 performers to see if there are any commonly present features;
# Survived will always be present
for key in sorted(perf_dict, key=perf_dict.get, reverse=True)[:10]:
    print(f"{key}: {perf_dict[key]}")

['Pclass', 'Sex', 'Fare', 'Embarked', 'num_related', 'has_related', 'Survived']: 88.367
['Pclass', 'Sex', 'Age', 'SibSp', 'Embarked', 'Survived']: 87.907
['Pclass', 'Sex', 'Fare', 'Embarked', 'Survived']: 87.806
['Sex', 'Age', 'SibSp', 'num_related', 'Survived']: 87.754
['Pclass', 'Sex', 'Parch', 'Fare', 'num_related', 'has_special_ticket', 'Survived']: 87.602
['Sex', 'Age', 'SibSp', 'Embarked', 'num_related', 'has_special_ticket', 'Survived']: 87.601
['Sex', 'Age', 'num_related', 'has_special_ticket', 'Survived']: 87.6
['Sex', 'Age', 'SibSp', 'num_related', 'has_special_ticket', 'Survived']: 87.5
['Sex', 'Age', 'SibSp', 'Fare', 'num_related', 'has_related', 'has_special_ticket', 'Survived']: 87.5
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'has_special_ticket', 'Survived']: 87.5


In [68]:
counts = {}
for key in sorted(perf_dict, key=perf_dict.get, reverse=True)[:8]:
    key = key.replace('[', '')
    key = key.replace(' ', '')
    key = key.replace(']', '')
    curr = key.split(',')
    for i in curr:
      if i in counts.keys():
        counts[i] = counts[i] + 1
      else:
        counts[i] = 1
counts

{"'Pclass'": 4,
 "'Sex'": 8,
 "'Fare'": 3,
 "'Embarked'": 4,
 "'num_related'": 6,
 "'has_related'": 1,
 "'Survived'": 8,
 "'Age'": 5,
 "'SibSp'": 4,
 "'Parch'": 1,
 "'has_special_ticket'": 4}