<a href="https://colab.research.google.com/github/hainesdata/gas/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

def format_raw(df):
  """Return a copy of ``df`` with ``Cabin`` reduced to its first character.

  Values that are not strings (e.g. missing cabins read as NaN) and empty
  strings are replaced by the placeholder ``'X'``. The input frame is left
  untouched so that re-running the calling cell is idempotent.

  Args:
    df: DataFrame containing a ``Cabin`` column.

  Returns:
    A new DataFrame with the rewritten ``Cabin`` column.
  """
  out = df.copy()
  # `isinstance(...) and x` also guards against '' where x[0] would raise.
  out['Cabin'] = [x[0] if isinstance(x, str) and x else 'X' for x in out['Cabin']]
  return out

# Read both Kaggle CSVs and normalise the Cabin column in one pass.
_df_train, _df_test = (
    format_raw(pd.read_csv(csv_path)) for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Explore: column dtypes and non-null counts of the training frame.

# TODO: Cabin and Age should probably be dropped, but it is not yet known
# how much of an effect that has on training.
_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Class balance of the target: absolute counts first, then proportions.
print(
    _df_train['Survived'].value_counts(),
    _df_train['Survived'].value_counts(normalize=True),
    sep='\n',
)

0    549
1    342
Name: Survived, dtype: int64
0    0.616162
1    0.383838
Name: Survived, dtype: float64


In [None]:
# The target classes are imbalanced, so build a training frame with an equal
# proportion of survivors and non-survivors by randomly undersampling.
# NOTE(review): `train` is reassigned from `_df_train` again in the
# model-fitting cell further down, so the balanced frame built here is not
# what the models are trained on - confirm whether that is intended.

from imblearn.under_sampling import RandomUnderSampler

train = _df_train.copy()
y = 'Survived'
# Every column except the target is passed to the sampler as a feature.
x_names = [c for c in train.columns if c != y]
rus = RandomUnderSampler(random_state=42)
train_x, train_y = rus.fit_resample(train[x_names], train[y])
# Re-attach the resampled target to the resampled features.
train = pd.concat([train_x, train_y], axis=1)
train[y].value_counts(normalize=True)


0    0.5
1    0.5
Name: Survived, dtype: float64

In [None]:
# Sex-Survival relationship: mean of the target per sex.
# Relies on `y` ('Survived') assigned in the undersampling cell.
_df_train.groupby('Sex')[y].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [None]:
# Class-Survival relationship: mean of the target per passenger class.
# Relies on `y` ('Survived') assigned in the undersampling cell.
_df_train.groupby('Pclass')[y].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from math import sqrt

# LogiRegr
def preprocess_lr(_df, x, y, i, tt, test=False, make_bins=False, standardize=False):
    """Encode, impute and optionally scale a frame for logistic regression.

    Works on a copy of ``_df``. Text columns are label-encoded, float columns
    have missing values replaced by the column mean, and the selected columns
    are optionally standardized with ``scale``.

    Args:
        _df: Input DataFrame (not modified).
        x: Candidate feature column names.
        y: Target column name (or list of names); never used as a feature.
        i: Identifier column name (or list of names); dropped from the
            features in training mode only, as in the original code.
        tt: Validation fraction, forwarded to ``train_test_split(test_size=)``.
        test: If True, return the processed frame instead of a split.
        make_bins: If True, add a label-encoded ``<col>_bin`` column per
            feature. These columns are added to the frame but are not part of
            the feature columns returned in training mode.
        standardize: If True, standardize the feature columns.

    Returns:
        The processed DataFrame when ``test`` is True, otherwise the tuple
        ``(x_train, x_valid, y_train, y_valid)``.

    Note:
        Encoders and the scaler are fitted on whichever frame is passed in,
        so integer codes and scaling statistics are not shared between
        separate calls for the train and test frames.
    """
    df = _df.copy()
    # Exact-name matching: with a str target, `c not in y` is a substring test.
    target_names = {y} if isinstance(y, str) else set(y)
    id_names = {i} if isinstance(i, str) else set(i)
    excluded = target_names if test else target_names | id_names
    cols = [c for c in x if c not in excluded]
    for col in cols:
        # Decide by dtype rather than by inspecting the row labelled 1, which
        # may be missing (non-default index) or may itself hold a NaN.
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = LabelEncoder().fit_transform(df[col])
        if pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        if make_bins:
            n_bins = int(sqrt(len(df[col])))
            df[f'{col}_bin'] = LabelEncoder().fit_transform(
                pd.cut(df[col], n_bins, duplicates='drop'))
    if standardize:
        df = scale(df, cols)
    if test:
        return df
    xt, xv, yt, yv = train_test_split(df[cols], df[y], test_size=tt, random_state=42)
    return xt, xv, yt, yv

def scale(df, x):
    """Return a copy of ``df`` with the columns ``x`` standardized.

    The columns are transformed with a ``StandardScaler`` fitted on the same
    data. The input frame is not modified.

    Args:
        df: Input DataFrame.
        x: List of column names to standardize.

    Returns:
        A new DataFrame; unchanged (but still copied) when ``x`` is empty or
        the frame has no rows, cases in which the scaler cannot be fitted.
    """
    out = df.copy()
    if len(x) == 0 or len(out) == 0:
        return out
    out[x] = StandardScaler().fit_transform(out[x])
    return out

def instantiate_lr(df, x, y, i, tt, bins, std=False):
    """Fit a logistic-regression classifier and report hold-out accuracy.

    The frame is prepared and split by ``preprocess_lr``; the model is fitted
    on the training part and its accuracy on the validation part is printed.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    x_fit, x_val, y_fit, y_val = preprocess_lr(
        df, x, y, i, tt, standardize=std, make_bins=bins)
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(x_fit, y_fit)
    holdout_accuracy = accuracy_score(y_val, classifier.predict(x_val))
    print(f'Accuracy: {holdout_accuracy}')
    return classifier

def run_lr(train, test, y, i, tt, std, bins):
    """Train logistic regression on ``train`` and predict ``y`` for ``test``.

    Args:
        train: Training DataFrame containing the target and id columns.
        test: DataFrame to predict on; must contain the id column.
        y: Target column name.
        i: Identifier column name.
        tt: Validation fraction used when fitting.
        std: Whether to standardize the features.
        bins: Whether to add binned columns during preprocessing.

    Returns:
        DataFrame with the predicted ``y`` column followed by the ``i`` column.
    """
    # Exclude the target/id by exact name; `c not in 'Survived'` would be a
    # substring test and could silently drop unrelated short column names.
    reserved = set([y] if isinstance(y, str) else y) | set([i] if isinstance(i, str) else i)
    x = [c for c in train.columns if c not in reserved]
    model = instantiate_lr(train, x, y, i, tt=tt, std=std, bins=bins)
    _test = preprocess_lr(test, x, y, i, tt, standardize=std, test=True, make_bins=bins)
    y_hat = pd.DataFrame()
    y_hat[y] = model.predict(_test[x])
    # Assign ids positionally; aligning on index would yield NaN whenever
    # `test` does not carry a default 0..n-1 index.
    y_hat[i] = test[i].to_numpy()
    return y_hat

# SVM
def preprocess_svm(_df, x, y, i, tt, test=False, make_bins=False, standardize=False):
    """Encode, impute and optionally scale a frame for the SVM models.

    Works on a copy of ``_df``. Text columns are label-encoded, float columns
    have missing values replaced by the column mean, and the selected columns
    are optionally standardized with ``scale``.

    Args:
        _df: Input DataFrame (not modified).
        x: Candidate feature column names.
        y: Target column name (or list of names); never used as a feature.
        i: Identifier column name (or list of names); dropped from the
            features in training mode only, as in the original code.
        tt: Validation fraction, forwarded to ``train_test_split(test_size=)``.
        test: If True, return the processed frame instead of a split.
        make_bins: If True, add a label-encoded ``<col>_bin`` column per
            feature. These columns are added to the frame but are not part of
            the feature columns returned in training mode.
        standardize: If True, standardize the feature columns.

    Returns:
        The processed DataFrame when ``test`` is True, otherwise the tuple
        ``(x_train, x_valid, y_train, y_valid)``.

    Note:
        Encoders and the scaler are fitted on whichever frame is passed in,
        so integer codes and scaling statistics are not shared between
        separate calls for the train and test frames.
    """
    df = _df.copy()
    # Exact-name matching: with a str target, `c not in y` is a substring test.
    target_names = {y} if isinstance(y, str) else set(y)
    id_names = {i} if isinstance(i, str) else set(i)
    excluded = target_names if test else target_names | id_names
    cols = [c for c in x if c not in excluded]
    for col in cols:
        # Decide by dtype rather than by inspecting the row labelled 1, which
        # may be missing (non-default index) or may itself hold a NaN.
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = LabelEncoder().fit_transform(df[col])
        if pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        if make_bins:
            n_bins = int(sqrt(len(df[col])))
            df[f'{col}_bin'] = LabelEncoder().fit_transform(
                pd.cut(df[col], n_bins, duplicates='drop'))
    if standardize:
        df = scale(df, cols)
    if test:
        return df
    xt, xv, yt, yv = train_test_split(df[cols], df[y], test_size=tt, random_state=42)
    return xt, xv, yt, yv

def instantiate_svm(df, x, y, i, tt, bins, std=False, kind='c', kernel='linear'):
    """Fit a support-vector classifier and report hold-out accuracy.

    Args:
        df, x, y, i, tt: Forwarded to ``preprocess_svm``.
        bins: Whether to add binned columns during preprocessing.
        std: Whether to standardize the features.
        kind: ``'linear'`` (LinearSVC), ``'c'`` (SVC) or ``'nu'`` (NuSVC).
        kernel: Kernel passed to SVC / NuSVC; not used when ``kind='linear'``.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If ``kind`` is not one of the supported values.
    """
    # Validate up front so a typo fails before any preprocessing work. The
    # message no longer promises a fallback to SVC, which never happened.
    if kind not in ('linear', 'c', 'nu'):
        raise ValueError("Invalid SVM algorithm specified. Expected one of the following: 'linear', 'c', 'nu'.")
    xt, xv, yt, yv = preprocess_svm(df, x, y, i, tt, standardize=std, make_bins=bins)
    if kind == 'linear':
        model = svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=False)
    elif kind == 'c':
        model = svm.SVC(kernel=kernel)
    else:
        model = svm.NuSVC(kernel=kernel)
    model.fit(xt, yt)
    y_hat = model.predict(xv)
    acc = accuracy_score(yv, y_hat)
    print(f'Accuracy: {acc}')
    return model

def run_svm(train, test, y, i, tt, std, bins, kind='c', kernel='linear'):
    """Train an SVM on ``train`` and predict ``y`` for ``test``.

    Args:
        train: Training DataFrame containing the target and id columns.
        test: DataFrame to predict on; must contain the id column.
        y: Target column name.
        i: Identifier column name.
        tt: Validation fraction used when fitting.
        std: Whether to standardize the features.
        bins: Whether to add binned columns during preprocessing.
        kind, kernel: Forwarded to ``instantiate_svm``.

    Returns:
        DataFrame with the predicted ``y`` column followed by the ``i`` column.
    """
    # Exclude the target/id by exact name; `c not in 'Survived'` would be a
    # substring test and could silently drop unrelated short column names.
    reserved = set([y] if isinstance(y, str) else y) | set([i] if isinstance(i, str) else i)
    x = [c for c in train.columns if c not in reserved]
    model = instantiate_svm(train, x, y, i, tt, std=std, bins=bins, kind=kind, kernel=kernel)
    _test = preprocess_svm(test, x, y, i, tt, standardize=std, test=True, make_bins=bins)
    y_hat = pd.DataFrame()
    y_hat[y] = model.predict(_test[x])
    # Assign ids positionally; aligning on index would yield NaN whenever
    # `test` does not carry a default 0..n-1 index.
    y_hat[i] = test[i].to_numpy()
    return y_hat

# RF approach
def preprocess_rf(df, x, y, i, tt, test=False, make_bins=False, standardize=False):
    """Encode, impute and optionally scale a frame for the random forest.

    Works on a copy of ``df``: the caller's frame is no longer encoded,
    imputed or scaled in place, so frames shared with other models stay raw.
    Text columns are label-encoded, float columns have missing values
    replaced by the column mean, and the selected columns are optionally
    standardized with ``scale``.

    Args:
        df: Input DataFrame (not modified).
        x: Candidate feature column names.
        y: Target column name (or list of names); never used as a feature.
        i: Identifier column name (or list of names); dropped from the
            features in training mode only, as in the original code.
        tt: Validation fraction, forwarded to ``train_test_split(test_size=)``.
        test: If True, return the processed frame instead of a split.
        make_bins: If True, add a label-encoded ``<col>_bin`` column per
            feature. These columns are added to the frame but are not part of
            the feature columns returned in training mode.
        standardize: If True, standardize the feature columns.

    Returns:
        The processed DataFrame when ``test`` is True, otherwise the tuple
        ``(x_train, x_valid, y_train, y_valid)``.

    Note:
        Encoders and the scaler are fitted on whichever frame is passed in,
        so integer codes and scaling statistics are not shared between
        separate calls for the train and test frames.
    """
    work = df.copy()
    # Exact-name matching: with a str target, `c not in y` is a substring test.
    target_names = {y} if isinstance(y, str) else set(y)
    id_names = {i} if isinstance(i, str) else set(i)
    excluded = target_names if test else target_names | id_names
    cols = [c for c in x if c not in excluded]
    for col in cols:
        # Decide by dtype rather than by inspecting the row labelled 1, which
        # may be missing (non-default index) or may itself hold a NaN.
        if pd.api.types.is_object_dtype(work[col]) or pd.api.types.is_string_dtype(work[col]):
            work[col] = LabelEncoder().fit_transform(work[col])
        if pd.api.types.is_float_dtype(work[col]):
            work[col] = work[col].fillna(work[col].mean())
        if make_bins:
            n_bins = int(sqrt(len(work[col])))
            work[f'{col}_bin'] = LabelEncoder().fit_transform(
                pd.cut(work[col], n_bins, duplicates='drop'))
    if standardize:
        work = scale(work, cols)
    if test:
        return work
    xt, xv, yt, yv = train_test_split(work[cols], work[y], test_size=tt, random_state=42)
    return xt, xv, yt, yv

def instantiate_rf(df, x, y, i, tt, trees, depth, split, std, bins, random_state=42):
    """Fit a random-forest classifier and report hold-out accuracy.

    Args:
        df, x, y, i, tt: Forwarded to ``preprocess_rf``.
        trees: Number of trees (``n_estimators``).
        depth: Maximum tree depth (``max_depth``).
        split: Minimum samples required to split a node (``min_samples_split``).
        std: Whether to standardize the features.
        bins: Whether to add binned columns during preprocessing.
        random_state: Seed for the forest. Defaults to 42, the same seed the
            train/validation split uses, so repeated runs give the same
            model; pass ``None`` for the previous unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    xt, xv, yt, yv = preprocess_rf(df, x, y, i, tt, standardize=std, make_bins=bins)
    model = RandomForestClassifier(
        n_estimators=trees,
        max_depth=depth,
        max_features='sqrt',
        min_samples_split=split,
        random_state=random_state,
    )
    model.fit(xt, yt)
    y_hat = model.predict(xv)
    acc = accuracy_score(yv, y_hat)
    print(f'Accuracy: {acc}')
    return model

def run_rf(train, test, y, i, tt, trees, depth, split, std=False, bins=False):
    """Train a random forest on ``train`` and predict ``y`` for ``test``.

    Args:
        train: Training DataFrame containing the target and id columns.
        test: DataFrame to predict on; must contain the id column.
        y: Target column name.
        i: Identifier column name.
        tt: Validation fraction used when fitting.
        trees, depth, split: Forest hyper-parameters, forwarded to
            ``instantiate_rf``.
        std: Whether to standardize the features.
        bins: Whether to add binned columns during preprocessing.

    Returns:
        DataFrame with the predicted ``y`` column followed by the ``i`` column.
    """
    # Exclude the target/id by exact name; `c not in 'Survived'` would be a
    # substring test and could silently drop unrelated short column names.
    reserved = set([y] if isinstance(y, str) else y) | set([i] if isinstance(i, str) else i)
    x = [c for c in train.columns if c not in reserved]
    model = instantiate_rf(train, x, y, i, tt, trees, depth, split, std=std, bins=bins)
    _test = preprocess_rf(test, x, y, i, tt, standardize=std, test=True, make_bins=bins)
    y_hat = pd.DataFrame()
    y_hat[y] = model.predict(_test[x])
    # Assign ids positionally; aligning on index would yield NaN whenever
    # `test` does not carry a default 0..n-1 index.
    y_hat[i] = test[i].to_numpy()
    return y_hat

# Start from copies of the frames loaded in the first cell.
# NOTE(review): this reassigns `train`, so the undersampled frame built in the
# earlier cell is discarded and the models see the full training data.
train = _df_train.copy()
test = _df_test.copy()

y = 'Survived'      # target column
i = 'PassengerId'   # identifier column carried through to the submissions

# Each run_* call prints its hold-out accuracy and returns test predictions,
# which are also written to a per-model CSV.
pred_rf = run_rf(train, test, y, i, tt=0.05, std=True, bins=False, trees=100, depth=15,split=2)
pred_rf.to_csv('result_rf.csv', index=False)

# kind='linear' selects LinearSVC, for which the `kernel` argument is not used.
pred_svm = run_svm(train, test, y, i, tt=0.05, std=False, bins=False, kind='linear', kernel='rbf')
pred_svm.to_csv('result_svm.csv', index=False)

pred_lr = run_lr(train, test, y, i, tt=0.05, std=False, bins=False)
pred_lr.to_csv('result_lr.csv', index=False)

# Ensemble: join the three prediction frames on the id column, so the merged
# frame holds one prediction column per model (pandas suffixes the clashing
# names of the first merge).
preds = pd.merge(pred_rf, pred_lr, on=i)
preds = pd.merge(preds, pred_svm, on=i)
preds.set_index(i, inplace=True)
# Row-wise mean of the 0/1 predictions; thresholding it at 0.5 below amounts
# to a majority vote of the three models.
preds[y] = preds.mean(axis=1)
preds = preds[y]
preds = preds.reset_index()
preds[y] = preds[y].apply(lambda y_i: 0 if y_i < 0.5 else 1)
preds.to_csv('result_ensemble.csv', index=False)

Accuracy: 0.7555555555555555
Accuracy: 0.8222222222222222
Accuracy: 0.8222222222222222


In [None]:
# DNN approach
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LeakyReLU
from keras.callbacks import TensorBoard
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD
from keras.initializers import glorot_normal, Zeros
from keras import regularizers

from IPython.display import clear_output

import matplotlib.pyplot as plt
import numpy as np

def plot_history(history):
    """Draw the loss and accuracy curves stored in ``history.history``.

    Keys containing ``'loss'`` / ``'acc'`` are plotted; keys that also contain
    ``'val'`` are treated as validation curves. Training curves are blue and
    validation curves green, and each legend entry shows the final value.
    Prints a message and returns early when no training-loss key exists.
    """
    records = history.history
    keys = list(records.keys())
    train_loss_keys = [k for k in keys if 'loss' in k and 'val' not in k]
    valid_loss_keys = [k for k in keys if 'loss' in k and 'val' in k]
    train_acc_keys = [k for k in keys if 'acc' in k and 'val' not in k]
    valid_acc_keys = [k for k in keys if 'acc' in k and 'val' in k]

    if not train_loss_keys:
        print('Loss is missing in history')
        return

    # One x value per recorded epoch, numbered from 1.
    epochs = range(1, len(records[train_loss_keys[0]]) + 1)

    def draw_curves(curve_keys, colour, label_prefix):
        # Plot every series in `curve_keys`, labelling it with its last value.
        for key in curve_keys:
            series = records[key]
            plt.plot(epochs, series, colour,
                     label=label_prefix + format(series[-1], '.5f') + ')')

    def decorate(title):
        # Title, axis labels and legend for the currently active figure.
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(title)
        plt.legend()

    # Figure 1: loss
    plt.figure(1)
    draw_curves(train_loss_keys, 'b', 'Training loss (')
    draw_curves(valid_loss_keys, 'g', 'Validation loss (')
    decorate('Loss')

    # Figure 2: accuracy
    plt.figure(2)
    draw_curves(train_acc_keys, 'b', 'Training accuracy (')
    draw_curves(valid_acc_keys, 'g', 'Validation accuracy (')
    decorate('Accuracy')
    plt.show()

def scale(df, numerics):
    """Return a copy of ``df`` with the ``numerics`` columns standardized.

    The columns are transformed with a ``StandardScaler`` fitted on the same
    data. The input frame is not modified, which also avoids writing into a
    slice of another frame (e.g. the output of ``train_test_split``).

    Args:
        df: Input DataFrame.
        numerics: List of column names to standardize.

    Returns:
        A new DataFrame; unchanged (but still copied) when ``numerics`` is
        empty or the frame has no rows, cases in which the scaler cannot be
        fitted.
    """
    out = df.copy()
    if len(numerics) == 0 or len(out) == 0:
        return out
    out[numerics] = StandardScaler().fit_transform(out[numerics])
    return out

def preprocess(_df, x, y, N, y_cat, mode='one-hot'):
    """Prepare training and validation data for the dense network.

    Rows with any missing value are dropped. Every feature in ``x`` that is
    not listed in ``N`` is treated as categorical and replaced by integer
    class ids, keeping one column per feature so the result matches the
    ``input_shape=(len(params),)`` used by ``make_model``. (The previous code
    tried to store the 2-D ``to_categorical`` output in a single column,
    which pandas rejects for more than one class.)

    Args:
        _df: Input DataFrame (not modified).
        x: Feature column names.
        y: Target column name.
        N: Names of the numeric features to standardize.
        y_cat: If True, the target is encoded to integer class ids.
        mode: Accepted for backward compatibility; not used.

    Returns:
        ``(x_train, x_valid, y_train, y_valid)`` from a 90/10 split seeded
        with 42. Numeric features of both parts are standardized with
        statistics fitted on the training part only.
    """
    df = _df.copy().dropna()
    for col in (u for u in x if u not in N):
        df[col] = LabelEncoder().fit_transform(df[col])
    if y_cat:
        df[y] = LabelEncoder().fit_transform(df[y])
    x_t, x_v, y_t, y_v = train_test_split(
        df[x], df[y], test_size=0.1, random_state=42)
    # Copy so the assignments below do not write into slices of `df`.
    x_t, x_v = x_t.copy(), x_v.copy()
    if len(N) > 0:
        # Fit on the training split only and reuse it for validation, so both
        # parts share one scale and no validation statistics leak into it.
        scaler = StandardScaler().fit(x_t[N])
        x_t[N] = scaler.transform(x_t[N])
        x_v[N] = scaler.transform(x_v[N])
    return x_t, x_v, y_t, y_v

def preprocess_test(_df, x, N):
    """Turn the feature columns of a test frame into a float32 array.

    Text columns are replaced by integer class ids (one column per feature,
    matching the network's input width), missing values in float columns are
    filled with the column mean, and the columns in ``N`` are standardized.

    Args:
        _df: Input DataFrame (not modified).
        x: Feature column names, in the order the model expects.
        N: Names of the numeric features to standardize.

    Returns:
        ``numpy.ndarray`` of dtype float32 with one column per name in ``x``.

    Note:
        The encoder and scaler are fitted on this frame alone, so codes and
        scaling statistics are not guaranteed to match those used in training.
    """
    # Copy the selection: writing into `_df[x]` directly would modify a slice.
    df = _df[x].copy()
    for col in x:
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = LabelEncoder().fit_transform(df[col])
        # The old check `type(df[i][1] is float)` was always truthy because of
        # a misplaced parenthesis; test the column dtype instead.
        if pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
    df = scale(df, N)
    return np.array(df).astype('float32')

def make_model(df, params, outs, numerics, y_is_categorical=True, epochs=128, batch=32, auto_epoch=False):
    """Build, compile and train the dense binary classifier.

    The data is prepared by ``preprocess``. The network has two hidden ReLU
    layers (8 and 64 units) and a single sigmoid output, and is trained with
    SGD on binary cross-entropy.

    Args:
        df: Training DataFrame.
        params: Feature column names; their count sets the input width.
        outs: Target column name.
        numerics: Numeric feature names, forwarded to ``preprocess``.
        y_is_categorical: Forwarded to ``preprocess`` as ``y_cat``.
        epochs: Number of training epochs (ignored when ``auto_epoch``).
        batch: Mini-batch size.
        auto_epoch: If True, use ``len(df) // batch`` as the epoch count.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.
    """
    x_train, x_valid, y_train, y_valid = preprocess(df, params, outs, numerics, y_is_categorical)
    n_epochs = len(df) // batch if auto_epoch else epochs

    network = Sequential()
    stack = (
        Dense(8, activation='relu', input_shape=(len(params),),
              kernel_initializer=glorot_normal, bias_initializer=Zeros),
        Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-5)),
        Dense(1, activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer=SGD(learning_rate=0.001),
        metrics=['accuracy'],
    )

    training_log = network.fit(
        x_train,
        y_train,
        batch_size=batch,
        epochs=n_epochs,
        verbose='auto',
        validation_data=(x_valid, y_valid),
    )
    return network, training_log

# Feature configuration for the DNN cells below.
nums = ['Age', 'Pclass', 'SibSp']                   # features passed as `numerics` (standardized)
x = ['Age', 'Pclass', 'SibSp', 'Sex', 'Cabin']      # all model inputs, in order
y = 'Survived'                                      # target column

In [None]:
# Train on the frame loaded in the first cell. `df_train` was never defined
# in this notebook, so the call raised NameError on a fresh kernel.
model, history = make_model(_df_train, params=x, outs=y, numerics=nums, batch=32, epochs=300)
plot_history(history)

In [None]:
# Predict on the test frame loaded in the first cell (`df_test` was never
# defined in this notebook).
X = preprocess_test(_df_test, x, nums)
y_hat = model.predict(X).tolist()
result = pd.DataFrame()
result['PassengerId'] = _df_test['PassengerId']
# Each prediction is a one-element list holding the sigmoid output; threshold
# it at 0.5. The loop variable is `p` so it does not shadow the feature list `x`.
result['Survived'] = [1 if p > 0.5 else 0 for [p] in y_hat]

In [None]:
# Write the DNN predictions (PassengerId, Survived) without the index column.
result.to_csv('results.csv', index=False)