# Titanic: Machine Learning from Disaster (TensorFlow Linear Classifier)

This is my first attempt at building a machine learning model to predict passenger survival for Kaggle's Titanic competition.

The model implements a linear classifier in TensorFlow.



### Setup and Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from google.colab import files
# Upload files into the Colab session; presumably train.csv and test.csv,
# which the data cell reads from the working directory (confirm when running).
files.upload()

### Data

In [None]:
# Load the training and test CSV files into pandas dataframes
dftrain, dfeval = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preprocess the dataframes
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
  """Return a cleaned copy of a Titanic dataframe without modifying the input.

  Steps:
    * derive 'Deck' from the first character of 'Cabin' ('M' when missing),
    * drop the 'Name', 'Ticket' and 'Cabin' columns,
    * fill missing 'Age' and 'Fare' with the mean of that column in `df`,
    * fill missing 'Embarked' with 'M'.

  Args:
    df: Dataframe containing at least the columns 'Name', 'Ticket', 'Cabin',
      'Age', 'Fare' and 'Embarked'.

  Returns:
    A new dataframe; `df` itself is left untouched.
  """
  # Compute Deck before dropping Cabin, but attach it only to the new frame
  # returned by drop() so the caller's dataframe does not gain a column.
  deck = df['Cabin'].str.get(0).fillna('M')
  df = df.drop(columns=['Name', 'Ticket', 'Cabin'])
  df['Deck'] = deck
  df['Age'] = df['Age'].fillna(df['Age'].mean())
  df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
  df['Embarked'] = df['Embarked'].fillna('M')
  return df

# Training set: clean the features, then split off the 'Survived' label.
dftrain = preprocess(dftrain)
y_train = dftrain.pop('Survived')

# Test set: clean the features and set the passenger ids aside.
dfeval = preprocess(dfeval)
ids = dfeval.pop('PassengerId')

# This code adds a NaN 'Survived' column to the test set and pops it again,
# which yields a placeholder label series with the same index as dfeval.
dfeval['Survived'] = np.nan
y_eval = dfeval.pop('Survived')

In [None]:
# Preview the first 10 rows of the training features after preprocessing
# (the 'Survived' label was already popped into y_train).
dftrain.head(10)

Plot some summary statistics of the training data.

In [None]:
# Histogram of ticket fares in the training set (80 bins)
dftrain['Fare'].hist(bins=80).set_xlabel('Fare')

In [None]:
# Share of passengers per sex in the training set
dftrain['Sex'].value_counts().plot(kind='pie')

In [None]:
# Share of passengers per ticket class in the training set
dftrain['Pclass'].value_counts().plot(kind='pie')

In [None]:
# Survival rate per sex. The mean of the 'Survived' label is a fraction,
# so scale it by 100 to match the '% survive' axis label.
(
    pd.concat([dftrain, y_train], axis=1)
    .groupby('Sex')['Survived']
    .mean()
    .mul(100)
    .plot(kind='barh')
    .set_xlabel('% survive')
)

In [None]:
# Survival rate per port of embarkation. The mean of the 'Survived' label is
# a fraction, so scale it by 100 to match the '% survive' axis label.
(
    pd.concat([dftrain, y_train], axis=1)
    .groupby('Embarked')['Survived']
    .mean()
    .mul(100)
    .plot(kind='barh')
    .set_xlabel('% survive')
)

In [None]:
# Survival rate per deck. The mean of the 'Survived' label is a fraction,
# so scale it by 100 to match the '% survive' axis label.
(
    pd.concat([dftrain, y_train], axis=1)
    .groupby('Deck')['Survived']
    .mean()
    .mul(100)
    .plot(kind='barh')
    .set_xlabel('% survive')
)

### Feature Columns

Set up the relevant features that will be used in our linear classifier model

In [None]:
# Columns treated as categories versus columns fed in as plain numbers
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
numeric_columns = ['Age', 'Fare']

# One vocabulary-list column per categorical feature; the vocabulary is the
# set of distinct values observed for that feature in the training data.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        name, dftrain[name].unique())
    for name in categorical_columns
]

# Numeric features are appended after the categorical ones, as float32.
feature_columns += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numeric_columns
]

### Input Function

Construct input functions from our dataframes so the data can be fed to TensorFlow. The linear classifier is trained on mini-batches, so the input function optionally shuffles the entries, groups them into batches, and repeats the dataset for the requested number of epochs.

In [None]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Build a zero-argument function that yields a tf.data.Dataset.

  Args:
    data_df: Feature dataframe; converted to a dict of columns.
    label_df: Labels sliced alongside the rows of `data_df`.
    num_epochs: Number of times the batched dataset is repeated.
    shuffle: Whether to shuffle the elements (buffer of 1000) before batching.
    batch_size: Number of elements per batch.

  Returns:
    A callable that builds and returns the dataset each time it is invoked.
  """
  def input_function():
    features = dict(data_df)
    dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
      dataset = dataset.shuffle(1000)
    # Batch first, then repeat the batched dataset num_epochs times.
    return dataset.batch(batch_size).repeat(num_epochs)
  return input_function

# Training input: defaults apply (shuffled, 10 epochs, batches of 32).
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Prediction input: a single ordered pass so outputs line up with the test rows.
eval_input_fn = make_input_fn(
    data_df=dfeval,
    label_df=y_eval,
    num_epochs=1,
    shuffle=False,
)

Train the linear classifier.

In [None]:
# Build a linear classifier estimator over the feature columns defined above.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
# Fit the estimator on the data produced by the training input function.
linear_est.train(input_fn=train_input_fn)

Make predictions on the test data with our trained linear classifier.

In [None]:
# Run the trained estimator over the test input and materialise every prediction.
predictions = list(linear_est.predict(eval_input_fn))
# Keep only the predicted class id from each prediction dict.
class_list = [prediction['class_ids'][0] for prediction in predictions]
len(class_list)

Combine the test passenger IDs and the predictions into a single CSV file for submission.

In [None]:
# Pair each test passenger id with its predicted class.
# reset_index returns a new frame (it does not modify in place), so its
# result must be kept for the index reset to take effect.
submission = pd.DataFrame(ids).reset_index(drop=True)
submission['Survived'] = class_list
# index=False keeps the dataframe index out of the CSV.
submission.to_csv('titanicLC_submission.csv', index=False)

# Download the generated file from the Colab session.
files.download('titanicLC_submission.csv')