## Data Cleaning and Preprocessing

We import the original `train.csv` and `test.csv` files and use `PassengerId` as the index column.

The `clean_data` function then performs the following:

* Drops the `Name`, `Ticket` and `Cabin` columns which we currently are not using.
* Modifies `Fare` column to indicate difference from the average fare paid by class.
* Imputes median values (based on sex and passenger class) to null values in the `Age` column.
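* Dummy variables for the `Sex`, `Pclass` and `Embarked` features are not created by `clean_data`; each model cell below encodes these columns itself (with `pd.get_dummies` or `LabelEncoder`).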

The cleaned data is saved to `cl_train.csv` and `cl_test.csv`.

In [26]:
import pandas as pd


def clean_data(df):
    """Return a cleaned copy of a raw Titanic passenger frame.

    The input frame is not modified. The returned frame has:

    * the ``Name``, ``Ticket`` and ``Cabin`` columns removed;
    * ``Fare`` replaced by its offset from the mean fare of the passenger's
      ``Pclass``, with missing fares set to 0 (i.e. the class average);
    * missing ``Age`` values filled with the median age of the passengers
      sharing the same ``Sex`` and ``Pclass``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Name``, ``Ticket``,
        ``Cabin``, ``Pclass``, ``Sex``, ``Age`` and ``Fare``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``.
    """
    # drop unused columns (returns a new frame, so the caller's data is safe)
    df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

    # normalizes fare by passenger class: null values receive zero
    # `transform` yields a result aligned to the original row index, whereas
    # `groupby(...).apply(...)` prepends the group key to the index on
    # pandas >= 2.0 and can no longer be assigned back to the column.
    class_mean_fare = df.groupby('Pclass')['Fare'].transform('mean')
    df['Fare'] = (df['Fare'] - class_mean_fare).fillna(0)

    # fill null ages with median age (based on sex, passenger class)
    group_median_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)

    # return cleaned data
    return df

# Read each raw Kaggle file (first row is the header, PassengerId becomes the
# row index) and pass it straight through clean_data.
train = clean_data(
    pd.read_csv('train.csv', header=0, index_col='PassengerId')
)
test = clean_data(
    pd.read_csv('test.csv', header=0, index_col='PassengerId')
)

# Persist the cleaned frames for the modelling cells; the PassengerId index is
# written as the first CSV column.
train.to_csv('cl_train.csv')
test.to_csv('cl_test.csv')

### Random Forest Model

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Load the cleaned training set written by the preprocessing step.
train = pd.read_csv('cl_train.csv', header=0, index_col='PassengerId')

# impute missing 'Embarked' values with 'S' (most common)
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected Series: that chained in-place form is deprecated in pandas
# and, with Copy-on-Write enabled, leaves `train` unchanged.
train['Embarked'] = train['Embarked'].fillna('S')

# encode categorical variables as integer codes
# The same encoder object is refitted per column, so afterwards `le` only
# holds the classes of 'Embarked'.
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
train['Embarked'] = le.fit_transform(train['Embarked'])

# hold out part of the training data for validation (seeded split)
X = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=65)

# random forest; seeded like the split so the reported scores are repeatable
clf = RandomForestClassifier(n_estimators=40, max_depth=6, random_state=65)

# prediction score on the fitted portion and on the held-out portion
clf.fit(X_train, y_train)
print('Random Forest Train Score: %s' % clf.score(X_train, y_train))
print('Random Forest CV Score: %s' % clf.score(X_test, y_test))

# per-feature importances, labelled with the column names (cell display value)
pd.Series(clf.feature_importances_, index=X_train.columns)

Random Forest Train Score: 0.86377245509
Random Forest CV Score: 0.816143497758


Pclass      0.177920
Sex         0.398279
Age         0.159534
SibSp       0.049534
Parch       0.044860
Fare        0.131473
Embarked    0.038400
dtype: float64

### Logistic Regression Model


In [105]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

# Load the cleaned training set written by the preprocessing step.
train = pd.read_csv('cl_train.csv', header=0, index_col='PassengerId')

# create dummy (one-hot) variables for the categorical columns
train = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])

# hold out part of the training data for validation (seeded split)
X = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53)

# feature scaling: statistics are learned on the fitted portion only and then
# reused for the held-out portion, so no held-out information leaks in
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# degree-3 polynomial expansion followed by logistic regression (C=0.01)
polynomial_features = PolynomialFeatures(degree=3, include_bias=True)
logistic_regression = LogisticRegression(C=0.01)
pipeline = Pipeline([('polynomial_features', polynomial_features), ('logistic_regression', logistic_regression)])

# prediction score
# The two lines are labelled separately (train vs. held-out) so the output can
# be told apart, matching the labels used for the random forest scores.
pipeline.fit(X_train, y_train)
print('Logistic Regression Train Score: %s' % pipeline.score(X_train, y_train))
print('Logistic Regression CV Score: %s' % pipeline.score(X_test, y_test))

Logistic Regression Score: 0.847305389222
Logistic Regression Score: 0.80269058296


### Final Logistic Regression Model

* Import the cleaned Titanic data from `cl_train.csv` and `cl_test.csv`.
* Normalize features by mean and standard deviation.
* Create polynomial features with $ d = 2 $.
* Logistic regression using $L^2$ regularization with $ C = 1 $.

In [122]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Cleaned inputs produced by the preprocessing step.
train = pd.read_csv('cl_train.csv', header=0, index_col='PassengerId')
test = pd.read_csv('cl_test.csv', header=0, index_col='PassengerId')

# Separate the target from the training features.
y_train = train['Survived']
X_train = train.drop(columns='Survived')

# Stack train and test rows so both receive the same dummy columns;
# tr_len remembers where the training rows end.
tr_len = len(X_train)
df = pd.concat([X_train, test])
df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'])

# Undo the stacking by position.
X_train = df.iloc[:tr_len]
test = df.iloc[tr_len:]

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(test)

# Degree-2 polynomial expansion feeding a logistic regression with C = 1.
polynomial_features = PolynomialFeatures(degree=2, include_bias=True)
logistic_regression = LogisticRegression(C=1.0)
pipeline = Pipeline(
    steps=[
        ('polynomial_features', polynomial_features),
        ('logistic_regression', logistic_regression),
    ]
)

# Fit on the training rows, then predict survival for the test rows.
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

# Write "PassengerId,Survived" rows as integers to the submission file.
predicted = np.column_stack((test.index.values, prediction))
np.savetxt(
    "pr_logistic.csv",
    predicted.astype(int),
    fmt='%d',
    delimiter=",",
    header="PassengerId,Survived",
    comments='',
)

### Final Random Forest Model

* Import the cleaned Titanic data from `cl_train.csv` and `cl_test.csv`.
* Create encoders for categorical variables.
* Random Forest with 40 estimators and maximum tree depth of 6.

In [121]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Cleaned inputs produced by the preprocessing step.
train = pd.read_csv('cl_train.csv', header=0, index_col='PassengerId')
test = pd.read_csv('cl_test.csv', header=0, index_col='PassengerId')

# create training set X and y
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

# combine X train and test for preprocessing so both are encoded identically;
# tr_len remembers where the training rows end
tr_len = len(X_train)
df = pd.concat(objs=[X_train, test], axis=0)

# impute missing 'Embarked' values with 'S' (most common)
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected Series: that chained in-place form is deprecated in pandas
# and, with Copy-on-Write enabled, leaves `df` unchanged.
df['Embarked'] = df['Embarked'].fillna('S')

# encode categorical variables as integer codes
# The same encoder object is refitted per column, so afterwards `le` only
# holds the classes of 'Embarked'.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

# split X train and test back apart by position
X_train = df[:tr_len]
test = df[tr_len:]

# random forest with 40 estimators, max depth 6; seeded so that re-running
# the cell reproduces the same submission file
clf = RandomForestClassifier(n_estimators=40, max_depth=6, random_state=65)

# fit and predict
clf.fit(X_train, y_train)
prediction = clf.predict(test)

# save survival predictions to a CSV file ("PassengerId,Survived" integer rows)
predicted = np.column_stack((test.index.values, prediction))
np.savetxt("pr_forest.csv", predicted.astype(int), fmt='%d', delimiter=",", header="PassengerId,Survived", comments='')