In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

n_users = 6040
n_movies = 3952

def get_train_data():
    return pd.read_csv('titanic/train.csv', engine='python')

def get_test_data():
    return pd.read_csv('titanic/test.csv', engine='python')

def split_train_val_test(ratings, train=0.8, val=0.1):
    shuffled = np.random.RandomState(0).permutation(ratings.index)
    n_train = int(len(shuffled) * train)
    n_val = int(len(shuffled) * val)
    i_train, i_val, i_test = shuffled[:n_train], shuffled[n_train: n_train + n_val], shuffled[-n_val:]
    return ratings.loc[i_train], ratings.loc[i_val], ratings.loc[i_test]

def get_dense_array(ratings_df):
    ratings = np.zeros((n_users, n_movies))
    ratings[ratings_df['user_id'] - 1, ratings_df['movie_id'] - 1] = ratings_df['rating']
    return ratings

In [2]:
train = get_train_data()
train_survived = train.Survived
train = train.drop(columns = ['PassengerId', 'Name', 'Ticket'])

test = get_test_data()
test_id = test.PassengerId
test = test.drop(columns = ['PassengerId', 'Name', 'Ticket'])

train_survived_df, val_survived_df, test_survived_df = split_train_val_test(train)


# movies = get_movie_data()
# ratings = get_rating_data()
# train_ratings_df, val_ratings_df, test_ratings_df = split_train_val_test(ratings)
# train_ratings, val_ratings, test_ratings = get_dense_array(train_ratings_df), get_dense_array(val_ratings_df), get_dense_array(test_ratings_df)

In [171]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,3,male,34.5,0,0,7.8292,,Q,0
1,3,female,47.0,1,0,7.0000,,S,0
2,2,male,62.0,0,0,9.6875,,Q,0
3,3,male,27.0,0,0,8.6625,,S,0
4,3,female,22.0,1,1,12.2875,,S,0
...,...,...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,,S,0
414,1,female,39.0,0,0,108.9000,C105,C,0
415,3,male,38.5,0,0,7.2500,,S,0
416,3,male,,0,0,8.0500,,S,0


# 3.2 Neural Network

In [4]:
import torch
from torch import nn
import torch.nn.functional as F

In [5]:
from sklearn.preprocessing import MinMaxScaler
def min_max_normalize(df, ignore_cols=[]):
    ''' Applies min-max normalization to numerical features in df.

    Values are scaled to [-1, 1]
    Optionally ignores columns in ignore_cols
    '''
    numeric_cols = df.select_dtypes('number').columns
    if ignore_cols is not None:
        assert set(ignore_cols) <= set(numeric_cols)
        numeric_cols = set(numeric_cols) - set(ignore_cols)
    numeric_df = df[numeric_cols]
    scaler = MinMaxScaler(feature_range=(-1, 1))
    return pd.DataFrame(scaler.fit_transform(numeric_df), columns=numeric_cols)

In [6]:
# Desired features:
# 1. one hot encode gender, 
# 2. Calculate average rating for a movie and user
def process_features(df_old):
    df = df_old.copy()
    df_num = min_max_normalize(df, ['Pclass', 'Survived'])
    #PassengerId - Survived - Pclass - Name - Sex - Ae- SibSp - Parch - Ticket - Fare - Cabin - Embarked
    df['Age'] = df_num.Age
    df['Fare']= df_num.Fare
    df['SibSp'] = df_num.SibSp
    df['Parch'] = df_num.Parch
#     Change na values
    df['Age'] = df['Age'].fillna(25)
    df['Sex'] = df['Sex'].fillna('Male')
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['SibSp'] = df['SibSp'].fillna(df['SibSp'].mean())
    df['Parch'] = df['Parch'].fillna(df['Parch'].mean())
    # Handle categorical
    sex_categorical = pd.get_dummies(df.Sex)
    df[['F', 'M']] = sex_categorical[['female', 'male']]
    pclass_categorical = pd.get_dummies(df.Pclass)
    df[['P1', 'P2', 'P3']] = pclass_categorical[[1, 2, 3]]
    embarked_categorical = pd.get_dummies(df.Embarked)
    df = pd.concat([df, embarked_categorical], axis=1);
    return df

In [7]:
process_features(train)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,F,M,P1,P2,P3,C,Q,S
0,0,3,male,-0.457653,-0.75,-1.000000,-0.971698,,S,0,1,0,0,1,0,0,1
1,1,1,female,-0.055542,-0.75,-1.000000,-0.721729,C85,C,1,0,1,0,0,1,0,0
2,1,3,female,-0.357125,-1.00,-1.000000,-0.969063,,S,1,0,0,0,1,0,0,1
3,1,1,female,-0.130937,-0.75,-1.000000,-0.792711,C123,S,1,0,1,0,0,0,0,1
4,0,3,male,-0.130937,-1.00,-1.000000,-0.968575,,S,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,-0.331993,-1.00,-1.000000,-0.949251,,S,0,1,0,1,0,0,0,1
887,1,1,female,-0.533049,-1.00,-1.000000,-0.882888,B42,S,1,0,1,0,0,0,0,1
888,0,3,female,25.000000,-0.75,-0.333333,-0.908457,,S,1,0,0,0,1,0,0,1
889,1,1,male,-0.357125,-1.00,-1.000000,-0.882888,C148,C,0,1,1,0,0,1,0,0


In [245]:
# Hyperparameters
args = {
    'width': 100,
    'depth': 4,
    'hidden_size': 10,
    'lr': .5,
    'momentum': 0.9,
    'epochs': 30
}

layers = []
layers.append(nn.Linear(12, args['hidden_size']))
layers.append(nn.ReLU())
for i in range(args['depth']-1):
    layers.append(nn.Linear(args['hidden_size'], args['hidden_size']))
    layers.append(nn.ReLU())
layers.append(nn.Linear(args['hidden_size'], 1))

model = nn.Sequential(*layers)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.input_layer = nn.Linear(8, args['hidden_size'])
        self.hidden_layers = nn.Sequential(*[nn.Linear(args['hidden_size'], args['hidden_size']), nn.ReLU()] * (args['depth']-1))
        self.output_layer = nn.Linear(args['hidden_size'], 1)
    def forward(self, x):
        x = self.input_layer(x)
        x = nn.ReLU(x)
        x = self.hidden_layers(x)
        x = self.output_layer(x)
        return x

In [246]:
processed_train_df = process_features(train_survived_df)
processed_val_df = process_features(val_survived_df)
processed_test_df = process_features(test_survived_df)
# add empty survival column to the test set
test['Survived'] = 0
processed_final_test_df = process_features(test)

In [247]:
from torch.utils.data import Dataset, DataLoader
class TitanicDataset(Dataset):
    def __init__(self, df):
#         Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Cabin	Embarked
        self.data = df[['Age', 'SibSp', 'Parch', 'Fare', 'F','M','P1','P2','P3','C','Q','S']]
        self.labels = df[['Survived']]
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return torch.tensor(self.data.iloc[idx]), torch.tensor(self.labels.iloc[idx])


train_data = TitanicDataset(processed_train_df)
val_data = TitanicDataset(processed_val_df)
test_data = TitanicDataset(processed_test_df)
final_test_data = TitanicDataset(processed_final_test_df)

trainloader = DataLoader(train_data, batch_size = 256, num_workers = 0)
valoader = DataLoader(val_data, batch_size = 256, num_workers = 0)
testloader = DataLoader(test_data, batch_size = 256, num_workers = 0)
finalTestLoader = DataLoader(final_test_data, batch_size = 256, num_workers = 0)

In [248]:
processed_final_test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,F,M,P1,P2,P3,C,Q,S
0,3,male,-0.094554,-1.00,-1.000000,-0.969437,,Q,0,0,1,0,0,1,0,1,0
1,3,female,0.235131,-0.75,-1.000000,-0.972674,,S,0,1,0,0,0,1,0,0,1
2,2,male,0.630753,-1.00,-1.000000,-0.962183,,Q,0,0,1,0,1,0,0,1,0
3,3,male,-0.292364,-1.00,-1.000000,-0.966184,,S,0,0,1,0,0,1,0,0,1
4,3,female,-0.424238,-0.75,-0.777778,-0.952033,,S,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,male,25.000000,-1.00,-1.000000,-0.968575,,S,0,0,1,0,0,1,0,0,1
414,1,female,0.024133,-1.00,-1.000000,-0.574883,C105,C,0,1,0,1,0,0,1,0,0
415,3,male,0.010946,-1.00,-1.000000,-0.971698,,S,0,0,1,0,0,1,0,0,1
416,3,male,25.000000,-1.00,-1.000000,-0.968575,,S,0,0,1,0,0,1,0,0,1


In [249]:
import torch.optim as optim
import time
def train(trainloader, layers):
    model = nn.Sequential(*layers)
#     criterion = nn.MSELoss()
    criterion = torch.nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr = args['lr'], momentum = args['momentum'])
    for epoch in range(args['epochs']):
        epoch_loss = 0
        start = time.perf_counter()
        for i, batch in enumerate(trainloader):
            inputs, labels = batch
            optimizer.zero_grad()
#             print(inputs.float())
            outputs = model(inputs.float())
            outputs = torch.sigmoid(outputs)
            loss = criterion(outputs, labels.float())
#             print(outputs)
            
#             print(outputs)
#             print(labels.float())
            
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print('Epoch {} - loss: {}'.format(epoch+1, epoch_loss))
        print('Time: {}'.format(time.perf_counter() - start))
#         print(outputs)
    return model



In [250]:
model = train(trainloader, layers)

Epoch 1 - loss: 1.9878022074699402
Time: 0.6130294440004036
Epoch 2 - loss: 1.9776071906089783
Time: 0.47764542300001267
Epoch 3 - loss: 1.9682650566101074
Time: 0.4884004390000882
Epoch 4 - loss: 1.9652297496795654
Time: 0.5553008469996712
Epoch 5 - loss: 1.954335331916809
Time: 0.4041145310002321
Epoch 6 - loss: 1.9358293414115906
Time: 0.4222921159998805
Epoch 7 - loss: 1.88782799243927
Time: 0.4698706540002604
Epoch 8 - loss: 2.0630739331245422
Time: 0.3786639119998654
Epoch 9 - loss: 2.0105234384536743
Time: 0.367351632000009
Epoch 10 - loss: 1.9946983456611633
Time: 0.3676877490001971
Epoch 11 - loss: 1.9322057962417603
Time: 0.4022353310001563
Epoch 12 - loss: 1.9307767152786255
Time: 0.372597770000084
Epoch 13 - loss: 1.8839524984359741
Time: 0.38789939300022525
Epoch 14 - loss: 1.8469932079315186
Time: 0.4173777399996652
Epoch 15 - loss: 1.834463894367218
Time: 0.41166782500022236
Epoch 16 - loss: 1.8288516998291016
Time: 0.3776833120000447
Epoch 17 - loss: 1.8244919180870056


In [251]:
def evaluate(model, valloader):
    criterion = torch.nn.BCELoss()
    loss = 0
    with torch.no_grad():
        for i, batch in enumerate(valloader):
            inputs, labels = batch
            outputs = model(inputs.float())
            outputs = torch.sigmoid(outputs)
            loss += criterion(outputs, labels.float())
    return loss/len(valloader)

In [252]:
mse = evaluate(model, valoader)
print(mse)

tensor(0.6809)


In [253]:
test_mse = evaluate(model, testloader)
print(mse)

tensor(0.6809)


## Write test data to file

In [254]:
def getOutput(model, valloader):
    criterion = torch.nn.BCELoss()
    loss = 0
    allOutputs = []
    with torch.no_grad():
        for i, batch in enumerate(valloader):
            inputs, labels = batch
            outputs = model(inputs.float())
            outputs = torch.sigmoid(outputs)
            loss += criterion(outputs, labels.float())
            allOutputs.extend(outputs.round_())
#     outputs = outputs.round_()
    
    outputs = allOutputs
    outputFile = open('titanic/gender_submission.csv', 'w')
    outputFile.write('PassengerId,Survived\n')
    for i in range(0, len(outputs)):
        outputFile.write("%s,%d\n" %(test_id[i], outputs[i]))

    outputFile.close()


In [255]:
getOutput(model, finalTestLoader)

## Ablation Data

In [None]:
class MovieLensDatasetAblation(Dataset):
    def __init__(self, df):
        self.data = df[['avg_user_rating', 'avg_movie_rating']]
        self.labels = df[['rating']]
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return torch.tensor(self.data.iloc[idx]), torch.tensor(self.labels.iloc[idx])


train_data_ablation = MovieLensDataset(processed_train_df)
val_data_ablation = MovieLensDataset(processed_val_df)
test_data_ablation = MovieLensDataset(processed_test_df)

trainloader_ablation = DataLoader(train_data_ablation, batch_size = 256, num_workers = 0)
valoader_ablation = DataLoader(val_data_ablation, batch_size = 256, num_workers = 0)
testloader_ablation = DataLoader(test_data_ablation, batch_size = 256, num_workers = 0)

In [None]:
model_ablation = train(trainloader, layers)

Epoch 1 - loss: 4451.598682403564
Time: 341.829094217
Epoch 2 - loss: 4153.0240265131
Time: 341.310661997
Epoch 3 - loss: 4148.3260371387005
Time: 345.2031759650001
Epoch 4 - loss: 4146.374026298523
Time: 349.5462531249998
Epoch 5 - loss: 4146.16229313612
Time: 355.3534050019998
Epoch 6 - loss: 4146.315999120474
Time: 358.8211854299998
Epoch 7 - loss: 4144.445684820414
Time: 353.86359875000016
Epoch 8 - loss: 4143.0946026444435
Time: 347.375252068
Epoch 9 - loss: 4142.292069166899
Time: 337.717256937
Epoch 10 - loss: 4142.6262129843235
Time: 337.6882162750003


In [None]:
val_mse_ablation = evaluate(model_ablation, valoader)
print(val_mse_ablation)
test_mse_ablation = evaluate(model_ablation, testloader)
print(test_mse_ablation)

tensor(1.0594)
tensor(1.0594)


In [None]:
args['epochs'] = 2
model = train(trainloader, layers)
print(2)
print('Val:')
print(evaluate(model, valoader))
print('Test:')
print(evaluate(model, testloader))

Epoch 1 - loss: 4140.293397843838
Time: 337.1380207760003
Epoch 2 - loss: 4140.029753744602
Time: 340.45485344499957
2
Val:
tensor(1.0602)
Test:
tensor(1.0602)


In [None]:
args['epochs'] = 4
model = train(trainloader, layers)
print(4)
print('Val:')
print(evaluate(model, valoader))
print('Test:')
print(evaluate(model, testloader))

Epoch 1 - loss: 4139.735735446215
Time: 340.71909113499987
Epoch 2 - loss: 4139.335850358009
Time: 338.3462146310003
Epoch 3 - loss: 4139.791591823101
Time: 342.57236025099974
Epoch 4 - loss: 4139.2761444449425
Time: 336.70349016399996
4
Val:
tensor(1.0612)
Test:
tensor(1.0612)


In [None]:
args['epochs'] = 6
model = train(trainloader, layers)
print(6)
print('Val:')
print(evaluate(model, valoader))
print('Test:')
print(evaluate(model, testloader))

Epoch 1 - loss: 4138.676314264536
Time: 337.9464519009998
Epoch 2 - loss: 4139.222398519516
Time: 337.6191665229999
Epoch 3 - loss: 4138.759005069733
Time: 336.7097596900003
Epoch 4 - loss: 4138.750609606504
Time: 337.88217956100016
Epoch 5 - loss: 4138.613515347242
Time: 337.1083042249993
Epoch 6 - loss: 4138.712310671806
Time: 339.8560352859986
6
Val:
tensor(1.0599)
Test:
tensor(1.0599)


In [None]:
args['epochs'] = 8
model = train(trainloader, layers)
print(8)
print('Val:')
print(evaluate(model, valoader))
print('Test:')
print(evaluate(model, testloader))

Epoch 1 - loss: 4138.038381099701
Time: 339.12343950600007
Epoch 2 - loss: 4138.0985932946205
Time: 337.14608776500063
Epoch 3 - loss: 4138.20110604167
Time: 337.49118975499914
Epoch 4 - loss: 4138.158048868179
Time: 337.1486140249999
Epoch 5 - loss: 4137.753827601671
Time: 338.16831340299905
Epoch 6 - loss: 4138.287654042244
Time: 336.5233725829985
Epoch 7 - loss: 4138.065690338612
Time: 337.7555926969999
Epoch 8 - loss: 4137.584454208612
Time: 337.42159865600115
8
Val:
tensor(1.0610)
Test:
tensor(1.0610)


In [None]:
4137.584454208612/len(trainloader)

1.0587473014863389