In [1]:
import pandas as pd
import torch.optim as optim

from Model import *
from Preprocessing import *
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from utils import *

In [2]:
raw_data = pd.read_csv('data/train.csv')

In [3]:
categorical_data = ['Pclass', 'Sex', 'Embarked']
numerical_data = ['Age', 'SibSp', 'Parch', 'Fare']
labels = 'Survived'

preproc = Preprocessor(raw_data, categorical_data, numerical_data, labels)
preproc.X.head()

Unnamed: 0,1,2,3,female,male,C,Q,S,Age,SibSp,Parch,Fare
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.530377,0.432793,-0.473674,-0.502445
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.571831,0.432793,-0.473674,0.786845
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,-0.254825,-0.474545,-0.473674,-0.488854
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.365167,0.432793,-0.473674,0.42073
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.365167,-0.474545,-0.473674,-0.486337


In [4]:
X_train, X_test, y_train, y_test = preproc.get_splits()

In [5]:
# Hyperparams
params = {'batch size': 64,
          'epochs': 100}

In [6]:
train_data = trainData(torch.FloatTensor(X_train.values), 
                       torch.FloatTensor(y_train.values))
test_data = testData(torch.FloatTensor(X_test.values))

train_loader = DataLoader(dataset=train_data, batch_size=params['batch size'], shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [7]:
model = MLP()
criterion = nn.BCEWithLogitsLoss(pos_weight=preproc.pos_weight)
optimizer = optim.Adam(model.parameters())

model, loss_data, acc_data = train(model, criterion, optimizer, train_loader, params)

Epoch 001: | Loss: 0.83697 | Acc: 56.667
Epoch 002: | Loss: 0.82314 | Acc: 61.333
Epoch 003: | Loss: 0.83221 | Acc: 63.250
Epoch 004: | Loss: 0.82002 | Acc: 66.750
Epoch 005: | Loss: 0.80904 | Acc: 64.250
Epoch 006: | Loss: 0.79812 | Acc: 69.500
Epoch 007: | Loss: 0.79765 | Acc: 66.583
Epoch 008: | Loss: 0.80128 | Acc: 66.917
Epoch 009: | Loss: 0.76953 | Acc: 72.583
Epoch 010: | Loss: 0.77449 | Acc: 71.167
Epoch 011: | Loss: 0.74033 | Acc: 73.833
Epoch 012: | Loss: 0.75703 | Acc: 73.417
Epoch 013: | Loss: 0.73762 | Acc: 75.000
Epoch 014: | Loss: 0.71691 | Acc: 76.083
Epoch 015: | Loss: 0.71159 | Acc: 74.500
Epoch 016: | Loss: 0.71134 | Acc: 74.583
Epoch 017: | Loss: 0.68478 | Acc: 75.750
Epoch 018: | Loss: 0.66277 | Acc: 78.083
Epoch 019: | Loss: 0.69889 | Acc: 75.083
Epoch 020: | Loss: 0.67029 | Acc: 76.500
Epoch 021: | Loss: 0.66440 | Acc: 76.750
Epoch 022: | Loss: 0.66569 | Acc: 76.083
Epoch 023: | Loss: 0.63673 | Acc: 78.167
Epoch 024: | Loss: 0.64100 | Acc: 77.167
Epoch 025: | Los

In [8]:
predictions = gen_predictions(model, test_loader)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.84      0.89      0.87       110
           1       0.81      0.74      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



In [9]:
# Train on all data

In [10]:
train_data = trainData(torch.FloatTensor(preproc.X.values), 
                       torch.FloatTensor(preproc.y.values))
train_loader = DataLoader(dataset=train_data, batch_size=params['batch size'], shuffle=True)

In [11]:
model = MLP()
criterion = nn.BCEWithLogitsLoss(pos_weight=preproc.pos_weight)
optimizer = optim.Adam(model.parameters())

model, loss_data, acc_data = train(model, criterion, optimizer, train_loader, params)

Epoch 001: | Loss: 0.83816 | Acc: 52.929
Epoch 002: | Loss: 0.79662 | Acc: 61.714
Epoch 003: | Loss: 0.77052 | Acc: 70.143
Epoch 004: | Loss: 0.75275 | Acc: 71.929
Epoch 005: | Loss: 0.73942 | Acc: 74.214
Epoch 006: | Loss: 0.72929 | Acc: 75.643
Epoch 007: | Loss: 0.72098 | Acc: 76.143
Epoch 008: | Loss: 0.70530 | Acc: 76.929
Epoch 009: | Loss: 0.69688 | Acc: 77.857
Epoch 010: | Loss: 0.68381 | Acc: 78.429
Epoch 011: | Loss: 0.67286 | Acc: 78.643
Epoch 012: | Loss: 0.66183 | Acc: 77.571
Epoch 013: | Loss: 0.65514 | Acc: 77.714
Epoch 014: | Loss: 0.65175 | Acc: 78.143
Epoch 015: | Loss: 0.62221 | Acc: 81.500
Epoch 016: | Loss: 0.61997 | Acc: 78.857
Epoch 017: | Loss: 0.61920 | Acc: 79.929
Epoch 018: | Loss: 0.62770 | Acc: 79.571
Epoch 019: | Loss: 0.61506 | Acc: 78.786
Epoch 020: | Loss: 0.61173 | Acc: 80.500
Epoch 021: | Loss: 0.60573 | Acc: 79.571
Epoch 022: | Loss: 0.60194 | Acc: 79.143
Epoch 023: | Loss: 0.59160 | Acc: 79.214
Epoch 024: | Loss: 0.59786 | Acc: 80.000
Epoch 025: | Los

In [12]:
# Get predictions for the real test set
raw_test_data = pd.read_csv('data/test.csv')
raw_test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [13]:
test_preproc = Preprocessor(raw_test_data, categorical_data,
                            numerical_data, weight=preproc.pos_weight)

In [31]:
test_preproc.X.head()

Unnamed: 0,1,2,3,female,male,C,Q,S,Age,SibSp,Parch,Fare
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.298549,-0.49947,-0.400248,-0.497811
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.181328,0.616992,-0.400248,-0.51266
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.240662,-0.49947,-0.400248,-0.464532
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.231118,-0.49947,-0.400248,-0.482888
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,-0.584229,0.616992,0.619896,-0.417971


In [15]:
test_data = testData(torch.FloatTensor(test_preproc.X.values))
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [22]:
predictions = gen_predictions(model, test_loader)

In [25]:
#format for submission
pred_df = pd.DataFrame(columns=['Survived'], data=predictions)
submission_df = pd.concat([raw_test_data['PassengerId'], pred_df], axis=1)

In [43]:
submission_df.head()
submission_df = submission_df.astype({'Survived': 'int32'})
submission_df.dtypes

PassengerId    int64
Survived       int32
dtype: object

In [44]:
submission_df.to_csv('data/submission.csv', index=False)

In [45]:
check = pd.read_csv('data/submission.csv')

In [46]:
check.dtypes

PassengerId    int64
Survived       int64
dtype: object