In [85]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns


if torch.cuda.is_available():
    print("CUDA is available")
else:
    print("CUDA is not available")

print("Running on " + torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu")

CUDA is available
Running on NVIDIA GeForce GTX 1060 6GB


# Random Forests Route - Score: 0.77033

First Importing the Data

In [86]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [87]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [88]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", 'Embarked']
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])


model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('titanic_submission.csv', index=False)
print("Your submission was successfully saved!")



Your submission was successfully saved!


# Deep Learning Approach

Cleaning the data first

In [89]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [90]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Determine Title from Name
train_data['Title'] = train_data['Name'].apply(lambda x: x.split('.')[0].split(' ')[-1].strip())
test_data['Title'] = test_data['Name'].apply(lambda x: x.split('.')[0].split(' ')[-1].strip())

# Drop unnecessary columns
train_data.drop(columns=['PassengerId','Name','Ticket'], inplace=True)
test_data.drop(columns=['PassengerId','Name','Ticket'], inplace=True)

# Transform Cabin into binary 'HasCabin'
train_data['Cabin'] = train_data['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
test_data['Cabin'] = test_data['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

# One-hot encode 'Pclass', 'Sex', 'SibSp', 'Parch', 'Title' and 'Embarked'
train_data = pd.get_dummies(train_data, columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Title','Embarked'])
test_data = pd.get_dummies(test_data, columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Title','Embarked'])

# min-max normalisation of 'Age' and 'Fare'
for column_to_normalise in ['Age', 'Fare']:
    min_value = train_data[column_to_normalise].min()
    max_value = train_data[column_to_normalise].max()
    train_data[column_to_normalise + '_normalized'] = (train_data[column_to_normalise] - min_value) / (max_value - min_value)
    test_data[column_to_normalise + '_normalized'] = (test_data[column_to_normalise] - min_value) / (max_value - min_value)

len(train_data.columns)

45

In [91]:
import collections

input_size = len(train_data.columns)
hidden_layer_sizes = [256, 512, 512, 256]

model_dense = nn.Sequential(collections.OrderedDict([
    ('hidden1', nn.Linear(input_size, hidden_layer_sizes[0])),
    ('activation1', nn.ReLU()),
    ('hidden2', nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1])),
    ('activation2', nn.ReLU()),
    ('hidden3', nn.Linear(hidden_layer_sizes[1], hidden_layer_sizes[2])),
    ('activation3', nn.ReLU()),
    ('hidden4', nn.Linear(hidden_layer_sizes[2], hidden_layer_sizes[3])),
    ('activation4', nn.ReLU()),
    ('output', nn.Linear(hidden_layer_sizes[3], 2)),
    ('activation5', nn.Sigmoid())

        ]))
print("Dense MLP Model")
print(model_dense)

Dense MLP Model
Sequential(
  (hidden1): Linear(in_features=45, out_features=256, bias=True)
  (activation1): ReLU()
  (hidden2): Linear(in_features=256, out_features=512, bias=True)
  (activation2): ReLU()
  (hidden3): Linear(in_features=512, out_features=512, bias=True)
  (activation3): ReLU()
  (hidden4): Linear(in_features=512, out_features=256, bias=True)
  (activation4): ReLU()
  (output): Linear(in_features=256, out_features=2, bias=True)
  (activation5): Sigmoid()
)


In [92]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model_dense.parameters())

y_train = torch.tensor(train_data["Survived"].values)
X_train = torch.tensor(train_data.drop(columns=["Survived"]).values, dtype=torch.float32)

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

num_epochs = 10

# Train the model
for epoch in range(num_epochs):
    for inputs, labels in train_loader:

        outputs = model_dense(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.