In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import pandas as pd
import numpy as np
from tqdm import tqdm

In [0]:
class Net(nn.Module):
  def __init__(self, input_features, out_features, p= 0.4):
    super().__init__()
    self.fc1 = nn.Linear(input_features, 100)
    self.batchnorm1 = nn.BatchNorm1d(100)
    self.fc2 = nn.Linear(100, 150)
    self.batchnorm2 = nn.BatchNorm1d(150)
    self.fc3 = nn.Linear(150, 150)
    self.fc4 = nn.Linear(150, 200)
    self.batchnorm3 = nn.BatchNorm1d(200)
    self.fc5 = nn.Linear(200, out_features)

    self.drpout = nn.Dropout(p)


  def forward(self, x):
    x = self.batchnorm1(self.drpout(F.relu(self.fc1(x)))) 
    x = self.batchnorm2(self.drpout(F.relu(self.fc2(x))))
    x = self.drpout(F.relu(self.fc3(x)))
    x = self.batchnorm3(self.drpout(F.relu(self.fc4(x))))
    x = self.fc5(x)
    return F.log_softmax(x, dim=1)



In [0]:
class TitanicDataset(Dataset):
  def __init__(self, data: pd.DataFrame):
    self.data = data

  def __getitem__(self, idx):
    return self.data.iloc[idx]

  def __len__(self):
    return len(self.data)

In [0]:
class TensorTitanicDataset(TitanicDataset):
  def __getitem__(self, idx):
    sample = super().__getitem__(idx)
    return {'X': torch.Tensor([
                    sample.Pclass,
                    sample.Age,
                    sample.SibSp,
                    sample.Parch,
                    sample.Sex_female,
                    sample.Sex_male,
                    sample.Embarked_C,
                    sample.Embarked_Q,
                    sample.Embarked_S                      
    ]), 
    'Y': sample.Survived
    }

In [0]:
data.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [0]:
data = pd.read_csv('data/train.csv')


In [0]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
def getNumberOfNullValues(data):
  sum(list(data.isnull().sum()))

In [0]:
def getCatAndContColumns(data, label=None):
  cols = data.columns 
  cont_columns = list(data._get_numeric_data().columns)
  cat_columns = list(set(cols) - set(cont_columns))
  if label is not None:
    del cont_columns[cont_columns.index(label)]
  return (cat_columns, cont_columns)

In [0]:
def normalize_columns(df, columns_to_normalize):
  for col in columns_to_normalize:
    df[col] = (df[col] - df[col].min())/ (df[col].max() - df[col].min())

def one_hot_encode(df, columns):
  return pd.get_dummies(df, prefix=columns)

In [0]:
def preprocess_data(data, columns_to_drop, label_name=None, ignore=False):
  df = data
  df.drop(columns=columns_to_drop, inplace=True, axis=1)

  #Get Number of null values
  if ignore:
    cat_columns, cont_columns = getCatAndContColumns(data)
        #Normalize Continuous Columns
    normalize_columns(df, columns_to_normalize=cont_columns)

    #One-Hot Encode Categorical Columns
    df = one_hot_encode(df, columns=cat_columns)
  if not ignore:
    if getNumberOfNullValues(df) != 0:
      df.dropna(how='any', inplace=True)

    if label_name is None:
      cat_columns, cont_columns = getCatAndContColumns(data)
    else:
      cat_columns, cont_columns = getCatAndContColumns(data, label_name)

    #Normalize Continuous Columns
    normalize_columns(df, columns_to_normalize=cont_columns)

    #One-Hot Encode Categorical Columns
    df = one_hot_encode(df, columns=cat_columns)

    if label_name is not None:
      #Add Labels to the Processed data
      df[label_name] = data[label_name]

  return df

In [0]:
data = preprocess_data(data=data, columns_to_drop=["PassengerId", "Cabin", "Fare", "Name", "Ticket"], label_name="Survived")

In [0]:
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1.0,0.271174,0.2,0.0,0,1,0,0,1
1,1,0.0,0.472229,0.2,0.0,1,0,1,0,0
2,1,1.0,0.321438,0.0,0.0,1,0,0,0,1
3,1,0.0,0.434531,0.2,0.0,1,0,0,0,1
4,0,1.0,0.434531,0.0,0.0,0,1,0,0,1


In [0]:
dataset = TensorTitanicDataset(data)

In [0]:

dataset_size = len(dataset)
batch_size = 16
indices = list(range(dataset_size))
np.random.shuffle(indices)
val_size = int(dataset_size * .2)

train_indices = indices[:-val_size]
test_indices = indices[-val_size:]

trainSampler = SubsetRandomSampler(train_indices)
testSampler = SubsetRandomSampler(test_indices)

traindataset = DataLoader(dataset, batch_size=batch_size, sampler=trainSampler)
testdataset = DataLoader(dataset, batch_size=batch_size, sampler=testSampler)



In [0]:
data.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [0]:
model = Net(len(data.columns) -1, 2)
model

Net(
  (fc1): Linear(in_features=9, out_features=100, bias=True)
  (batchnorm1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100, out_features=150, bias=True)
  (batchnorm2): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=150, out_features=150, bias=True)
  (fc4): Linear(in_features=150, out_features=200, bias=True)
  (batchnorm3): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc5): Linear(in_features=200, out_features=2, bias=True)
  (drpout): Dropout(p=0.4, inplace=False)
)

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [0]:
EPOCHS = 1000

for epoch in tqdm(range(EPOCHS)):
  for idx, batch in enumerate(traindataset):
    X, y = batch['X'], batch['Y']
   
    output = model(X)
    loss = criterion(output, y.long())

    # print(f'EPOCH: {epoch} batch({idx}) LOSS: {loss.item()}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 1/1000 [00:00<08:43,  1.91it/s][A
  0%|          | 2/1000 [00:01<08:37,  1.93it/s][A
  0%|          | 3/1000 [00:01<08:31,  1.95it/s][A
  0%|          | 4/1000 [00:02<08:30,  1.95it/s][A
  0%|          | 5/1000 [00:02<08:28,  1.96it/s][A
  1%|          | 6/1000 [00:03<08:30,  1.95it/s][A
  1%|          | 7/1000 [00:03<08:28,  1.95it/s][A
  1%|          | 8/1000 [00:04<08:37,  1.92it/s][A
  1%|          | 9/1000 [00:04<08:34,  1.93it/s][A
  1%|          | 10/1000 [00:05<08:34,  1.92it/s][A
  1%|          | 11/1000 [00:05<08:30,  1.94it/s][A
  1%|          | 12/1000 [00:06<08:29,  1.94it/s][A
  1%|▏         | 13/1000 [00:06<08:32,  1.93it/s][A
  1%|▏         | 14/1000 [00:07<08:25,  1.95it/s][A
  2%|▏         | 15/1000 [00:07<08:20,  1.97it/s][A
  2%|▏         | 16/1000 [00:08<08:19,  1.97it/s][A
  2%|▏         | 17/1000 [00:08<08:16,  1.98it/s][A
  2%|▏         | 18/1000 [00:09<08:13,  1.99it/s][A
  2%|▏    

In [0]:
correct = 0
total = 0

model.eval()
with torch.no_grad():
  for batch in testdataset:
    X, y = batch['X'], batch['Y']
    output = model(X)

    for idx, x in enumerate(output):
      if(x.argmax() == y[idx]):
        correct += 1
      total +=1

  print(f'Accuracy: { (correct/total) * 100 }%')

Accuracy: 77.46478873239437%


In [0]:
testdf = pd.read_csv('test.csv')
testdf.shape

(418, 11)

In [0]:
t = preprocess_data(data=testdf, columns_to_drop=["PassengerId", "Cabin", "Fare", "Name", "Ticket"], ignore=True)

In [0]:
t.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.452723,0.0,0.0,0,1,0,1,0
1,1.0,0.617566,0.125,0.0,1,0,0,0,1
2,0.5,0.815377,0.0,0.0,0,1,0,1,0
3,1.0,0.353818,0.0,0.0,0,1,0,0,1
4,1.0,0.287881,0.125,0.111111,1,0,0,0,1


In [0]:
testx = torch.tensor(t.values)
testx

tensor([[1.0000, 0.4527, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [1.0000, 0.6176, 0.1250,  ..., 0.0000, 0.0000, 1.0000],
        [0.5000, 0.8154, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [1.0000, 0.5055, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000,    nan, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000,    nan, 0.1250,  ..., 1.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [0]:
passen = testdf['PassengerId']
pIDs = list(passen.values)

In [0]:
rslt = pd.DataFrame({
    "PassengerId": [],
    "Survived": []
})

In [0]:
model.eval()
with torch.no_grad():
  predictions = model(testx)
  for i, predictions in enumerate(predictions):
   rslt = rslt.append({'PassengerId': int(pIDs[i]), "Survived": int(predictions.argmax().item()) }, ignore_index=True)

In [0]:
rslt

Unnamed: 0,PassengerId,Survived
0,892.0,0.0
1,893.0,0.0
2,894.0,0.0
3,895.0,0.0
4,896.0,0.0
...,...,...
413,1305.0,0.0
414,1306.0,1.0
415,1307.0,0.0
416,1308.0,0.0


In [0]:
rslt.to_csv('result.csv')