In [38]:
import os
from pathlib import Path
import subprocess
import sklearn

# A non-empty KAGGLE_KERNEL_RUN_TYPE environment variable is treated as "running on Kaggle",
# in which case the competition data is read from the mounted input directory.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        # Imported lazily: only needed for the one-time local download.
        import zipfile,kaggle
        kaggle.api.competition_download_cli(str(path))
        # Context manager closes the archive handle once extraction is done.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)
import torch, numpy as np, pandas as pd
# Print numpy arrays, torch tensors and pandas frames with 140-character lines;
# torch additionally shows 7 edge items and never switches to scientific notation.
pd.options.display.width = 140
torch.set_printoptions(edgeitems=7, sci_mode=False, linewidth=140)
np.set_printoptions(linewidth=140)

# Explore the Data

In [2]:
# Read the training CSV from the data directory chosen above and preview it.
titanic_df = pd.read_csv(path/'train.csv')
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Summary statistics for the numeric columns.
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Number of missing values in each column.
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Impute missing values with the mode (most frequent value)

In [5]:
# mode() can return several rows when values tie; iloc[0] keeps the first mode of each
# column, and fillna() then fills every column's gaps with that column's own mode.
titanic_df = titanic_df.fillna(titanic_df.mode().iloc[0])

In [6]:
# Confirm that no missing values remain after imputation.
titanic_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

# Log-transform Fare to reduce the influence of large outliers

In [7]:
# Fare is heavily right-skewed (max 512.33 vs. median 14.45 in describe() above). The +1
# keeps zero fares finite: the minimum Fare is 0 and log(0) is -inf.
titanic_df['LogFare'] = np.log(titanic_df['Fare']+1)

# One-Hot encode categorical variables

In [8]:
# Categorical columns to one-hot encode (the same list is applied to the test set later).
categories = ['Pclass', 'Sex', 'Embarked']

In [9]:
# Replace each listed column with one indicator column per level (e.g. Sex -> Sex_female, Sex_male).
titanic_df = pd.get_dummies(titanic_df, columns=categories)

In [10]:
# Inspect the column names produced by the encoding.
titanic_df.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'LogFare', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [11]:
# Target vector: the Survived column as a tensor.
# Must run before titanic_df is narrowed to the feature columns further down, which drops Survived.
y = torch.tensor(titanic_df['Survived'])
y

tensor([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 

In [12]:
# Feature columns, in the order they appear in the design matrix X.
design = ['Age', 'SibSp', 'Parch', 'LogFare', 'Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [13]:
# NOTE(review): astype(int) truncates the continuous columns as well as converting the
# indicator columns: LogFare collapses to whole numbers (max 6 in the printed `vals`) and
# ages below 1 become 0. Casting to a float dtype would keep that information, but the
# test-set preparation would then have to be changed identically.
# NOTE(review): rebinding titanic_df to the feature subset drops Survived, so the cell that
# builds `y` cannot be re-run after this one without reloading the data.
titanic_df = titanic_df[design].astype(int)

In [14]:
# Design matrix as a tensor. titanic_df already holds only the `design` columns, so the
# re-selection here does not change the data.
X = torch.tensor(titanic_df[design].values)
X

tensor([[22,  1,  0,  2,  1,  0,  0,  0,  1,  0,  0,  1],
        [38,  1,  0,  4,  0,  1,  1,  0,  0,  1,  0,  0],
        [26,  0,  0,  2,  0,  1,  0,  0,  1,  0,  0,  1],
        [35,  1,  0,  3,  0,  1,  1,  0,  0,  0,  0,  1],
        [35,  0,  0,  2,  1,  0,  0,  0,  1,  0,  0,  1],
        [24,  0,  0,  2,  1,  0,  0,  0,  1,  0,  1,  0],
        [54,  0,  0,  3,  1,  0,  1,  0,  0,  0,  0,  1],
        ...,
        [25,  0,  0,  2,  1,  0,  0,  0,  1,  0,  0,  1],
        [39,  0,  5,  3,  0,  1,  0,  0,  1,  0,  1,  0],
        [27,  0,  0,  2,  1,  0,  0,  1,  0,  0,  0,  1],
        [19,  0,  0,  3,  0,  1,  1,  0,  0,  0,  0,  1],
        [24,  1,  2,  3,  0,  1,  0,  0,  1,  0,  0,  1],
        [26,  0,  0,  3,  1,  0,  1,  0,  0,  1,  0,  0],
        [32,  0,  0,  2,  1,  0,  0,  0,  1,  0,  1,  0]])

In [15]:
# (number of rows, number of feature columns)
X.shape

torch.Size([891, 12])

In [16]:
# Scale every column by its maximum so features share a comparable range; `vals` is
# reused later to scale the test set the same way.
# NOTE(review): not idempotent. Once X has been rescaled, re-running this cell recomputes
# `vals` from the scaled X (all ones here), which would leave the test set unscaled.
# NOTE(review): the maxima are taken over all rows, i.e. before the train/validation split.
vals,indices = X.max(dim=0)
print(vals)
X = X / vals

tensor([80,  8,  6,  6,  1,  1,  1,  1,  1,  1,  1,  1])


In [17]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
# The loss used below works on floating-point targets, so convert the labels.
y_train, y_test = y_train.float(), y_test.float()

# Make the Neural Net

In [24]:
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
class MyModel(nn.Module):
    """Single-hidden-layer MLP that outputs one raw logit per input row.

    No sigmoid is applied in ``forward``; pair the output with a loss that takes
    logits (such as ``nn.BCEWithLogitsLoss``) or apply ``torch.sigmoid`` yourself.

    Args:
        in_features: Number of input features per row (default 12).
        hidden_features: Width of the hidden layer (default 36).
        dropout_p: Drop probability of the ``dropout`` module (default 0.5).
    """

    def __init__(self, in_features=12, hidden_features=36, dropout_p=0.5):
        super().__init__()
        self.fc1 = nn.Linear(in_features=in_features, out_features=hidden_features)
        self.fc2 = nn.Linear(in_features=hidden_features, out_features=1)
        self.relu = nn.ReLU()
        # Kept as an attribute for experimentation; it is not applied in forward().
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Map a ``(..., in_features)`` float tensor to ``(..., 1)`` logits."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
        

In [25]:
# Seed the global torch RNG before the model is built so weight initialisation, and
# therefore the whole training run, is repeatable across kernel restarts.
torch.manual_seed(42)

model = MyModel()
crit = nn.BCEWithLogitsLoss()
opt = optim.Adam(model.parameters(), lr=0.001)
# NOTE: the scheduler is constructed but intentionally never stepped, so the learning
# rate stays at 0.001 for every epoch.
scheduler = lr_scheduler.StepLR(opt, step_size=50, gamma=0.5)

num_epochs = 1000
# The targets do not change between epochs, so cast them once instead of on every pass.
targets = y_train.float()

model.train()
for epoch in range(num_epochs):

    # Forward pass over the full training set. squeeze(1) drops only the size-1 output
    # dimension, so (N, 1) logits become (N,) even when N == 1.
    outputs = model(x_train).squeeze(1)
    loss = crit(outputs, targets)

    # Backward pass and optimization
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Print progress every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        


Epoch [10/1000], Loss: 0.6953
Epoch [20/1000], Loss: 0.6680
Epoch [30/1000], Loss: 0.6426
Epoch [40/1000], Loss: 0.6185
Epoch [50/1000], Loss: 0.5950
Epoch [60/1000], Loss: 0.5724
Epoch [70/1000], Loss: 0.5511
Epoch [80/1000], Loss: 0.5314
Epoch [90/1000], Loss: 0.5134
Epoch [100/1000], Loss: 0.4975
Epoch [110/1000], Loss: 0.4838
Epoch [120/1000], Loss: 0.4725
Epoch [130/1000], Loss: 0.4635
Epoch [140/1000], Loss: 0.4564
Epoch [150/1000], Loss: 0.4507
Epoch [160/1000], Loss: 0.4463
Epoch [170/1000], Loss: 0.4429
Epoch [180/1000], Loss: 0.4401
Epoch [190/1000], Loss: 0.4378
Epoch [200/1000], Loss: 0.4358
Epoch [210/1000], Loss: 0.4341
Epoch [220/1000], Loss: 0.4326
Epoch [230/1000], Loss: 0.4312
Epoch [240/1000], Loss: 0.4300
Epoch [250/1000], Loss: 0.4288
Epoch [260/1000], Loss: 0.4277
Epoch [270/1000], Loss: 0.4268
Epoch [280/1000], Loss: 0.4258
Epoch [290/1000], Loss: 0.4249
Epoch [300/1000], Loss: 0.4241
Epoch [310/1000], Loss: 0.4234
Epoch [320/1000], Loss: 0.4227
Epoch [330/1000],

In [26]:
# Switch to evaluation mode so train-only layers (e.g. dropout) are disabled at inference,
# and turn off gradient tracking because no backward pass is needed here.
model.eval()
with torch.no_grad():
    predictions = model(x_test)



In [27]:
def calculate_accuracy(y_pred, y_true):
    """Return the percentage of correct binary predictions, rounded to 2 decimals.

    Args:
        y_pred: Tensor of raw logits, e.g. shape ``(N,)`` or ``(N, 1)``.
        y_true: Tensor of 0/1 labels with the same number of elements as ``y_pred``.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when there are no samples.

    Raises:
        ValueError: If ``y_pred`` and ``y_true`` hold a different number of elements.
    """
    # Flatten both sides so an (N, 1) vs (N,) pair is compared element-wise instead of
    # being broadcast into an (N, N) matrix, which would inflate the match count.
    y_pred_labels = torch.round(torch.sigmoid(y_pred)).reshape(-1)
    y_true_flat = y_true.reshape(-1)

    total = y_true_flat.numel()
    if y_pred_labels.numel() != total:
        raise ValueError(
            f"y_pred has {y_pred_labels.numel()} elements but y_true has {total}"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = (y_pred_labels == y_true_flat).sum().item()
    accuracy = correct / total
    return round(accuracy * 100, 2)



In [28]:
# Accuracy on the held-out split.
acc = calculate_accuracy(predictions, y_test)
print(f"Accuracy: {acc}%")

Accuracy: 81.56%


# Prep submission

In [29]:
# Repeat the training-frame preprocessing on the Kaggle test set.
# NOTE(review): gaps are filled with the test set's own modes, not the training modes.
test_df = pd.read_csv(path/'test.csv')
test_df = test_df.fillna(test_df.mode().iloc[0])
test_df['LogFare'] = np.log(test_df['Fare']+1)

# One-hot encode, keep the model's feature columns in training order, and cast to int.
# NOTE(review): as with the training frame, astype(int) truncates Age and LogFare; keep the
# two casts identical if either is ever changed.
test = pd.get_dummies(test_df, columns=categories)[design].astype(int)

# Scale with the column maxima computed on the training data.
test = torch.tensor(test.values) / vals

In [30]:
# The column count should match X (one column per entry in `design`).
test.shape

torch.Size([418, 12])

In [31]:
# Raw logits for the Kaggle test set; gradients are not needed for inference.
with torch.no_grad():
    test_predictions = model(test)
    
len(test_predictions)

418

In [32]:
# Logits -> probabilities -> hard 0/1 labels (rounding thresholds at 0.5).
test_labels = torch.round(torch.sigmoid(test_predictions))  
len(test_labels)

418

In [33]:

# Drop the size-1 output dimension so the labels form a 1-D Series named 'Survived'.
tensor_series = pd.Series(torch.squeeze(test_labels), name='Survived')  

tensor_series

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
413    0.0
414    1.0
415    0.0
416    0.0
417    0.0
Name: Survived, Length: 418, dtype: float32

In [34]:
# Pair each PassengerId with its predicted label. concat(axis=1) aligns on the index;
# both objects carry the default 0..n-1 index, so rows line up positionally.
merged_df = pd.concat([test_df['PassengerId'], tensor_series], axis=1)

In [35]:
# The predicted labels are float32 (0.0 / 1.0); cast every column to integers for the submission.
merged_df = merged_df.astype(int)

In [36]:
# Write the submission CSV without the DataFrame index column.
filename = 'submission_3.csv'

merged_df.to_csv(filename, index=False)

In [37]:


def submit_to_kaggle(filepath, comp_name, message):
    """Submit a prediction file to a Kaggle competition through the ``kaggle`` CLI.

    Args:
        filepath: Path of the CSV file to upload.
        comp_name: Competition identifier passed to ``-c``.
        message: Free-text submission message passed to ``-m``.

    Returns:
        None. The CLI output and a success/failure line are printed.
    """
    # Pass an argument list and skip the shell, so spaces, quotes and shell
    # metacharacters in any argument reach the CLI literally instead of being
    # re-parsed (or executed) by a shell.
    command = [
        'kaggle', 'competitions', 'submit',
        '-c', str(comp_name),
        '-f', str(filepath),
        '-m', str(message),
    ]

    try:
        output = subprocess.check_output(command)
        print(output.decode('utf-8'))
        print("Submission successful!")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI exited non-zero.
        # OSError: the executable could not be started (e.g. it is not on PATH).
        print("Submission failed. Error:", e)


100%|██████████| 2.77k/2.77k [00:00<00:00, 5.55kB/s]


Successfully submitted to Titanic - Machine Learning from Disaster
Submission successful!
