# Submission: PyTorch neural network with new features

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [108]:
# Python language version used to run this notebook
from platform import python_version
py_version = python_version()
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', py_version)

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.12.1


In [109]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# from torch.torchmetrics import Accuracy

In [110]:
# Versões dos pacotes usados neste jupyter notebook
# Versions of the packages used in this Jupyter notebook.
# (Re)load the `watermark` IPython extension, then print the author name and
# the versions of the modules imported so far (`--iversions`).
%reload_ext watermark
%watermark -a "Henrique Menegaz" --iversions

Author: Henrique Menegaz

numpy : 1.26.3
pandas: 2.1.4
torch : 2.2.1



In [111]:
# Load the training set and the submission (test) set into pandas DataFrames.
train_csv_path, test_csv_path = './titanic/train.csv', './titanic/test.csv'
df_original = pd.read_csv(train_csv_path)
df_submission_original = pd.read_csv(test_csv_path)

# Preprocessing

### Filling Missing values

In [112]:
# Most frequent value of every training column (first row of `mode()`).
modes = df_original.mode().iloc[0]

# Fill gaps in both frames with the *training* modes, so the submission set
# is imputed with statistics learned from the training data only.
df_original = df_original.fillna(modes)
df_submission_original = df_submission_original.fillna(modes)

In [113]:
# Count the remaining missing values per column (every count should be 0 after imputation).
df_original.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Feature Engineering

In [114]:
def add_features(df):
    """Add engineered feature columns to ``df`` in place.

    Columns added, in this order:
      * ``LogFare``    - ``log(1 + Fare)``.
      * ``Deck``       - first character of ``Cabin`` grouped into "ABC", "DE"
                         or "FG"; any other letter becomes NaN.
      * ``Family``     - ``SibSp + Parch``.
      * ``Alone``      - True where ``Family`` is 0.
      * ``TicketFreq`` - number of rows in ``df`` sharing the same ``Ticket``.
      * ``Title``      - text between ", " and the first "." in ``Name``,
                         kept only for Mr / Miss / Mrs / Master (else NaN).

    Args:
        df: DataFrame with ``Fare``, ``Cabin``, ``SibSp``, ``Parch``,
            ``Ticket`` and ``Name`` columns. It is mutated.

    Returns:
        None.
    """
    deck_groups = {
        "A": "ABC", "B": "ABC", "C": "ABC",
        "D": "DE", "E": "DE",
        "F": "FG", "G": "FG",
    }
    kept_titles = {title: title for title in ("Mr", "Miss", "Mrs", "Master")}

    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df['Cabin'].str[0].map(deck_groups)
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0
    # Frequency is counted within this frame only.
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # "Surname, Title. Given names" -> "Title"
    after_surname = df['Name'].str.split(', ', expand=True)[1]
    raw_title = after_surname.str.split('.', expand=True)[0]
    df['Title'] = raw_title.map(kept_titles)

# Add the engineered columns to both frames (in place), then preview the training frame.
for frame in (df_original, df_submission_original):
    add_features(frame)
df_original.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LogFare,Deck,Family,Alone,TicketFreq,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S,2.110213,ABC,1,False,1,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4.280593,ABC,1,False,1,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S,2.188856,ABC,0,True,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3.990834,ABC,1,False,2,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S,2.202765,ABC,0,True,1,Mr


In [115]:
# Columns that will be one-hot encoded.
categorical_cols = [
    'Sex',
    'Pclass',
    'Embarked',
    'Deck',
    'Title',
    'Alone',
]
# Columns that will be standardized.
numerical_cols = [
    'Age',
    'SibSp',
    'Parch',
    'LogFare',
    'TicketFreq',
    'Family',
]
# Target column(s).
y_cols = ['Survived']

In [116]:
# Training features and target, copied so later edits never touch the source frame.
df_x = df_original[categorical_cols + numerical_cols].copy()
df_y = df_original[y_cols].copy()

# Same feature columns for the submission set.
# (The former `df_submission[categorical_cols] = df_submission[categorical_cols]`
# self-assignment was a no-op and has been removed along with the commented-out casts.)
df_submission = df_submission_original[categorical_cols + numerical_cols].copy()


### Preprocessing Categorical Variables

In [117]:
# Passenger count per ticket class in the submission set, ordered by class.
df_submission['Pclass'].value_counts(sort=False).sort_index()

Pclass
1    107
2     93
3    218
Name: count, dtype: int64

In [118]:
## One-hot encode the categorical variables
# The encoder is fitted on the training features only and then reused for the
# submission set; drop='first' removes one indicator column per variable.
encoder = OneHotEncoder(drop='first')
encoded_train = encoder.fit_transform(df_x[categorical_cols])
encoded_test = encoder.transform(df_submission[categorical_cols])

# Dense DataFrames labelled with the encoder's generated column names
encoded_names = encoder.get_feature_names_out(categorical_cols)
df_train_encoded = pd.DataFrame(encoded_train.toarray(), columns=encoded_names)
df_test_encoded = pd.DataFrame(encoded_test.toarray(), columns=encoded_names)

# Replace the raw categorical columns with their encoded counterparts
# (remaining columns first, encoded columns appended on the right).
# NOTE: this cell rebinds df_x / df_submission, so it must be run exactly once.
df_x = pd.concat([df_x.drop(columns=categorical_cols), df_train_encoded], axis=1)
df_submission = pd.concat([df_submission.drop(columns=categorical_cols), df_test_encoded], axis=1)

### Standardizing the data

In [128]:
from sklearn.preprocessing import StandardScaler

# Create an instance of the StandardScaler
scaler = StandardScaler()

# Standardize the numerical features: fit on the training data only, then
# apply the same mean/scale to the submission set.
# The scaled arrays are assigned directly (positionally). Wrapping them in a
# new DataFrame first would align on the index and silently produce NaN if
# either frame ever had a non-default index.
# NOTE: this overwrites the columns in place, so re-running the cell rescales
# data that is already standardized; run it once after the encoding cell.
df_x[numerical_cols] = scaler.fit_transform(df_x[numerical_cols])
df_submission[numerical_cols] = scaler.transform(df_submission[numerical_cols])

# Pytorch

## Prepare Datasets and Data Loaders

In [120]:
# Create Dataset Class
class CreateFeaturesDataset(Dataset):
    """Dataset over a DataFrame whose last column is the label.

    The whole frame is converted once to a float32 tensor stored in
    ``self.data``; every column except the last is treated as a feature.
    """

    def __init__(self, df):
        """Convert ``df`` (features followed by the label column) to a tensor."""
        super().__init__()
        values = df.to_numpy()
        self.data = torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``."""
        features, label = self.data[idx, :-1], self.data[idx, -1]
        return features, label

## Model

In [121]:
# Build Model class
class Net(nn.Module):
    """Fully connected network: n_x -> 16 -> 8 -> n_y.

    ReLU follows the two hidden layers and a sigmoid follows the output
    layer, so every output lies in [0, 1].
    """

    def __init__(self, n_x, n_y):
        """Build the three linear layers.

        Args:
            n_x: number of input features.
            n_y: number of output units.
        """
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(n_x, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, n_y)

    def forward(self, x):
        """Map a batch of feature rows to sigmoid outputs with ``n_y`` columns."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

## Training

In [122]:
from torch.autograd import Variable
# The comments below are my own and may be wrong: I wrote them while I was still learning.
def train(model, device, train_dataloader, optim, epoch,criterion):
    """Run one pass over ``train_dataloader``, updating ``model`` in place.

    Args:
        model: network to train; column 0 of its output is passed to the loss.
        device: device the batches are moved to before the forward pass.
        train_dataloader: iterable yielding ``(features, labels)`` batches.
        optim: optimizer holding the model parameters (inside this function
            the name shadows the ``torch.optim`` module alias).
        epoch: epoch number, used only in the progress log.
        criterion: loss callable invoked as ``criterion(prediction, target)``
            with both arguments cast to float64.

    Returns:
        None. Progress is printed every 10th batch.
    """
    model.train()  # switch the module to training mode
    n_batches = len(train_dataloader)
    n_samples = len(train_dataloader.dataset)
    log_template = 'epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'

    for batch_idx, (inputs, targets) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optim.zero_grad()          # clear gradients left over from the previous batch
        probs = model(inputs)      # forward propagation
        loss = criterion(probs[:, 0].double(), targets.double())
        loss.backward()            # backward propagation: compute gradients
        optim.step()               # update the parameters

        # Progress log on every 10th batch
        if batch_idx % 10 != 0:
            continue
        seen = batch_idx * len(inputs)
        percent = 100. * batch_idx / n_batches
        print(log_template.format(epoch, seen, n_samples, percent, loss.item()))

In [123]:
# Device: use the GPU when one is available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch so weight initialisation and batch shuffling are reproducible
SEED = 42
torch.manual_seed(SEED)

# Instantiate Dataset: features and target side by side, target in the LAST
# column (CreateFeaturesDataset treats the last column as the label)
df_x_y = pd.concat([df_x,df_y],axis=1)
dataset_train = CreateFeaturesDataset(df_x_y)

# Instantiate DataLoader
dataloader_train = DataLoader(
    dataset_train,
    batch_size=32,
    shuffle=True,
)

# Instantiate model: one sigmoid output unit = probability of survival
model = Net(df_x.shape[1],1).to(device)

# Instantiate Loss and Optimizer
# The network emits a single sigmoid probability per passenger, so the
# matching loss is binary cross-entropy. nn.CrossEntropyLoss expects per-class
# scores; given the 1-D prediction/target vectors used in training it would
# treat the whole batch as one sample's class distribution, which is not a
# per-passenger classification loss.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [124]:
## Model training
# One call to train() per epoch; epochs are numbered starting at 1.
N_EPOCHS = 2
for epoch in range(1, N_EPOCHS + 1):
    train(model, device, dataloader_train, optimizer, epoch, criterion)



In [125]:
# Saving model parameters
# Create the target directory first: torch.save does not create missing
# directories, so a fresh checkout without ./models would otherwise fail here.
MODEL_PARAMS_PATH = './models/model-params.pth'
os.makedirs(os.path.dirname(MODEL_PARAMS_PATH), exist_ok=True)
torch.save(model.state_dict(), MODEL_PARAMS_PATH)

# Final predictions and submission

In [126]:
# Loading model parameters into a freshly constructed network
model = Net(df_x.shape[1],1).to(device)
# map_location lets a checkpoint saved on one device be loaded on another
# (e.g. saved on GPU, loaded on CPU); weights_only=True restricts unpickling
# to tensors and plain containers, which is all a state_dict needs.
state_dict = torch.load('./models/model-params.pth', map_location=device, weights_only=True)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [131]:
# Predict survival for every row of df_submission
X_submission = torch.tensor(df_submission.to_numpy(),dtype=torch.float32).to(device)

model.eval()  # inference mode
with torch.no_grad():  # no gradients are needed for prediction
    probs_submission = model(X_submission)

# Round each probability to a 0/1 label and flatten the (n, 1) output to 1-D
predictions_submission = probs_submission.round().cpu().numpy()[:,0]

# Submission frame: one predicted label per PassengerId
df_final = pd.DataFrame({
    'PassengerId': df_submission_original['PassengerId'],
    'Survived': predictions_submission.astype(int)
})

In [133]:
# Write the submission frame as CSV without the index column, then preview
# the first lines with the shell's `head`.
# NOTE(review): the file name has no `.csv` extension, and the
# `submission_csvs` directory must already exist.
df_final.to_csv('submission_csvs/sub5_pytorch_pure',index=False)
!head submission_csvs/sub5_pytorch_pure

PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,0


In [134]:
# Upload the submission file to the Kaggle "titanic" competition via the Kaggle CLI
# (-c competition, -f file to upload, -m submission message).
!kaggle competitions submit -c titanic -f submission_csvs/sub5_pytorch_pure -m "Simple Net with Pytorch"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 5.22kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster