# Submission 1. PyTorch neural network with new features

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [30]:
# Python language version used to run this notebook
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.12.1


In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torchmetrics import Accuracy

ModuleNotFoundError: No module named 'torchmetrics'

In [32]:
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -a "Henrique Menegaz" --iversions

Author: Henrique Menegaz

matplotlib: 3.8.2
pandas    : 2.1.4
numpy     : 1.26.3
seaborn   : 0.13.1
torch     : 2.2.1



In [33]:
# Select the compute device: "cuda" when a CUDA device is available, otherwise "cpu".
# NOTE(review): `device` is not referenced by any later cell visible in this notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [34]:
# Read the training and submission (test) CSV files into pandas DataFrames.
# Both paths are relative, so the files are looked up in the current working directory.
df_original = pd.read_csv('train.csv')
df_submission_original = pd.read_csv('test.csv')

# Preprocessing

### Filling Missing values

In [35]:
# Count missing values per column in the training frame
df_original.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [36]:
# Most frequent value of every training column (first row returned by DataFrame.mode())
modes = df_original.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [37]:
# Impute missing values with the per-column modes of the training data.
# The submission frame is filled with the same training-derived modes so both
# frames go through identical preprocessing; otherwise any missing values in it
# would survive as NaN into the feature engineering, encoding and scaling steps.
# (fillna ignores keys of `modes` that are not columns of the frame.)
df_original.fillna(modes, inplace=True)
df_submission_original.fillna(modes, inplace=True)

In [38]:
# Re-count missing values per column of the training frame after imputation
df_original.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [39]:
def add_features(df):
    """Add engineered feature columns to ``df`` in place.

    Reads the ``Fare``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket`` and ``Name``
    columns and writes, in this order:

    - ``LogFare``: ``log1p`` of ``Fare``.
    - ``Deck``: first character of ``Cabin`` mapped to the groups "ABC", "DE"
      or "FG"; any other value becomes NaN.
    - ``Family``: ``SibSp + Parch``.
    - ``Alone``: True where ``Family`` is 0.
    - ``TicketFreq``: number of rows in ``df`` sharing the same ``Ticket``.
    - ``Title``: text between ", " and the first "." of ``Name``, kept only
      when it is one of Mr, Miss, Mrs or Master (NaN otherwise).

    Returns None; the frame passed in is modified.
    """
    deck_groups = {
        "A": "ABC", "B": "ABC", "C": "ABC",
        "D": "DE", "E": "DE",
        "F": "FG", "G": "FG",
    }
    kept_titles = {title: title for title in ("Mr", "Miss", "Mrs", "Master")}

    df['LogFare'] = np.log1p(df['Fare'])

    cabin_letter = df['Cabin'].str[0]
    df['Deck'] = cabin_letter.map(deck_groups)

    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # "Surname, Title. Given names" -> "Title. Given names" -> "Title"
    after_surname = df['Name'].str.split(', ', expand=True)[1]
    raw_title = after_surname.str.split('.', expand=True)[0]
    df['Title'] = raw_title.map(kept_titles)

# Add the engineered columns to both frames; each frame is modified in place,
# and TicketFreq is counted separately within each frame.
add_features(df_original)
add_features(df_submission_original)

In [40]:
# Feature groups: categorical columns are one-hot encoded later,
# numerical columns are standardized later.
categorical_cols = ["Sex","Pclass","Embarked","Deck", "Title"]
numerical_cols = ['Age', 'SibSp', 'Parch', 'LogFare',
                 'Alone', 'TicketFreq', 'Family']
# Target column
y_cols = ['Survived']

In [41]:
# Feature matrix and target from the training frame. Copies are taken so that
# later edits to df_x / df_y leave df_original untouched.
df_x = df_original[categorical_cols + numerical_cols].copy()
df_y = df_original[y_cols].copy()

# Same feature columns for the submission frame
df_submission = df_submission_original[categorical_cols + numerical_cols].copy()


### Preprocessing Categorical Variables

In [42]:
# Number of submission rows per ticket class, ordered by class
df_submission['Pclass'].value_counts().sort_index()

Pclass
1    107
2     93
3    218
Name: count, dtype: int64

In [43]:
## One-hot encode the categorical variables
# Create the encoder; drop='first' omits the first category of each feature
encoder = OneHotEncoder(drop='first')

# Fit the encoder on the training categories only, then apply the same fitted
# encoding to the submission data
encoded_train = encoder.fit_transform(df_x[categorical_cols])
encoded_test = encoder.transform(df_submission[categorical_cols])

# Convert the encoder outputs to dense arrays and wrap them in DataFrames
# (these get a default RangeIndex)
df_train_encoded = pd.DataFrame(encoded_train.toarray())
df_test_encoded = pd.DataFrame(encoded_test.toarray())

# Name the encoded columns after the fitted encoder's output features
df_train_encoded.columns = encoder.get_feature_names_out(categorical_cols)
df_test_encoded.columns = encoder.get_feature_names_out(categorical_cols)

# Append the encoded columns to the feature frames.
# NOTE(review): concat(axis=1) aligns rows on the index, so this assumes df_x and
# df_submission also carry a default RangeIndex — confirm if rows are ever filtered.
df_x = pd.concat([df_x, df_train_encoded], axis=1)
df_submission = pd.concat([df_submission, df_test_encoded], axis=1)

# Drop the original categorical columns now that their encoded versions exist.
# Re-running this cell after the drop fails because the columns are gone.
df_x.drop(categorical_cols, axis=1, inplace=True)
df_submission.drop(categorical_cols, axis=1, inplace=True)
df_x

Unnamed: 0,Age,SibSp,Parch,LogFare,Alone,TicketFreq,Family,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_nan,Title_Miss,Title_Mr,Title_Mrs,Title_nan
0,22.0,1,0,2.110213,False,1,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38.0,1,0,4.280593,False,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,26.0,0,0,2.188856,True,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,35.0,1,0,3.990834,False,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,35.0,0,0,2.202765,True,1,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,2.639057,True,1,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,19.0,0,0,3.433987,True,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
888,24.0,1,2,3.196630,False,2,3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,26.0,0,0,3.433987,True,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Standardizing the data

In [44]:
from sklearn.preprocessing import StandardScaler

# Create an instance of the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training numerical features only and reuse the fitted
# statistics for the submission data.
scaled_train = scaler.fit_transform(df_x[numerical_cols])
scaled_submission = scaler.transform(df_submission[numerical_cols])

# Wrap each scaled array with the index of the frame it is written back to:
# column assignment from a DataFrame aligns on the index, so a default RangeIndex
# would silently produce NaN rows if the target frame were ever re-indexed or filtered.
df_x[numerical_cols] = pd.DataFrame(scaled_train, columns=numerical_cols, index=df_x.index)
df_submission[numerical_cols] = pd.DataFrame(scaled_submission, columns=numerical_cols, index=df_submission.index)
df_x[numerical_cols].head()

Unnamed: 0,Age,SibSp,Parch,LogFare,Alone,TicketFreq,Family
0,-0.497793,0.432793,-0.473674,-0.879741,-1.231645,-0.579162,0.05916
1,0.715048,0.432793,-0.473674,1.36122,-1.231645,-0.579162,0.05916
2,-0.194583,-0.474545,-0.473674,-0.79854,0.811922,-0.579162,-0.560975
3,0.48764,0.432793,-0.473674,1.062038,-1.231645,0.155928,0.05916
4,0.48764,-0.474545,-0.473674,-0.784179,0.811922,-0.579162,-0.560975


# PyTorch

In [45]:
# Create Dataset Class
class CreateFeaturesDataset(Dataset):
	"""Map-style dataset over a DataFrame whose last column is the label.

	Each item is a ``(features, label)`` pair taken from one row: every column
	except the last, and the last column.
	"""

	def __init__(self, df):
		"""Store the rows of ``df`` as a float32 NumPy array.

		float32 is requested explicitly because a plain ``to_numpy()`` on
		float64/int64 columns yields float64, and batches built from it are
		double tensors that float32 ``nn.Linear`` layers reject.
		"""
		super().__init__()
		self.data = df.to_numpy(dtype=np.float32)

	def __len__(self):
		"""Return the number of rows."""
		return self.data.shape[0]

	def __getitem__(self, idx):
		"""Return ``(features, label)`` for row ``idx``."""
		features = self.data[idx, :-1]
		label = self.data[idx, -1]
		return features, label

In [51]:
# Instantiate the Dataset from the features and target joined column-wise,
# with the target as the last column (the position the Dataset reads labels from)
df_x_y = pd.concat([df_x,df_y],axis=1)
dataset_train = CreateFeaturesDataset(df_x_y)

In [None]:
# Create DataLoaders
# Hold out part of the labelled data so the evaluation cell has a
# `dataloader_test` to iterate over; the split is seeded for reproducibility.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
n_test = int(len(dataset_train) * TEST_FRACTION)
n_train = len(dataset_train) - n_test
subset_train, subset_test = torch.utils.data.random_split(
	dataset_train,
	[n_train, n_test],
	generator=torch.Generator().manual_seed(SPLIT_SEED),
)

dataloader_train = DataLoader(
	subset_train,
	batch_size=32,
	shuffle=True,
)
# No shuffling needed for evaluation
dataloader_test = DataLoader(
	subset_test,
	batch_size=32,
	shuffle=False,
)

In [None]:
# Build Model class
class Net(nn.Module):
	"""Feed-forward binary classifier: in_features -> 16 -> 8 -> 1 with a sigmoid output."""

	def __init__(self, in_features=19):
		"""Build the three linear layers.

		Args:
			in_features: Number of values in one input row. Defaults to 19, the
				number of feature columns in the preprocessed ``df_x`` displayed
				earlier in this notebook (7 numerical + 12 one-hot columns); the
				previously hard-coded 9 did not match that frame.
		"""
		super().__init__()
		self.fc1 = nn.Linear(in_features, 16)
		self.fc2 = nn.Linear(16, 8)
		self.fc3 = nn.Linear(8, 1)

	def forward(self, x):
		"""Map ``x`` of shape ``(batch, in_features)`` to sigmoid outputs of shape ``(batch, 1)``."""
		x = torch.relu(self.fc1(x))
		x = torch.relu(self.fc2(x))
		# torch.sigmoid replaces the deprecated nn.functional.sigmoid
		return torch.sigmoid(self.fc3(x))

In [None]:
# Instantiate the model with its default constructor arguments
net = Net()

In [None]:
# Binary cross-entropy loss (the model already applies a sigmoid to its output),
# minimized with plain SGD at a learning rate of 0.01
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [None]:
# Training Loop
net.train()  # make sure the model is in training mode
for epoch in range(1000):
	for features, labels in dataloader_train:
		# Cast to float32: batches built from a float64 NumPy array arrive as double
		# tensors, while the model parameters are float32 and BCELoss needs the
		# target dtype to match the output dtype.
		features = features.float()
		targets = labels.float().view(-1, 1)

		optimizer.zero_grad()
		outputs = net(features)
		loss = criterion(outputs, targets)
		loss.backward()
		optimizer.step()

In [None]:
# Binary accuracy metric; `Accuracy` is the name imported from torchmetrics in the
# import cell, so this line needs that import to have succeeded.
acc = Accuracy(task="binary")

In [None]:
# Model Evaluation
# Accuracy is computed with plain torch operations so this cell does not depend
# on the torchmetrics package being installed.
# Expects `dataloader_test` to yield (features, labels) batches in the same
# layout as `dataloader_train`.
net.eval()
n_correct = 0
n_seen = 0
with torch.no_grad():
	for features, labels in dataloader_test:
		# Cast to float32 so double-precision batches match the model parameters
		outputs = net(features.float())
		preds = (outputs >= 0.5).float()
		targets = labels.float().view(-1, 1)
		n_correct += (preds == targets).sum().item()
		n_seen += targets.numel()

# Guard against an empty loader instead of dividing by zero
accuracy = n_correct / n_seen if n_seen else float("nan")
print(f"Accuracy: {accuracy}")