# Download the data

In [1]:
import os
from pathlib import Path
import subprocess
import sklearn

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle: path = Path('../input/titanic')
else:
    path = Path('titanic')
    if not path.exists():
        import zipfile,kaggle
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
import torch, numpy as np, pandas as pd
np.set_printoptions(linewidth=140)
torch.set_printoptions(linewidth=140, sci_mode=False, edgeitems=7)
pd.set_option('display.width', 140)

# Explore the Data

In [2]:
titanic_df = pd.read_csv(path/'train.csv')
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
titanic_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Impute missing values with the median

In [5]:
titanic_df = titanic_df.fillna(titanic_df.mode().iloc[0])

In [6]:
titanic_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

# One-Hot encode categorical variables

In [7]:
categories = ['Pclass', 'Sex', 'Embarked']

In [8]:
titanic_df = pd.get_dummies(titanic_df, columns=categories)

In [9]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [10]:
y = torch.tensor(titanic_df['Survived'])
y

tensor([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 

In [11]:
design = ['Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

In [12]:
titanic_df = titanic_df[design].astype(int)

In [13]:
X = torch.tensor(titanic_df[design].values)
X

tensor([[22,  1,  0,  7,  1,  0,  0,  0,  1,  0,  0,  1],
        [38,  1,  0, 71,  0,  1,  1,  0,  0,  1,  0,  0],
        [26,  0,  0,  7,  0,  1,  0,  0,  1,  0,  0,  1],
        [35,  1,  0, 53,  0,  1,  1,  0,  0,  0,  0,  1],
        [35,  0,  0,  8,  1,  0,  0,  0,  1,  0,  0,  1],
        [24,  0,  0,  8,  1,  0,  0,  0,  1,  0,  1,  0],
        [54,  0,  0, 51,  1,  0,  1,  0,  0,  0,  0,  1],
        ...,
        [25,  0,  0,  7,  1,  0,  0,  0,  1,  0,  0,  1],
        [39,  0,  5, 29,  0,  1,  0,  0,  1,  0,  1,  0],
        [27,  0,  0, 13,  1,  0,  0,  1,  0,  0,  0,  1],
        [19,  0,  0, 30,  0,  1,  1,  0,  0,  0,  0,  1],
        [24,  1,  2, 23,  0,  1,  0,  0,  1,  0,  0,  1],
        [26,  0,  0, 30,  1,  0,  1,  0,  0,  1,  0,  0],
        [32,  0,  0,  7,  1,  0,  0,  0,  1,  0,  1,  0]])

In [14]:
X.shape

torch.Size([891, 12])

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.float()
y_test = y_test.float()

In [17]:
# Initialize the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [18]:
# Train the model
xgb_model.fit(x_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [19]:
# Make predictions
y_pred = xgb_model.predict(x_test)

In [20]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8212


# Prep submission

In [29]:
test_df = pd.read_csv(path/'test.csv')
test_df = test_df.fillna(test_df.mode().iloc[0])

test = pd.get_dummies(test_df, columns=categories)

test= test[design].astype(int)
test = torch.tensor(test[design].values)


In [30]:
test_pred = xgb_model.predict(test)

In [31]:

tensor_series = pd.Series(test_pred, name='Survived')  

tensor_series

0      0
1      0
2      0
3      1
4      1
      ..
413    0
414    1
415    0
416    0
417    1
Name: Survived, Length: 418, dtype: int64

In [32]:
# Concatenate DataFrame and Series side by side
merged_df = pd.concat([test_df['PassengerId'], tensor_series], axis=1)

In [33]:
merged_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [34]:
merged_df = merged_df.astype(int)

In [35]:
filename = 'submission_4.csv'

merged_df.to_csv(filename, index=False)

In [36]:


def submit_to_kaggle(filepath, comp_name, message):
    
    file_path = filepath
    competition_name = comp_name 

    # Command to submit to Kaggle
    command = f'kaggle competitions submit -c {competition_name} -f {file_path} -m "{message}"'

 
    try:
        output = subprocess.check_output(command, shell=True)
        print(output.decode('utf-8'))  
        print("Submission successful!")
    except subprocess.CalledProcessError as e:
        print("Submission failed. Error:", e)


In [37]:
submit_to_kaggle('submission_4.csv', 'titanic', 'Try 4. XGB Model')

100%|██████████| 2.77k/2.77k [00:01<00:00, 1.80kB/s]


Successfully submitted to Titanic - Machine Learning from Disaster
Submission successful!
