In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
from xgboost import XGBClassifier
from sklearn import preprocessing
from collections import defaultdict

In [6]:
# Read the training and test splits from CSV files in the working directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [7]:
# Preview the first five rows of the training data.
train_dataset.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Preview the first five rows of the test data.
test_dataset.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
# Report, per column, whether the training data contains any missing value.
print(train_dataset.isna().any())

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool


In [10]:
# Discard the training rows whose Embarked value is missing.
train_dataset = train_dataset.dropna(axis="index", how="any", subset=["Embarked"])

In [11]:
# Replace missing Age values with the mean Age of the remaining training rows.
age_column = train_dataset["Age"]
avg_age = age_column.mean()
train_dataset["Age"] = age_column.fillna(avg_age)

In [12]:
# One LabelEncoder per column, created lazily on first access, so the fitted
# encoders can be looked up again by column name later in the notebook.
encoder = defaultdict(preprocessing.LabelEncoder)
for column in ("Sex", "Embarked"):
    # Fit on the training values and replace the strings with integer codes.
    train_dataset[column] = encoder[column].fit_transform(train_dataset[column])

In [14]:
# Drop the identifier and free-text columns, then show how strongly each
# remaining column correlates with Survived.
test_matrix = train_dataset.drop(columns=["PassengerId", "Cabin", "Name", "Ticket"])
corr_matrix = test_matrix.corr()
print(corr_matrix.loc[:, "Survived"])

Survived    1.000000
Pclass     -0.335549
Sex        -0.541585
Age        -0.074513
SibSp      -0.034040
Parch       0.083151
Fare        0.255290
Embarked   -0.169718
Name: Survived, dtype: float64


In [15]:
# Pick the columns whose absolute correlation with Survived is above the
# threshold. The upper bound (< 1) excludes Survived's correlation with itself.
CORR_THRESHOLD = 0.1

# Work on absolute values as one vectorized Series. The previous loop read
# corr_matrix['Survived'][i] with an integer key on a label-indexed Series,
# which relies on deprecated positional fallback and emits a FutureWarning.
abs_survived_corr = corr_matrix["Survived"].abs()
is_relevant = (abs_survived_corr > CORR_THRESHOLD) & (abs_survived_corr < 1)

# Positions (plain Python ints) of the selected entries, in original order.
relevant_index_list = np.flatnonzero(is_relevant.to_numpy()).tolist()

# Map the positions back to column names of the frame the correlations came from.
column_list = test_matrix.columns
relevant_columns_list = [column_list[position] for position in relevant_index_list]
print(relevant_columns_list)

['Pclass', 'Sex', 'Fare', 'Embarked']


  corr = corr_matrix['Survived'][i]


In [16]:
# Report, per column, whether the test data contains any missing value.
print(test_dataset.isnull().any())

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool


In [17]:
# Replace missing Fare values in the test data with the mean Fare.
# NOTE(review): the mean is computed from the test split itself; consider
# using the training-set mean so that no test statistics feed preprocessing.
fare_column = test_dataset["Fare"]
avg_fare = fare_column.mean()
test_dataset["Fare"] = fare_column.fillna(avg_fare)

In [18]:
# Label-encode Sex and Embarked in the test data with the encoders that were
# already fitted on the training data. Using transform (not fit_transform)
# keeps the integer codes identical across both splits; re-fitting on the test
# split would rebuild the mapping from the test labels alone, which silently
# shifts the codes whenever the two splits do not contain the same label set.
# A label that was never seen during training now raises ValueError.
for column in ("Sex", "Embarked"):
    test_dataset[column] = encoder[column].transform(test_dataset[column])

In [19]:
# Re-check the test data for missing values after filling Fare and encoding.
# In the recorded output Age and Cabin still contain nulls at this point.
missing_by_column = test_dataset.isna().any()
print(missing_by_column)

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked       False
dtype: bool


In [21]:
# Build the model inputs: the same columns are removed from both splits, and
# the Survived column is additionally separated from the training features.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Age", "SibSp", "Parch"]
y_train = train_dataset["Survived"]
x_train = train_dataset.drop(columns=unused_columns + ["Survived"])
x_test = test_dataset.drop(columns=unused_columns)

In [22]:
# Standardize every feature column. Both statistics come from the training
# features only and are then applied unchanged to the test features.
mean = x_train.mean(axis=0)
x_train = x_train - mean
# The standard deviation is taken from the already-centred training features.
std = x_train.std(axis=0)
x_train = x_train / std
x_test = (x_test - mean) / std

In [23]:
# Fit an XGBClassifier, constructed with its default arguments, on the
# standardized training features and the Survived labels.
final_model = XGBClassifier()
final_model.fit(X=x_train, y=y_train)

In [24]:
# Predict a Survived label for every row of the standardized test features.
final_predictions = final_model.predict(X=x_test)

In [25]:
# Pair each test PassengerId with its predicted label and write the result
# to CSV without the DataFrame index.
output = test_dataset[["PassengerId"]].assign(Survived=final_predictions)
output.to_csv(path_or_buf="titanic_predictions_xgb.csv", index=False)