## Data loading

In [95]:
import numpy as np
import pandas as pd
import seaborn as sns


In [96]:
from sklearn.model_selection import train_test_split

dataset_path = './data/train.csv'

# Read in data
X_full = pd.read_csv("./data/train.csv", index_col='PassengerId')
X_test_full = pd.read_csv("./data/test.csv", index_col='PassengerId')

# Extract target
y = X_full.Survived
X_full.drop(columns=['Survived'], inplace=True)


In [97]:
# Split into training and testing data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Find the catagorical and numerical column types
catagorical_columns = [
    col for col in X_train_full.columns if X_train_full[col].nunique() < 10
    and X_train_full[col].dtype == 'object']
numerical_columns = [
    col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

columns_used = catagorical_columns + numerical_columns

X_train = X_train_full[columns_used]
X_valid = X_valid_full[columns_used]
X_test = X_test_full[columns_used]


In [98]:
X_train.head()


Unnamed: 0_level_0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
141,female,C,3,,0,2,15.2458
440,male,S,2,31.0,0,0,10.5
818,male,C,2,31.0,1,1,37.0042
379,male,C,3,20.0,0,0,4.0125
492,male,S,3,21.0,0,0,7.25


In [99]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'),),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, catagorical_columns)
    ])


In [100]:
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=500, learning_rate=0.1,
                     early_stopping_rounds=10, n_jobs=4)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                      ])

clf.fit(X_train, y_train, model__eval_set=[
        (preprocessor.fit_transform(X_valid), y_valid)], model__verbose=4)

preds = clf.predict(X_valid)

((preds > 0.5).astype(int) == y_valid).sum() / y_valid.size  # type: ignore


[0]	validation_0-rmse:0.47373
[4]	validation_0-rmse:0.40489
[8]	validation_0-rmse:0.36839
[12]	validation_0-rmse:0.35048
[16]	validation_0-rmse:0.34460


[20]	validation_0-rmse:0.34434
[24]	validation_0-rmse:0.34432
[28]	validation_0-rmse:0.34476
[32]	validation_0-rmse:0.34326
[36]	validation_0-rmse:0.34155
[40]	validation_0-rmse:0.34155
[44]	validation_0-rmse:0.34124
[48]	validation_0-rmse:0.34108
[52]	validation_0-rmse:0.34199
[55]	validation_0-rmse:0.34194


0.8603351955307262

In [101]:
final_model = XGBRegressor(n_estimators=56, learning_rate=0.1, n_jobs=4)

final_pipline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', final_model)
                                ])
final_pipline.fit(X_full, y)
predictions = (final_pipline.predict(X_test) > 0.5).astype(int).flatten()


In [103]:
output = pd.DataFrame(
    {'PassengerId': X_test.index, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
