# XGBoost

XGBoost with the following configuration reached an accuracy of 0.758.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [4]:
# Labelled training data: column 0 is later used as the target, the remaining columns as features.
dataset = pd.read_csv('dataset.csv')
# Samples to predict for the submission; every column is later used as a feature.
testset = pd.read_csv('testset.csv')
# Needed for the final submission file (only its PassengerId column is read).
# NOTE(review): presumably row-aligned with testset.csv — confirm.
passenger_ids = pd.read_csv('test.csv')

In [5]:
def transform_columns(X, test_data, trasformed_cols):
    """One-hot encode the selected columns of the training and test features.

    The encoder is fitted on ``X`` only and then applied to ``test_data``, so
    both outputs share the same column layout. Columns that are not listed in
    ``trasformed_cols`` are passed through unchanged.

    Args:
        X: 2-D array-like of training features.
        test_data: 2-D array-like with the same column layout as ``X``.
        trasformed_cols: Indices of the columns to one-hot encode. (The
            misspelled name is kept so existing keyword callers keep working.)

    Returns:
        A tuple ``(X, test_data)`` of dense NumPy arrays.
    """
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), trasformed_cols)],
        remainder='passthrough',
        # Force a dense result. With the default threshold the transformer may
        # return a SciPy sparse matrix when many one-hot columns are produced,
        # and np.array() on a sparse matrix yields a 0-d object array instead
        # of a 2-D feature matrix.
        sparse_threshold=0,
    )

    X = np.array(ct.fit_transform(X))
    test_data = np.array(ct.transform(test_data))

    return X, test_data

def standardize(X_train, X_test, test_data):
    """Scale three feature sets with one StandardScaler fitted on ``X_train``.

    Args:
        X_train: Training features; the only data the scaler is fitted on.
        X_test: Held-out features, transformed with the training statistics.
        test_data: Submission features, transformed with the training statistics.

    Returns:
        A tuple ``(X_train, X_test, test_data)`` of the scaled arrays.
    """
    scaler = StandardScaler().fit(X_train)
    # Apply the same fitted scaler to every set so they stay comparable.
    return tuple(scaler.transform(features) for features in (X_train, X_test, test_data))


def create_xg_boost():
    """Build the unfitted XGBoost classifier used in this notebook.

    Returns:
        An ``XGBClassifier`` configured for binary classification with the
        fixed hyperparameters below.
    """
    return XGBClassifier(
        learning_rate=0.07,
        n_estimators=190,
        max_depth=5,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.6,
        colsample_bytree=1.0,
        objective='binary:logistic',
        # n_jobs / random_state are the scikit-learn-API names for the legacy
        # nthread / seed aliases; same values, so the model is unchanged.
        n_jobs=4,
        scale_pos_weight=1,
        random_state=101,
    )

def preds_to_file(test_all, preds, filename):
    """Write a Kaggle submission CSV and print a confirmation.

    Args:
        test_all: Object exposing a ``PassengerId`` attribute (e.g. a DataFrame
            with that column).
        preds: Predicted values, one per passenger id.
        filename: Path of the CSV file to write.
    """
    submission = pd.DataFrame(
        {
            'PassengerId': test_all.PassengerId,
            'Survived': preds,
        }
    )
    # index=False keeps the file to exactly the two submission columns.
    submission.to_csv(filename, index=False)
    print("Your submission was successfully saved!", filename)

In [6]:
# Column 0 of the training frame is the target; all other columns are features.
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:].values
# Every column of the test frame is a feature.
test_data = testset.values

In [7]:
# Inspect the first training sample to see which feature positions hold categorical values.
X[0]

array([3, 'male', 22.0, 1, 0, 7.25, 'S', 7.0, 'group_2', 1, 'not_alone',
       'Mr'], dtype=object)

Columns 0, 1, 6, 8, 10, and 11 hold categorical values and must be one-hot encoded before training.

In [8]:
# One-hot encode the categorical feature positions.
# NOTE: this overwrites X and test_data, so re-running this cell without first
# re-running the cell that builds them would encode already-encoded data.
X, test_data = transform_columns(X, test_data, [0,1,6,8,10,11])

In [9]:
# Hold out 30% of the labelled data; the fixed random_state keeps the split reproducible.
# NOTE(review): y_test is not used for scoring in the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

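Standardize the features: the scaler is fitted on the training split only and then applied to the held-out split and the submission data. (Tree-based models such as XGBoost do not strictly require feature scaling, so this step is optional.)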

In [10]:
# The scaler is fitted on X_train only; the held-out split and the submission data
# are transformed with the training statistics.
X_train, X_test, test_data = standardize(X_train, X_test, test_data)

In [11]:
# Fresh, unfitted classifier with the hyperparameters fixed in create_xg_boost().
model = create_xg_boost()

In [12]:
# Train on the standardized training split; the returned estimator's repr is the cell output.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.07, max_delta_step=0, max_depth=5,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=190, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=101, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=101, subsample=0.6, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [13]:
# Predict labels for the submission data, which was encoded and standardized in the earlier cells.
predictions = model.predict(test_data)

In [14]:
# Pair the predictions with the PassengerId column loaded from test.csv and write the submission CSV.
preds_to_file(passenger_ids, predictions, 'titanic_xgboost_01.csv')

Your submission was successfully saved! titanic_xgboost_01.csv
