# Titanic: Machine Learning from Disaster — XGBoost

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
# Read the Titanic training and test CSVs from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

Determine if there are any missing values.

In [7]:
# Per-column missing-value counts for both frames, shown side by side.
# The target column is excluded from the training counts.
train_null = train.drop(columns='Survived').isna().sum()
test_null = test.isna().sum()
pd.DataFrame({'train_null': train_null, 'test_null': test_null})

Unnamed: 0,train_null,test_null
PassengerId,0,0
Pclass,0,0
Name,0,0
Sex,0,0
Age,177,86
SibSp,0,0
Parch,0,0
Ticket,0,0
Fare,0,1
Cabin,687,327


In [8]:
def fill_age(df):
    """Impute missing ``Age`` values in place with the mean age of the passenger's class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Pclass`` and ``Age`` columns. It is modified in place.

    Returns
    -------
    None
    """
    # Select ``Age`` *before* aggregating: averaging the whole frame would also
    # try to average text columns, which raises TypeError on pandas >= 2.0.
    class_mean_age = df.groupby('Pclass')['Age'].mean()

    # Only rows with a missing age are touched; known ages are left as they are.
    missing = df['Age'].isnull()
    df.loc[missing, 'Age'] = df.loc[missing, 'Pclass'].map(class_mean_age)

def fill_cabin(df):
    """Replace missing ``Cabin`` entries in place with the column's most frequent value.

    Returns None; ``df`` itself is updated.
    """
    most_frequent_cabin = df['Cabin'].mode()[0]
    df['Cabin'] = df['Cabin'].fillna(most_frequent_cabin)

def fill_embarked(df):
    """Replace missing ``Embarked`` entries in place with the column's most frequent value.

    Returns None; ``df`` itself is updated.
    """
    most_frequent_port = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_frequent_port)

# Impute each frame independently: every fill_* helper derives its fill value
# from the frame it is handed and updates that frame in place.
for frame in (train, test):
    fill_age(frame)
    fill_cabin(frame)
    fill_embarked(frame)

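Feature selection and engineering: drop the columns that are not used as model inputs, add a log-transformed fare, and one-hot encode the categorical columns.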

In [9]:
def feature_eng(df):
    """Build the model-ready feature frame from a raw passenger frame.

    Steps:
      1. Drop ``PassengerId``, ``Name``, ``Ticket``, ``Cabin`` and ``SibSp``.
      2. Add ``LogFare`` = log(1 + ``Fare``).
      3. One-hot encode ``Sex``, ``Pclass`` and ``Embarked``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger frame containing the columns named above plus ``Fare``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the dropped columns removed, ``LogFare`` added and the
        three categorical columns replaced by indicator columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    # Work on a new frame so the caller's data is left intact.
    features = df.drop(columns=unused_columns)

    # Everything below is derived from the frame that was passed in (not from a
    # notebook-level variable), so the same function works for any passenger frame.
    features['LogFare'] = np.log1p(features['Fare'])

    return pd.get_dummies(features, columns=['Sex', 'Pclass', 'Embarked'])

# Rebind `train` to its engineered feature frame. Only the training frame is
# transformed here; `test` is not passed through feature_eng in this notebook.
# NOTE: re-running this cell without reloading the data fails, because the
# columns feature_eng drops are no longer present in `train`.
train = feature_eng(train)

### XGBoost
- XGBoost is a gradient-boosting library built for efficient, scalable training of boosted decision-tree ensembles.

In [10]:
# Target vector is the Survived column; the feature matrix is every other column.
y = train['Survived']
X = train.drop(columns='Survived')

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
import xgboost as xgb

# Gradient-boosted tree classifier for the binary Survived target.
# learning_rate is not set, so the library default applies; an explicit 0.02
# setting was previously present but disabled.
model = xgb.XGBClassifier(
    n_estimators=2000,             # number of boosting rounds
    max_depth=4,                   # maximum depth of each tree
    min_child_weight=2,
    gamma=0.9,                     # minimum loss reduction required to split
    subsample=0.8,                 # row sampling ratio per tree
    colsample_bytree=0.8,          # column sampling ratio per tree
    objective='binary:logistic',
    n_jobs=-1,                     # estimator-level thread setting (replaces the legacy `nthread` alias)
    scale_pos_weight=1,
    random_state=0,                # pin the sampling seed explicitly; 0 is the value the fitted estimator reports
)

In [15]:
# Train the classifier on the training portion of the split. The fitted
# estimator is the cell's last expression, so its repr is shown as output.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.9, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=2000,
              n_jobs=-1, nthread=-1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [17]:
# Predict class labels for the held-out rows.
predictions = model.predict(X_test)

In [19]:
# Sanity check: the prediction array should have one entry per held-out row.
predictions.shape

(295,)

In [20]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.7864406779661017