In [1]:
# 可视化显示魔术命令
%matplotlib inline

from __future__ import print_function
import warnings

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
import xgboost as xgb

In [2]:
# Importing the dataset
X_train = pd.read_csv("./data/train.csv")
X_test = pd.read_csv("./data/test.csv")

In [3]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Replace names with titles
X_train['Name'] = X_train['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
titles = X_train['Name'].unique()

X_test['Name'] = X_test['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
titles = X_test['Name'].unique()
titles

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [5]:
medians = dict()
for title in titles:
    median = X_train.Age[(X_train["Age"] != -1) & (X_train['Name'] == title)].median()
    medians[title] = median

In [6]:
for index, row in X_train.iterrows():
    if row['Age'] == -1:
        X_train.loc[index, 'Age'] = medians[row['Name']]

for index, row in X_test.iterrows():
    if row['Age'] == -1:
        X_test.loc[index, 'Age'] = medians[row['Name']]

In [7]:
# Replace titles with numerical values
replacement = {
    'Don': 0,
    'Rev': 0,
    'Jonkheer': 0,
    'Capt': 0,
    'Mr': 1,
    'Dr': 2,
    'Col': 3,
    'Major': 3,
    'Master': 4,
    'Miss': 5,
    'Mrs': 6,
    'Mme': 7,
    'Ms': 7,
    'Mlle': 7,
    'Sir': 7,
    'Lady': 7,
    'the Countess': 7
}

X_train['Name'] = X_train['Name'].apply(lambda x: replacement.get(x))        
X_test['Name'] = X_test['Name'].apply(lambda x: replacement.get(x))

In [8]:
# Replace missing fare with class median
X_train['Fare'].fillna(-1, inplace=True)
X_test['Fare'].fillna(-1, inplace=True)

In [9]:
medians = dict()
for pclass in X_train['Pclass'].unique():
    median = X_train.Fare[(X_train["Fare"] != -1) & (X_train['Pclass'] == pclass)].median()
    medians[pclass] = median
    
for index, row in X_train.iterrows():
    if row['Fare'] == -1:
        X_train.loc[index, 'Fare'] = medians[row['Pclass']]

for index, row in X_test.iterrows():
    if row['Fare'] == -1:
        X_test.loc[index, 'Fare'] = medians[row['Pclass']]

In [10]:
replacement = {
    6: 0,
    4: 0,
    5: 1,
    0: 2,
    2: 3,
    1: 4,
    3: 5
}
X_train['Parch'] = X_train['Parch'].apply(lambda x: replacement.get(x))
X_test['Parch'] = X_test['Parch'].apply(lambda x: replacement.get(x))

In [11]:
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')

In [12]:
replacement = {
    'S': 0,
    'Q': 1,
    'C': 2
}

X_train['Embarked'] = X_train['Embarked'].apply(lambda x: replacement.get(x))
X_test['Embarked'] = X_test['Embarked'].apply(lambda x: replacement.get(x))

In [13]:
replacement  = {
    5: 0,
    8: 0,
    4: 1,
    3: 2,
    0: 3,
    2: 4,
    1: 5
}

X_train['SibSp'] = X_train['SibSp'].apply(lambda x: replacement.get(x))
X_test['SibSp'] = X_test['SibSp'].apply(lambda x: replacement.get(x))

In [14]:
X_train['Cabin'] = X_train['Cabin'].fillna('U')
X_test['Cabin'] = X_test['Cabin'].fillna('U')

In [15]:
# Retain first letter only of cabin
X_train['Cabin'] = X_train['Cabin'].map(lambda x: x[0])
X_test['Cabin'] = X_test['Cabin'].map(lambda x: x[0])

In [16]:
replacement = {
    'T': 0,
    'U': 1,
    'A': 2,
    'G': 3,
    'C': 4,
    'F': 5,
    'B': 6,
    'E': 7,
    'D': 8
}

X_train['Cabin'] = X_train['Cabin'].apply(lambda x: replacement.get(x))
X_test['Cabin'] = X_test['Cabin'].apply(lambda x: replacement.get(x))

In [17]:
X_train['Sex'] = LabelEncoder().fit_transform(X_train['Sex'])
X_test['Sex'] = LabelEncoder().fit_transform(X_test['Sex'])

In [18]:
y_train = X_train.iloc[:, 1].values
submission = X_test.iloc[:, 0].values
submission = pd.DataFrame(submission) 
submission.columns = ['PassengerId'] 

In [19]:
# Delete redundant features
X_train = X_train.drop(X_train.columns[[1, 8]], axis=1)
X_test = X_test.drop(X_test.columns[[7]], axis=1)

In [20]:
def objective(space):

    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    classifier = xgb.XGBClassifier(n_estimators = space['n_estimators'],
                            max_depth = int(space['max_depth']),
                            learning_rate = space['learning_rate'],
                            gamma = space['gamma'],
                            min_child_weight = space['min_child_weight'],
                            subsample = space['subsample'],
                            colsample_bytree = space['colsample_bytree']
                            )
    
    classifier.fit(X_train, y_train)

    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    CrossValMean = accuracies.mean()

    print("CrossValMean:", CrossValMean)

    return{'loss':1-CrossValMean, 'status': STATUS_OK }

In [21]:
space = {
    'max_depth' : hp.choice('max_depth', range(5, 30, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(20, 205, 5)),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}

In [None]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("Best: ", best)

CrossValMean: 0.7846407899216887
CrossValMean: 0.7834916581545794
CrossValMean: 0.762130859153331
CrossValMean: 0.8037172284644194
CrossValMean: 0.8170619112473044
CrossValMean: 0.8003087050278062
CrossValMean: 0.8204204971058905
CrossValMean: 0.7812935535126547
CrossValMean: 0.8003081375553286
CrossValMean: 0.7946266031097492
CrossValMean: 0.7991851095221881
CrossValMean: 0.7913074565883554
CrossValMean: 0.7767001475428442
CrossValMean: 0.8115066394279878
CrossValMean: 0.7700087958234026
CrossValMean: 0.8059766201339235
CrossValMean: 0.8261014640789922
CrossValMean: 0.8193593235728068
CrossValMean: 0.8014808194302576
CrossValMean: 0.7912316990125978
CrossValMean: 0.7788954148223812
CrossValMean: 0.8283234025649756
CrossValMean: 0.8070375099307683
CrossValMean: 0.8249401316536147
CrossValMean: 0.8171246169560776
CrossValMean: 0.7767004312790831
CrossValMean: 0.7812824877993417
CrossValMean: 0.8070122574055159
CrossValMean: 0.8227806151401659
CrossValMean: 0.7744274202701169
CrossValMea

In [None]:
# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators = best['n_estimators'],
                            max_depth = best['max_depth'],
                            learning_rate = best['learning_rate'],
                            gamma = best['gamma'],
                            min_child_weight = best['min_child_weight'],
                            subsample = best['subsample'],
                            colsample_bytree = best['colsample_bytree']
                            )

classifier.fit(X_train, y_train)

In [None]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
CrossValMean = accuracies.mean()
print("Final CrossValMean: ", CrossValMean)

In [None]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = pd.DataFrame(y_pred) 
y_pred.columns = ['Survived'] 
submission = submission.join(y_pred) 

# Exporting dataset to csv
submission.to_csv("./data/Titanic_Submission.csv", index=False, sep=',')