In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [26]:
# Read the training and test CSV files from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [27]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Stack the feature columns (Pclass through Embarked) of train and test so
# both are preprocessed together. Each frame keeps its own row labels, so the
# combined index contains repeated labels.
all_df = pd.concat(
    [frame.loc[:, 'Pclass':'Embarked'] for frame in (train_df, test_df)]
)

In [29]:
# Summarise dtypes and non-null counts of the combined feature frame.
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [30]:
# Fill the gaps reported by all_df.info(): numeric columns get their mean,
# Embarked gets its most frequent value.
# NOTE(review): all_df holds train and test rows, so these statistics are
# computed over both sets combined.
all_df = all_df.assign(
    Age=all_df['Age'].fillna(all_df['Age'].mean()),
    Fare=all_df['Fare'].fillna(all_df['Fare'].mean()),
    Embarked=all_df['Embarked'].fillna(all_df['Embarked'].mode()[0]),
)

In [31]:
# Preview the combined frame after the missing-value fill.
all_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
# String-valued columns that are converted to integer codes below.
cat_features = ['Sex', 'Embarked']

In [33]:
# Replace every categorical column with integer codes, fitting a fresh
# encoder on that column's values (train and test rows together).
for col in cat_features:
    raw_labels = all_df[col].tolist()
    lbl = LabelEncoder().fit(raw_labels)
    all_df[col] = lbl.transform(raw_labels)

In [34]:
# Preview the frame after Sex and Embarked were label-encoded.
all_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [35]:
# Discard the free-text columns that are not used as model features.
all_df = all_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [36]:
# Preview the feature columns that remain after dropping Name, Ticket and Cabin.
all_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [37]:
# Undo the earlier concat by position: the first len(train_df) rows came from
# train.csv, everything after them from test.csv.
train, test = all_df.iloc[:len(train_df)], all_df.iloc[len(train_df):]

In [38]:
# Check dtypes and non-null counts of the training slice.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 55.7 KB


In [39]:
# Target labels for the training rows, and the passenger ids that the
# submission file is keyed on.
y, ID = train_df['Survived'], test_df['PassengerId']

In [40]:
# Hold out part of the labelled rows for validation. The split proportion is
# left at the library default; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(train, y, random_state=0)

In [41]:
import xgboost as xgb

In [58]:
# Booster settings passed to xgb.train.
params = {
    'objective': 'binary:logistic',  # binary target, probability output
    'eval_metric': 'auc',            # metric reported for each eval set
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 1,
    'colsample_bytree': 1,
    # 'silent' is deprecated and reported as an unused parameter by newer
    # XGBoost releases; 'verbosity': 0 is the documented way to mute
    # library messages.
    'verbosity': 0,
}

In [59]:
# Wrap the fitting and hold-out splits, with their labels, in DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [60]:
# Boost for at most 100 rounds, scoring the hold-out split after every round
# and stopping once its metric has not improved for 10 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10,
)

[0]	test-auc:0.886905
Will train until test-auc hasn't improved in 10 rounds.
[1]	test-auc:0.89624
[2]	test-auc:0.893243
[3]	test-auc:0.889603
[4]	test-auc:0.892857
[5]	test-auc:0.886005
[6]	test-auc:0.890673
[7]	test-auc:0.894741
[8]	test-auc:0.889603
[9]	test-auc:0.888832
[10]	test-auc:0.889431
[11]	test-auc:0.89153
Stopping. Best iteration:
[1]	test-auc:0.89624



In [70]:
# Score the unlabelled test rows, limiting the booster to the tree count
# recorded as best during early stopping.
# NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer xgboost
# releases in favour of iteration_range; confirm against the installed version.
prediction = model.predict(
    xgb.DMatrix(data=test),
    ntree_limit=model.best_ntree_limit,
)
prediction.shape

(418,)

In [71]:
# Turn the predicted scores into class labels: below 0.5 -> 0, otherwise 1.
prediction = np.logical_not(prediction < 0.5).astype(int)
prediction.shape

(418,)

In [68]:
# Pair each test passenger id with its predicted label.
submission = pd.DataFrame({'PassengerId': ID}).assign(Survived=prediction)

In [72]:
# Sanity check: (rows, columns) of the submission frame.
submission.shape

(418, 2)

In [69]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission.to_csv('submission.csv', index=False)