In [3]:
# XGBoost - Decision Trees - ensembles
# Preparing data for modeling
# Scoring the XGBoost models
# Feature Selection

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [8]:
data = pd.read_csv("titanic.csv")

In [9]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
data = data[['Pclass', 'Sex', 'Age', 'Survived']]

In [11]:
data.head()

Unnamed: 0,Pclass,Sex,Age,Survived
0,3,male,22.0,0
1,1,female,38.0,1
2,3,female,26.0,1
3,1,female,35.0,1
4,3,male,35.0,0


In [12]:
# just a little bit of cleaning up:
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

In [13]:
data.head()

Unnamed: 0,Pclass,Sex,Age,Survived
0,3,0,22.0,0
1,1,1,38.0,1
2,3,1,26.0,1
3,1,1,35.0,1
4,3,0,35.0,0


In [14]:
data = data.dropna()

In [19]:
X = data.drop('Survived', axis=1)
y = data['Survived']

In [20]:
# Now ready to spilt the data for training

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [21]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [23]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 82.68%


In [24]:
# Now let`s play with a data set that PIMA containing details on diabetics in India

In [25]:
from numpy import loadtxt 
from sklearn.metrics import accuracy_score 

In [26]:
dataset = loadtxt('pima.txt', delimiter=",")

In [27]:
dataset

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [32]:
X = dataset[:,0:8] 
Y = dataset[:,8]

In [29]:
seed = 7 #reproducibility of your model
test_size = 0.30 # 30% for testing

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 

In [30]:
model = XGBClassifier() 
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [34]:
# early stopping is used to avoid overfitting - since gradient boost models are prone for it.
eval_set = [(X_test, y_test)] 
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set, verbose=True)
# note that the parameter has been added. The parameter is early_stopping_rounds=36

[0]	validation_0-error:0.25974
[1]	validation_0-error:0.25974
[2]	validation_0-error:0.25974
[3]	validation_0-error:0.25974
[4]	validation_0-error:0.233766
[5]	validation_0-error:0.242424
[6]	validation_0-error:0.255411
[7]	validation_0-error:0.238095
[8]	validation_0-error:0.233766
[9]	validation_0-error:0.238095
[10]	validation_0-error:0.238095
[11]	validation_0-error:0.233766
[12]	validation_0-error:0.238095
[13]	validation_0-error:0.238095
[14]	validation_0-error:0.242424
[15]	validation_0-error:0.233766
[16]	validation_0-error:0.229437
[17]	validation_0-error:0.229437
[18]	validation_0-error:0.229437
[19]	validation_0-error:0.229437
[20]	validation_0-error:0.229437
[21]	validation_0-error:0.229437
[22]	validation_0-error:0.238095
[23]	validation_0-error:0.225108
[24]	validation_0-error:0.212121
[25]	validation_0-error:0.220779
[26]	validation_0-error:0.220779
[27]	validation_0-error:0.212121
[28]	validation_0-error:0.21645
[29]	validation_0-error:0.21645
[30]	validation_0-error:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [35]:
eval_set = [(X_test, y_test)] 
model.fit(X_train, y_train, eval_metric="error",early_stopping_rounds=36, eval_set=eval_set, verbose=True)

[0]	validation_0-error:0.25974
Will train until validation_0-error hasn't improved in 36 rounds.
[1]	validation_0-error:0.25974
[2]	validation_0-error:0.25974
[3]	validation_0-error:0.25974
[4]	validation_0-error:0.233766
[5]	validation_0-error:0.242424
[6]	validation_0-error:0.255411
[7]	validation_0-error:0.238095
[8]	validation_0-error:0.233766
[9]	validation_0-error:0.238095
[10]	validation_0-error:0.238095
[11]	validation_0-error:0.233766
[12]	validation_0-error:0.238095
[13]	validation_0-error:0.238095
[14]	validation_0-error:0.242424
[15]	validation_0-error:0.233766
[16]	validation_0-error:0.229437
[17]	validation_0-error:0.229437
[18]	validation_0-error:0.229437
[19]	validation_0-error:0.229437
[20]	validation_0-error:0.229437
[21]	validation_0-error:0.229437
[22]	validation_0-error:0.238095
[23]	validation_0-error:0.225108
[24]	validation_0-error:0.212121
[25]	validation_0-error:0.220779
[26]	validation_0-error:0.220779
[27]	validation_0-error:0.212121
[28]	validation_0-error:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
y_pred = model.predict(X_test) 
predictions = [round(value) for value in y_pred] 

accuracy = accuracy_score(y_test, predictions) 
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 79.22%


In [41]:
# now let`s play with Kfold
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import cross_val_score 

In [42]:
model = XGBClassifier() 

In [43]:
kfold = StratifiedKFold(n_splits=10, random_state=7) 
results = cross_val_score(model, X, Y, cv=kfold)

In [44]:
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 76.95% (5.88%)
