# Data preparation for XGBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

try:
    # Current scikit-learn releases expose train_test_split from model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module on old scikit-learn installations.
    from sklearn.cross_validation import train_test_split



In [75]:
# Read the Iris CSV. header=None tells pandas the file has no header row,
# so the columns are labelled with the integers 0..4.
data = pd.read_csv("iris.data.csv", header=None)

# Columns 0-3 are the model inputs; column 4 is the class label.
features = data[[0, 1, 2, 3]]
labels = data[4]

XGBoost requires numeric classification labels, so we must encode them.

In [76]:
# Learn the set of class labels and convert them to integer codes in one step.
# The fitted encoder is kept so the codes can be mapped back to labels later.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(labels)

Splitting the data into training and test sets (a single hold-out split, not k-fold cross-validation):

In [12]:
# Fixed seed so the split is repeatable; 33% of the rows are held out for testing.
seed = 7
test_size = 0.33

X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    label_encoded_y,
    test_size=test_size,
    random_state=seed,
)

In [13]:
# Train a classifier with the library's default hyper-parameters.
# fit() hands back the estimator itself, so the chained call leaves `model` fitted.
model = XGBClassifier().fit(X_train, Y_train)

# Echo the fitted estimator so its configuration is shown as the cell output.
model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

"The XGBoost model is configured to automatically model the multiclass classification problem using the multi:softprob objective, a variation on the softmax loss function to model class probabilities. This suggests that internally, that the output class is converted into a one hot type encoding automatically."

Make predictions using the trained model.

In [14]:
# Predict on the held-out rows, then round every predicted value to a whole number.
# NOTE(review): predict() presumably already returns class labels, which would make
# the rounding a no-op - confirm before removing it.
Y_pred = model.predict(X_test)
predictions = list(map(round, Y_pred))

In [15]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)

# Report the score as a percentage.
print("Accuracy: {}".format(100.0 * accuracy))

Accuracy: 92.0


## Categorical data sets - Breast cancer

In [2]:
# Shell escape: list the files in the notebook's working directory.
!ls

datasets-uci-breast-cancer.h5  iris.data.csv  xgboost-data-prep.ipynb


In [49]:
import h5py

# Open the HDF5 container read-only. The handle is left open on purpose:
# the cell that reads the column values uses it and closes it afterwards.
f = h5py.File('datasets-uci-breast-cancer.h5', 'r')

# Names of the top-level entries (data stores) in the file.
keys = list(f.keys())

# Names of the members stored under the 'data' entry (one per column).
columns = list(f['data'])



In [50]:
# Pull every column out of the open HDF5 handle `f` into a list of
# [column_name, value_1, value_2, ...] lists, one inner list per column.
breast_data = []

try:
    for column in columns:
        # The column name goes first so it can be promoted to a header later.
        column_data = [column]
        for entry in f[keys[0]][column]:
            # Decode each stored entry to text and remove any single-quote characters.
            column_data.append(entry.decode().replace("'", ""))

        breast_data.append(column_data)
finally:
    # Always release the file handle, even if reading or decoding a column fails
    # part-way through; otherwise the file stays open for the life of the kernel.
    f.close()

In [60]:
# Every inner list of breast_data holds one column, so the frame built from it
# has one row per column; transpose to get one row per record instead.
breast_df = pd.DataFrame(breast_data).transpose()

In [65]:
# Promote the first row (which holds the column names) to the header and remove
# it from the data.
#
# The guard keeps this cell idempotent: a freshly transposed frame still has the
# default integer RangeIndex as its columns, whereas a frame whose header has
# already been promoted does not. Without the guard, re-running the cell would
# turn the first real record into the header and silently drop it.
if isinstance(breast_df.columns, pd.RangeIndex):
    breast_df.columns = breast_df.iloc[0]
    breast_df = breast_df.drop(breast_df.index[0])

In [81]:
# Preview the first five records to check the header and the decoded values.
breast_df.head(n=5)

Unnamed: 0,Class,age,breast,breast-quad,deg-malig,inv-nodes,irradiat,menopause,node-caps,tumor-size
1,recurrence-events,40-49,right,left_up,3,0-2,no,premeno,yes,15-19
2,no-recurrence-events,50-59,right,central,1,0-2,no,ge40,no,15-19
3,recurrence-events,50-59,left,left_low,2,0-2,no,ge40,no,35-39
4,no-recurrence-events,40-49,right,left_low,3,0-2,yes,premeno,yes,35-39
5,recurrence-events,40-49,left,right_up,2,3-5,no,premeno,yes,30-34


The `sklearn` label encoder simply maps each unique value within a column to a new integer. A classifier may then interpret those integers as having a meaningful order or distance between them. As this is untrue for categorical features, the feature columns must instead be one-hot encoded as binary (indicator) variables.

In [83]:
# Set the target column aside, then expand the remaining columns into
# one indicator column per distinct value.
breast_features = breast_df.drop('Class', axis=1)
breast_encoded = pd.get_dummies(breast_features)

The label encoder, however, can still be used for the target labels (the `Class` column).

In [88]:
# Encode the breast-cancer target with its own, dedicated encoder.
# The previous version created `breast_label_encoder` but then fitted and used
# the Iris `label_encoder` instead, which overwrote the Iris class mapping and
# left `breast_label_encoder` as a mere alias of that shared object.
breast_label_encoder = LabelEncoder()

# Learn the class names and convert them to integer codes in one step.
breast_classes_encoded = breast_label_encoder.fit_transform(breast_df['Class'])

In [89]:
# Same split settings as before: fixed seed, 33% of the rows held out for testing.
seed = 7
test_size = 0.33

# Split the one-hot encoded features and the encoded classes together so that
# the rows stay aligned.
breast_split = train_test_split(
    breast_encoded,
    breast_classes_encoded,
    test_size=test_size,
    random_state=seed,
)
X_train, X_test, Y_train, Y_test = breast_split

In [90]:
# Train a fresh default classifier on the breast-cancer training split.
# fit() hands back the estimator itself, so the chained call leaves `model` fitted.
model = XGBClassifier().fit(X_train, Y_train)

# Echo the fitted estimator so its configuration is shown as the cell output.
model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [91]:
# Predict on the held-out rows, then round every predicted value to a whole number.
# NOTE(review): predict() presumably already returns class labels, which would make
# the rounding a no-op - confirm before removing it.
Y_pred = model.predict(X_test)
predictions = list(map(round, Y_pred))

In [92]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)

# Report the score as a percentage.
print("Accuracy: {}".format(100.0 * accuracy))

Accuracy: 71.57894736842105
