# XGBoost - Dealing with missing data

In [131]:
import numpy as np
import pandas as pd

In [132]:
# Load the horse-colic dataset: values are separated by runs of whitespace and
# the file has no header row, so the columns get integer labels starting at 0.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in current pandas releases.
df = pd.read_csv("horse-colic.data.csv", sep=r"\s+", header=None)

"We can change these missing values to the sparse value expected by XGBoost which is the value zero (0)."
Replace the missing values, which are represented as `?` in the raw data, with 0, or alternatively fill them in with the `sklearn` imputer class:

In [124]:
# The raw file marks missing entries with '?'. Every such cell is mapped to 0,
# the value the quoted text above describes as XGBoost's sparse value.
# Disabled alternative: map '?' to NaN instead, so that the imputer cell
# further down has NaN cells to fill:
#     df = df.replace('?', np.nan)
df = df.replace(to_replace='?', value=0)

In [125]:
# Split the frame into model inputs and the target column.
# `.ix` has been removed from pandas. The frame was read with header=None, so
# its columns are labelled 0..N-1 and the old label-based, end-inclusive
# `.ix[:, 0:26]` selects the same columns as the positional `.iloc[:, 0:27]`.
features = df.iloc[:, 0:27]  # columns 0 through 26
labels = df.iloc[:, 27]      # column 27, used as the target

# Cast by rebinding the name instead of assigning into a slice of `df`:
# slice assignment does not reliably change the dtype and can raise
# SettingWithCopyWarning.
features = features.astype('float32')

In [126]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was removed from scikit-learn; SimpleImputer
# is its replacement and always imputes column-wise.
# Each NaN cell is filled with the median of its column. Note that cells which
# were already replaced by 0 in an earlier cell are not NaN and are therefore
# left untouched by this step.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
features = imputer.fit_transform(features)

"Finally, this is a binary classification problem although the class values are marked with the integers 1 and 2. We model binary classification problems in XGBoost as logistic 0 and 1 values. We can easily convert the Y dataset to 0 and 1 integers using the LabelEncoder, as we did in the iris flowers example."

In [127]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct class values, then map every label to its integer
# code. The fitted encoder is kept so the codes can be mapped back later.
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)

In [128]:
from xgboost import XGBClassifier
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# now lives in `sklearn.model_selection` with the same call signature.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out a third of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
seed, test_size = 7, 0.33
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels_encoded,
    test_size=test_size,
    random_state=seed,
)

# Train a classifier with default hyper-parameters. The fit call is the last
# expression so the notebook displays the fitted estimator.
model = XGBClassifier()
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [129]:
# Score the held-out rows. Each prediction is passed through round() before
# being compared against the encoded test labels.
Y_pred = model.predict(X_test)
predictions = list(map(round, Y_pred))

# Report the fraction of correct predictions as a percentage.
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: {}".format(accuracy * 100.0))

Accuracy: 83.83838383838383


In [141]:
import pickle

# Persist the trained model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises; the bare open() call
# used previously left the handle open.
# NOTE: only unpickle this file from a trusted source, since pickle.load can
# execute arbitrary code.
with open("horse-colic.dat", 'wb') as model_file:
    pickle.dump(model, model_file)