# Breast Cancer Training (XGBoost)

In [26]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
try:
    # Newer scikit-learn releases no longer ship a vendored joblib, so
    # prefer the standalone package when it is importable.
    import joblib
except ImportError:
    # Fall back to the copy bundled with older scikit-learn releases.
    from sklearn.externals import joblib
import numpy as np


def _dense_one_hot_encoder():
    """Return a OneHotEncoder that produces dense arrays.

    scikit-learn renamed the ``sparse`` keyword to ``sparse_output``; the
    new spelling is tried first and the old one is used when the installed
    version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: ``sparse_output`` is an unknown keyword.
        return OneHotEncoder(sparse=False)


# Load the data set; the CSV has no header row.
data = read_csv('data/breast-cancer.data.csv', header=None)
dataset = data.values

# Columns 0-8 are the features, column 9 is the classification label.
X = dataset[:, 0:9]
Y = dataset[:, 9]

# XGBoost needs numeric input, so every feature column is first mapped to
# integer codes and then expanded into one-hot indicator columns.
columns = []
for i in range(X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:, i])
    # The one-hot encoder expects a 2-D (n_samples, 1) array.
    feature = feature.reshape(-1, 1)
    onehot_encoder = _dense_one_hot_encoder()
    feature = onehot_encoder.fit_transform(feature)
    columns.append(feature)
# Stitch the per-feature indicator blocks side by side into one matrix.
encoded_x = np.column_stack(columns)

print('X shape: ', encoded_x.shape)

# Encode the class labels as integers.
# NOTE: despite its name, `label_encoded` holds the fitted encoder; the
# encoded labels themselves are in `label_encoded_y`.
label_encoded = LabelEncoder()
label_encoded = label_encoded.fit(Y)
label_encoded_y = label_encoded.transform(Y)

# Fixed seed so the train/test split is reproducible; 33% is held out.
seed = 7
test_size = 0.33

X_train, X_test, y_train, y_test = train_test_split(
    encoded_x, label_encoded_y, test_size=test_size, random_state=seed
)

# Train with default hyper-parameters and persist the fitted model.
# NOTE(review): only the model is saved; the fitted encoders are not, so
# new raw data cannot be encoded identically from 'model.joblib' alone.
model = XGBClassifier()
model.fit(X_train, y_train)

print(model)
joblib.dump(model, 'model.joblib')

# Make predictions for the held-out test data.
predictions = model.predict(X_test)
# Evaluate predictions.
accuracy = accuracy_score(y_test, predictions)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

X shape:  (286, 43)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Accuracy: 74.74%


  if diff:
