In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

In [2]:
# Load the breast-cancer dataset bundle and expose it as pandas objects:
# X holds the feature matrix (one named column per feature), y the target labels.
dataset_bc = load_breast_cancer()
X = pd.DataFrame(
    data=dataset_bc["data"],
    columns=dataset_bc["feature_names"],
)
y = pd.Series(data=dataset_bc["target"])

In [3]:
# Fit a LabelEncoder on the target labels, then replace y with the encoded labels.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [4]:
# Two-stage split into 60% train / 20% validation / 20% test.
# Stage 1: hold out 20% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Stage 2: take 25% of the remaining 80% (i.e. 20% of the total) as validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=1337
)

In [6]:
# Baseline: a depth-2 random forest fitted on the training split,
# scored by accuracy on the held-out test split.
clf = RandomForestClassifier(random_state=1337, max_depth=2).fit(X_train, y_train)
rf_test_predictions = clf.predict(X_test)
accuracy_score(y_test, rf_test_predictions)

0.956140350877193

In [10]:
# Ratio of class-0 to class-1 label counts, later passed as XGBoost's scale_pos_weight.
# NOTE(review): the counts are taken over all of y (train, validation and test labels),
# not just y_train — confirm that is intended.
negative_count = (y == 0).sum()
positive_count = (y == 1).sum()
weights = negative_count / float(positive_count)
weights

0.5938375350140056

In [12]:
# XGBoost binary classifier. `weights` (class-0 / class-1 count ratio computed in an
# earlier cell) is passed as scale_pos_weight.
model = xgb.XGBClassifier(
    scale_pos_weight=weights,
    n_jobs=4,
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss'
)

# Durations are measured with time.perf_counter(): a monotonic, high-resolution clock
# intended for timing short intervals. time.time() follows the wall clock, which can be
# adjusted while the cell runs and may have coarse resolution.
start = time.perf_counter()
modelfit = model.fit(X_train, y_train)
fittingTime = time.perf_counter() - start

# Only the hard-label predict() call is timed; predict_proba() below is not included.
start = time.perf_counter()
prediction = modelfit.predict(X_test)
InferenceTime = time.perf_counter() - start

F1score = f1_score(y_test, prediction)

# Class-1 probabilities (column 1) drive the area under the precision-recall curve;
# accuracy and F1 use the hard predictions.
probabilities = modelfit.predict_proba(X_test)
AUPRC = average_precision_score(y_test, probabilities[:, 1])
acc = accuracy_score(y_test, prediction)

print('AUPRC = {:.4f}'.format(AUPRC))
print('F1 Score = {:.4f}'.format(F1score))
print('Fitting Time = {:.4f}'.format(fittingTime))
print('Inference Time = {:.4f}'.format(InferenceTime))
print('Accuracy = {:.4f}'.format(acc))

AUPRC = 0.9950
F1 Score = 0.9790
Fitting Time = 0.1617
Inference Time = 0.0145
Accuracy = 0.9737
