In [None]:
# multiclass classification
import pandas as pd
import xgboost
import time
import numpy as np
import matplotlib.pylab as plt
from matplotlib import pyplot

from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from numpy import mean
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import plot_importance


In [None]:
# Hyper-parameter search space consumed by the grid search further down.
# NOTE(review): XGBoost documents `scale_pos_weight` as a positive/negative
# class-balance weight for binary problems; if the target really is multiclass,
# confirm it has any effect before spending grid-search time on it.
param_grid = dict(
    max_depth=range(4, 26, 4),
    scale_pos_weight=[1, 25, 50, 75, 100],
    colsample_bytree=np.arange(0.5, 1.0, 0.3),
)


# Cross-validation splitter: one repeat of a stratified 2-fold split,
# seeded so the folds are identical on every run.
cv_method = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=42)


In [None]:
## Load the prepared data set from a pickle file into `df`.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# consider moving it into a configurable constant so the notebook runs elsewhere.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle("C:\\VERTROUWELIJK\\final_dataSet.pkl")

In [None]:
# Split the data into independent variables (features) and the dependent
# variable: everything except the 'Voorziening' column becomes the features,
# and the 'Geslacht' column is one-hot encoded.
X = df.drop(columns=['Voorziening'])
X_encoded = pd.get_dummies(data=X, columns=['Geslacht'])

In [None]:
# Target: an independent copy of the 'Voorziening' column.
y = df.loc[:, 'Voorziening'].copy()

In [None]:
# Map the class labels in `y` to integer codes; `label_encoder` stays fitted
# so the codes can be translated back to the original labels later.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

In [None]:
# Make the train and test sets (scikit-learn's default split sizes are used).
# A fixed random_state keeps the split, and every score computed from it,
# reproducible across kernel restarts; 42 matches the seed used for the
# cross-validation splitter.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_encoded, label_encoded_y, random_state=42
)

In [None]:
# Fit a baseline XGBoost classifier (default hyper-parameters) on the training data.
model = xgboost.XGBClassifier()
start_time = time.time()
model.fit(X_train, y_train)
# Stop the clock right after fitting so the printed duration covers training only.
fit_seconds = time.time() - start_time
print(model)
print(fit_seconds)
# Make predictions for the test data.
y_pred = model.predict(X_test)
# `predict` already returns class labels, so no per-element rounding is needed;
# the `predictions` name is kept for any cell that still refers to it.
predictions = y_pred
# Evaluate predictions with the micro-averaged F1 score (stored in `accuracy`,
# the name the notebook has always used for this value).
accuracy = f1_score(y_test, predictions, average='micro')
print("f1_score: %.2f%%" % (accuracy * 100.0))

In [None]:
# Pipeline: randomly under-sample the training folds, then fit XGBoost.
# The sampler is seeded so the cross-validated score is reproducible.
steps = [('under', RandomUnderSampler(random_state=42)), ('model', xgboost.XGBClassifier())]
pipeline = Pipeline(steps=steps)
# Evaluate the pipeline with the same seeded stratified splitter used by the
# grid search instead of building a second, identical one.
cv = cv_method
scores = cross_val_score(pipeline, X_encoded, label_encoded_y, scoring='f1_micro', cv=cv, n_jobs=-1, verbose=10)
# Average the per-fold micro-F1 scores.
score = mean(scores)
print('F1 Score: %.3f' % score)

In [None]:
# Set up a grid search over `param_grid` for an XGBoost classifier,
# scored by micro-averaged F1 on the `cv_method` folds.
grid = GridSearchCV(
    estimator=xgboost.XGBClassifier(),
    param_grid=param_grid,
    scoring='f1_micro',
    cv=cv_method,
    n_jobs=-1,
    verbose=10,
)

# Time the search: start the clock immediately before fitting.
start_time = time.time()
grid.fit(X_train, y_train)
print(time.time() - start_time)

In [None]:
# Best score found by the grid search (micro-averaged F1, per the `scoring`
# argument passed to GridSearchCV).
grid.best_score_

In [None]:
# Parameter combination that achieved the best score in the grid search.
grid.best_params_

In [None]:
# Plot the 10 most important features of the XGBoost classifier held in
# `model`, then render the figure.
xgboost.plot_importance(model, max_num_features=10)
pyplot.show()