## Gradient Boosting Classifier
This notebook defines the Python script for the non-DL (non-deep-learning) scenario. The multinomial classifier used is the **XGB** (XGBoost) implementation, which supports GPU acceleration.

In [10]:
# load training set
import pandas as pd
import numpy as np

# Input questions and their labels. Both files are resolved relative to the
# notebook so the cell does not depend on one machine's directory layout.
XTrain = pd.read_csv('../../data/staging_data/mispelling_fixed_clean_input_train.csv', sep=',')
# NOTE(review): this used to be an absolute path inside one user's home
# directory (.../Projects/CES/data/label.csv). Assumes '../../data' is that
# same data folder, as for the other CSV files of this notebook -- confirm.
YTrain = pd.read_csv('../../data/label.csv', sep=';')

##  Data Preparation

In [11]:
# vectorize the text with TF-IDF transform
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Fit the TF-IDF vocabulary on the question text (accents folded to ASCII),
# then expose the resulting matrix as a dense DataFrame for the next steps.
vectorizer = TfidfVectorizer(strip_accents='ascii')
tfidfMatrix = vectorizer.fit_transform(XTrain.question)
XTFIDFVectorizedTrain = pd.DataFrame(tfidfMatrix.toarray())
XTFIDFVectorizedTrain.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8060,8061,8062,8063,8064,8065,8066,8067,8068,8069
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# reduce the TF-IDF features to a 600-dimensional space
# (the same dimensionality as the one provided by the fastText embedding model in the DL context)
from sklearn.decomposition import PCA

# Project the dense TF-IDF matrix onto its first 600 principal components.
# random_state is pinned (same seed as the train/test split) because, for an
# input of this size, the default svd_solver='auto' may select the randomized
# SVD solver, whose output would otherwise change from one run to the next.
PCATransform = PCA(n_components=600, random_state=42)
PCAXTrain = pd.DataFrame(PCATransform.fit_transform(XTFIDFVectorizedTrain))

In [13]:
from sklearn.model_selection import train_test_split
# Append the pre-computed text-characteristic features
# (../../data/staging_data/text_extracted_features.csv) to the PCA components.
# DataFrame.join aligns the two frames on their index.
extractedFeatures = pd.read_csv('../../data/staging_data/text_extracted_features.csv', sep=',')
mergedXTrain = PCAXTrain.join(extractedFeatures)

# Hold out 15% of the rows as a test set (fixed seed for reproducibility).
# NOTE: YTrain is rebound to its training part here, so this cell must not be
# re-run on its own -- re-run the cell that loads YTrain first.
splitParts = train_test_split(mergedXTrain, YTrain, test_size=0.15, random_state=42)
mergedXTrain, mergedXTest, YTrain, YTest = splitParts

mergedXTrain.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,likelihoo_topic_41,likelihoo_topic_42,likelihoo_topic_43,likelihoo_topic_44,likelihoo_topic_45,likelihoo_topic_46,likelihoo_topic_47,likelihoo_topic_48,likelihoo_topic_49,likelihoo_topic_50
1188,0.143385,-0.123189,-0.046976,-0.02505,-0.036219,-0.031148,0.030762,-0.04941,0.067782,0.058946,...,0.0,0.0,0.02,0.0,0.04,0.0,0.0,0.0,0.04,0.0
4252,-0.105876,0.036921,-0.089553,-0.02162,0.211266,0.115868,0.115149,-0.028494,-0.000708,-0.025395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,-0.080307,-0.038724,-0.039224,-0.152588,0.05937,-0.133382,0.115162,-0.044077,0.044808,0.083296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0
2926,0.065817,0.111077,0.150822,-0.027654,0.049189,-0.028348,-0.076515,0.058111,0.10499,0.103282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3733,-0.074552,0.025587,0.039834,0.003135,0.030514,-0.033057,-0.081776,0.068321,-0.017575,0.020644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Hyper-parameter Search

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over the three parameters listed below
# (3 x 3 x 2 = 18 candidates, each evaluated with 4-fold cross-validation):
#   - max_depth
#   - min_child_weight
#   - learning_rate (eta)
# n_estimators is held at 100 and early stopping is not part of this search.
grid_parameters = {
    'max_depth': (4, 6, 8),
    'min_child_weight': (2, 5, 10),
    'learning_rate': (0.05, 0.1),
}

# Base estimator shared by every candidate; only the grid parameters vary.
gbm = xgb.XGBClassifier(
    objective="multi:softprob",
    n_estimators=100,
    eval_metric="mlogloss",
    n_jobs=2,
    tree_method='gpu_hist',
    n_gpus=1)

# `fit_params=None` was dropped: it only restated the default, and the keyword
# was deprecated in scikit-learn 0.19 and removed in later releases, where
# passing it raises a TypeError.
gridSearch = GridSearchCV(
    estimator=gbm,
    param_grid=grid_parameters,
    cv=4,
    verbose=1)

gridSearch.fit(mergedXTrain, YTrain.intention)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


In [None]:
# Display the estimator (with its parameters) that the grid search selected as best.
gridSearch.best_estimator_

In [None]:
%matplotlib inline
# Mean cross-validated test score of each grid candidate, in candidate order.
meanCvScores = pd.DataFrame(gridSearch.cv_results_['mean_test_score'])
meanCvScores.plot(figsize=(15,10))

The grid search indicates that the parameters below give the best accuracy:

* min_child_weight=10
* max_depth=8
* learning_rate=0.1


## Fit final model

In [None]:
import xgboost as xgb

from sklearn.model_selection import train_test_split

# Final model: hyper-parameters fixed to the values retained from the grid search.
gbm = xgb.XGBClassifier(
    min_child_weight=10,
    max_depth=8,
    learning_rate=0.1,
    objective="multi:softprob",
    n_estimators=100,
    eval_metric="mlogloss",
    n_jobs=2,
    tree_method='gpu_hist',
    n_gpus=1)

# Early stopping needs data the model is NOT fitted on: monitoring the training
# set itself gives no signal about generalisation. Carve a validation slice out
# of the training data (the test set stays untouched for the final evaluation).
XFit, XVal, YFit, YVal = train_test_split(
    mergedXTrain, YTrain.intention, test_size=0.15, random_state=42)

# Keep only validation rows whose class was seen during fitting, so evaluating
# the validation set cannot fail on an unknown label.
seenMask = YVal.isin(YFit.unique())
XVal, YVal = XVal.loc[seenMask], YVal.loc[seenMask]

# XGBoost uses the LAST eval_set entry for early stopping. The training part is
# kept as the first entry so that evals_result_['validation_0'] is still the
# training logloss curve.
gbm.fit(
    XFit, YFit,
    early_stopping_rounds=10,
    eval_set=[(XFit, YFit), (XVal, YVal)],
    eval_metric='mlogloss')

In [None]:
# Summary statistics of the per-feature importances of the fitted model.
featureImportances = pd.Series(gbm.feature_importances_)
featureImportances.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# 'validation_0' is the first entry of the eval_set passed to gbm.fit;
# plot its mlogloss value for each boosting iteration.
firstEvalResult = gbm.evals_result_.get('validation_0')
loglossHistory = pd.DataFrame(firstEvalResult.get('mlogloss'))
loglossHistory.plot(figsize=(12,8), title='logloss descent over boosting iteration')

In [None]:
# Ground-truth labels of the held-out test rows and the model's predictions for them.
YTrue = YTest['intention']
YPredicted = gbm.predict(mergedXTest)

In [None]:

import sys
sys.path.append('..')
from utils import vizu

import sklearn
from sklearn.metrics import confusion_matrix

# Fix the label order once and use it for BOTH the matrix and the tick labels.
# Previously the tick labels came from the training labels while the matrix
# rows/columns came from the labels present in YTrue/YPredicted, so they were
# misaligned whenever a class was missing from the test set or the predictions.
classLabels = np.unique(list(YTrue) + list(YPredicted))

cnf_matrix = confusion_matrix(YTrue, YPredicted, labels=classLabels)

# Per-class precision / recall / F1 on the held-out test set.
print(sklearn.metrics.classification_report(YTrue, YPredicted))

plt.figure(figsize=(20,20))

vizu.plot_confusion_matrix(cnf_matrix, normalize=False, classes=classLabels)

plt.show()
