## Imports

In [5]:
import pandas as pd

# import spacy
import numpy as np
import sklearn as skl 
import xgboost as xgb


## Training

In [6]:
target = 'label'
input_column = 'cleanTitle'

train_data = pd.read_pickle('../../../Files/Submissions/train/train_split_submission.pickle') 
valid_data = pd.read_pickle('../../../Files/Submissions/train/val_split_submission.pickle')
test_data = pd.read_pickle('../../../Files/Submissions/train/test_split_submission.pickle')

train_data = train_data[[target, input_column]]
valid_data = valid_data[[target, input_column]]
test_data = test_data[[target, input_column]]

data = pd.concat([train_data, valid_data, test_data])


train_instances = train_data[input_column].apply(str).apply(str.split)
train_labels = train_data[target]

# collect known word tokens and tags
wordset, labelset = set(), set()

# collect tags from all data, to prevent unseen labels
labelset.update(set(data[target]))

# get the vocabulary
for words in train_instances:
    wordset.update(set(words))

# map words and tags into ints
PAD = '-PAD-'
UNK = '-UNK-'
word2int = {word: i + 2 for i, word in enumerate(sorted(wordset))}
word2int[PAD] = 0  # special token for padding
word2int[UNK] = 1  # special token for unknown words
 
label2int = {label: i for i, label in enumerate(sorted(labelset))}
# inverted index to translate it back
int2label = {i:label for label, i in label2int.items()}


def convert2ints(instances):
    """
    function to apply the mapping to all words
    """
    result = []
    for words in instances:
        # replace words with int, 1 for unknown words
        word_ints = [word2int.get(word, 1) for word in words]
        result.append(word_ints)
    return result
                          
train_instances_int = convert2ints(train_instances)
train_labels_int = [label2int[label] for label in train_labels]

In [7]:
test_instances = test_data[input_column].apply(str).apply(str.split)
test_labels = test_data[target]

test_instances_int = convert2ints(test_instances)
test_labels_int = [label2int[label] for label in test_labels]

# convert dev data
val_instances = valid_data[input_column].apply(str).apply(str.split)
val_labels = valid_data[target]

val_instances_int = convert2ints(val_instances)
val_labels_int = [label2int[label] for label in val_labels]

In [8]:
from keras.utils import to_categorical

train_labels_1hot = to_categorical(train_labels_int, len(label2int))
test_labels_1hot = to_categorical(test_labels_int, len(label2int))
val_labels_1hot = to_categorical(val_labels_int, len(label2int))

train_labels_1hot[0]

array([0., 1., 0.], dtype=float32)

In [9]:
# compute 95th percentile of training sentence lengths
L = sorted(map(len, train_instances))
MAX_LENGTH = L[int(len(L)*0.95)]
print(MAX_LENGTH)

# apply padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_instances_int = pad_sequences(train_instances_int, padding='post', maxlen=MAX_LENGTH)
test_instances_int = pad_sequences(test_instances_int, padding='post', maxlen=MAX_LENGTH)
val_instances_int = pad_sequences(val_instances_int, padding='post', maxlen=MAX_LENGTH)

print(train_instances[0], len(train_instances[0]))
print(train_instances_int[0], len(train_instances_int[0]))

15
['shill', 'organization'] 2
[22655 17709     0     0     0     0     0     0     0     0     0     0
     0     0     0] 15


In [None]:
print(len(train_instances_int)) 

In [13]:
dtrain = xgb.DMatrix(train_instances_int, label=train_labels_int)
dval = xgb.DMatrix(val_instances_int, label=val_labels_int)
dtest = xgb.DMatrix(test_instances_int, label=test_labels_int)

In [14]:
param = {'max_depth': 8, 'eta': 1, 'objective': 'multi:softmax', 'num_class': 3, 'eval_metric': ['auc', 'ams@0'], 'nthread': 4, 'silent': 1}

evallist = [(dval, 'eval'), (dtrain, 'train')]

In [15]:
num_round = 500
seed = 42
cv_results = xgb.cv(
    param, 
    dtrain, 
    num_boost_round=num_round,
    seed=seed, 
    nfold=5,
)


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be 

In [16]:
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,train-ams@0-mean,train-ams@0-std,test-auc-mean,test-auc-std,test-ams@0-mean,test-ams@0-std
0,0.759845,0.001814,218.330054,0.676217,0.733313,0.004254,109.108588,1.326160
1,0.815736,0.002401,218.333337,0.671118,0.783347,0.004543,109.116321,1.333643
2,0.850129,0.001867,218.337195,0.673382,0.812958,0.003231,109.114368,1.335949
3,0.871339,0.004394,218.337015,0.673368,0.829098,0.005645,109.124072,1.341030
4,0.887703,0.005562,218.336182,0.669097,0.841703,0.007221,109.114505,1.333645
...,...,...,...,...,...,...,...,...
495,0.999699,0.000006,218.330725,0.675186,0.918024,0.001223,109.115831,1.328617
496,0.999700,0.000006,218.329904,0.675854,0.918009,0.001208,109.115831,1.328617
497,0.999701,0.000006,218.329904,0.675854,0.918017,0.001216,109.115831,1.328617
498,0.999702,0.000007,218.329904,0.675854,0.918000,0.001231,109.115831,1.328617


## Hyperparameter Tuning

In [3]:
from sklearn.model_selection import GridSearchCV

params = { 'mad_depth' : [3, 5, 6, 8, 10],
        'learning_rate' : [0.01, 0.05, 0.1],
        'n_estimators' : [100, 500, 1000],
        'objective': ['multi:softmax', 'multi:softprob'],
        'colsample_bytree': [0.5, 0.7, 1]
}

In [4]:
xgbc = xgb.XGBClassifier(seed = 42)

In [5]:
clf = GridSearchCV(estimator = xgbc,
                param_grid = params,
                scoring='roc_auc_ovo',
                verbose = 1
            
)

In [11]:
clf.fit(train_instances_int, train_labels_1hot)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


KeyboardInterrupt: 

## Analysis 

In [1]:
from joblib import dump, load

In [2]:
Gridsearch = load('xgbCVGridSearch.joblib')

In [11]:
model = Gridsearch.best_estimator_

In [12]:
type(model)

xgboost.sklearn.XGBClassifier

In [4]:
print("Best parameters:", Gridsearch.best_params_)
print("Highest AUC: ", (Gridsearch.best_score_))


Best parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1000, 'num_class': 3, 'objective': 'multi:softprob'}
Highest AUC:  0.9309728292375551


In [8]:
df1 = pd.DataFrame(Gridesearch.cv_results_)
# df1.to_csv('xgboost_results.csv')

In [10]:
df1.sort_values(['rank_test_score']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_num_class,param_objective,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
44,80.091854,8.578351,0.422838,0.032511,0.5,0.1,10,1000,3,multi:softprob,"{'colsample_bytree': 0.5, 'learning_rate': 0.1...",0.928852,0.929915,0.932881,0.929763,0.933452,0.930973,0.001837,1
41,61.540091,6.199461,0.324311,0.029329,0.5,0.1,8,1000,3,multi:softprob,"{'colsample_bytree': 0.5, 'learning_rate': 0.1...",0.927215,0.928086,0.931972,0.927805,0.931833,0.929382,0.002077,2
29,84.471425,9.330232,0.453256,0.045889,0.5,0.05,10,1000,3,multi:softprob,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",0.925553,0.927085,0.930634,0.926786,0.930606,0.928133,0.002095,3
89,92.961855,7.584533,0.471483,0.070459,0.7,0.1,10,1000,3,multi:softprob,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...",0.926387,0.926844,0.930113,0.926399,0.930192,0.927987,0.001776,4
43,45.372425,1.339221,0.232754,0.016557,0.5,0.1,10,500,3,multi:softprob,"{'colsample_bytree': 0.5, 'learning_rate': 0.1...",0.925062,0.926473,0.929667,0.926238,0.930366,0.927561,0.002073,5


In [11]:
df1.to_csv('xgboost_results.csv')

In [13]:
proba = model.predict_proba(test_instances_int)

In [14]:
test = model.predict(test_instances_int)

In [20]:
proba.tofile('../../../Files/models/xgb_proba.txt', sep=' ')

In [21]:
test.tofile('../../../Files/models/xgb_test.txt', sep=' ')