## Imports

In [1]:
import pandas as pd

import spacy
import numpy as np
import sklearn as skl 
import xgboost as xgb


In [2]:
target = 'label'
input_column = 'cleanTitle'

train_data = pd.read_pickle('../../../Files/Submissions/train/train_split_submission.pickle') 
valid_data = pd.read_pickle('../../../Files/Submissions/train/val_split_submission.pickle')
test_data = pd.read_pickle('../../../Files/Submissions/train/test_split_submission.pickle')

train_data = train_data[[target, input_column]]
valid_data = valid_data[[target, input_column]]
test_data = test_data[[target, input_column]]

data = pd.concat([train_data, valid_data, test_data])


train_instances = train_data[input_column].apply(str).apply(str.split)
train_labels = train_data[target]

# collect known word tokens and tags
wordset, labelset = set(), set()

# collect tags from all data, to prevent unseen labels
labelset.update(set(data[target]))

# get the vocabulary
for words in train_instances:
    wordset.update(set(words))

# map words and tags into ints
PAD = '-PAD-'
UNK = '-UNK-'
word2int = {word: i + 2 for i, word in enumerate(sorted(wordset))}
word2int[PAD] = 0  # special token for padding
word2int[UNK] = 1  # special token for unknown words
 
label2int = {label: i for i, label in enumerate(sorted(labelset))}
# inverted index to translate it back
int2label = {i:label for label, i in label2int.items()}


def convert2ints(instances):
    """
    function to apply the mapping to all words
    """
    result = []
    for words in instances:
        # replace words with int, 1 for unknown words
        word_ints = [word2int.get(word, 1) for word in words]
        result.append(word_ints)
    return result
                          
train_instances_int = convert2ints(train_instances)
train_labels_int = [label2int[label] for label in train_labels]

In [3]:
test_instances = test_data[input_column].apply(str).apply(str.split)
test_labels = test_data[target]

test_instances_int = convert2ints(test_instances)
test_labels_int = [label2int[label] for label in test_labels]

# convert dev data
val_instances = valid_data[input_column].apply(str).apply(str.split)
val_labels = valid_data[target]

val_instances_int = convert2ints(val_instances)
val_labels_int = [label2int[label] for label in val_labels]

In [4]:
from keras.utils import to_categorical

train_labels_1hot = to_categorical(train_labels_int, len(label2int))
test_labels_1hot = to_categorical(test_labels_int, len(label2int))
val_labels_1hot = to_categorical(val_labels_int, len(label2int))

train_labels_1hot[0]

array([0., 1., 0.], dtype=float32)

In [5]:
# compute 95th percentile of training sentence lengths
L = sorted(map(len, train_instances))
MAX_LENGTH = L[int(len(L)*0.95)]
print(MAX_LENGTH)

# apply padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_instances_int = pad_sequences(train_instances_int, padding='post', maxlen=MAX_LENGTH)
test_instances_int = pad_sequences(test_instances_int, padding='post', maxlen=MAX_LENGTH)
val_instances_int = pad_sequences(val_instances_int, padding='post', maxlen=MAX_LENGTH)

print(train_instances[0], len(train_instances[0]))
print(train_instances_int[0], len(train_instances_int[0]))

15
['shill', 'organization'] 2
[22655 17709     0     0     0     0     0     0     0     0     0     0
     0     0     0] 15


In [15]:
print(len(train_instances_int), 

AttributeError: 'list' object has no attribute 'size'

In [17]:
dtrain = xgb.DMatrix(train_instances_int, label=train_labels_int)
dval = xgb.DMatrix(val_instances_int, label=val_labels_int)
dtest = xgb.DMatrix(test_instances_int, label=test_labels_int)

In [22]:
param = {'max_depth': 8, 'eta': 1, 'objective': 'multi:softmax', 'num_class': 3, 'eval_metric': ['auc', 'ams@0'], 'nthread': 4, 'silent': 1}

evallist = [(dval, 'eval'), (dtrain, 'train')]

In [27]:
num_round = 500
seed = 42
cv_results = xgb.cv(
    param, 
    dtrain, 
    num_boost_round=num_round,
    seed=seed, 
    nfold=5,
)


In [29]:
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,train-ams@0-mean,train-ams@0-std,test-auc-mean,test-auc-std,test-ams@0-mean,test-ams@0-std
0,0.629418,0.001968,218.150577,0.747202,0.628457,0.005338,109.073407,1.310451
1,0.659238,0.001865,218.328955,0.672002,0.657398,0.004045,109.110620,1.329719
2,0.674442,0.001330,218.331677,0.670202,0.673267,0.004630,109.132208,1.338270
3,0.686991,0.002981,218.334494,0.672477,0.684855,0.005718,109.122501,1.336336
4,0.699018,0.001159,218.338000,0.671468,0.696113,0.004645,109.107233,1.325778
...,...,...,...,...,...,...,...,...
495,0.942751,0.000362,218.333984,0.674818,0.909610,0.001936,109.118729,1.331167
496,0.942808,0.000386,218.333984,0.674818,0.909649,0.002002,109.118729,1.331167
497,0.942894,0.000384,218.332883,0.673912,0.909704,0.002010,109.118729,1.331167
498,0.942959,0.000385,218.332883,0.673912,0.909735,0.002010,109.117049,1.329781
