In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

from tokenizers import Tokenizer
import xgboost as xgb


In [103]:
SAVE_PATH = Path('model')
TOKENIZER_PATH = SAVE_PATH / 'norec_tokenizer_clean.json'
MAX_SENTENCE_LENGTH = 5000
DATA_PATH = Path('../data/norec')

In [3]:
subset_names = ['train', 'test', 'dev']
subsets = {name: pd.read_pickle(DATA_PATH / f'norsk_kategori_{name}.pkl') for name in subset_names}

In [4]:
text = subsets['train'].iloc[0]['text']

In [5]:
text

"«Poison». Som alle store artister passer Timberlake på å synliggjøre hvor han kommer fra musikalsk.. Derav denne relativt obskure new jack swing-saken fra Bell Biv DeVoe, gruppen som ble til New Edition og som sådan forløpere til N'Sync.. Fenomenalt frekk låt som skreddersydd for Justin."

In [6]:
subsets['train'].groupby(['rating']).count()

Unnamed: 0_level_0,text
rating,Unnamed: 1_level_1
0,2326
1,11597


In [80]:
tokenizer = Tokenizer.from_file(str(TOKENIZER_PATH))
tokenizer.enable_padding(length=MAX_SENTENCE_LENGTH)

In [116]:
texts = {name: np.array([encoding.ids if len(encoding.ids) < MAX_SENTENCE_LENGTH else encoding.ids[:MAX_SENTENCE_LENGTH] for encoding in tokenizer.encode_batch(subsets[name]['text'])]) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      2326
           1       1.00      0.99      1.00     11597

    accuracy                           0.99     13923
   macro avg       0.98      1.00      0.99     13923
weighted avg       0.99      0.99      0.99     13923

Development metrics
              precision    recall  f1-score   support

           0       0.18      0.12      0.14       230
           1       0.88      0.92      0.90      1569

    accuracy                           0.82      1799
   macro avg       0.53      0.52      0.52      1799
weighted avg       0.79      0.82      0.80      1799



In [119]:
text_tokenizer = Tokenizer.from_file(str(TOKENIZER_PATH))
def norec_tokenizer(text):
    encoding = text_tokenizer.encode(text)
    return encoding.tokens

In [121]:
vectorizer = CountVectorizer(tokenizer=norec_tokenizer, stop_words=None, min_df=5, max_df=1000, max_features=10000)
vectorizer.fit_transform(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.71      0.99      0.83      2326
           1       1.00      0.92      0.96     11597

    accuracy                           0.93     13923
   macro avg       0.85      0.95      0.89     13923
weighted avg       0.95      0.93      0.93     13923

Development metrics
              precision    recall  f1-score   support

           0       0.59      0.80      0.68       230
           1       0.97      0.92      0.94      1569

    accuracy                           0.90      1799
   macro avg       0.78      0.86      0.81      1799
weighted avg       0.92      0.90      0.91      1799



In [122]:
vectorizer = CountVectorizer(tokenizer=norec_tokenizer, stop_words=None, min_df=5, max_df=1000, max_features=10000)
vectorizer.fit_transform(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.1)
boosted_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.53      1.00      0.69      2326
           1       1.00      0.82      0.90     11597

    accuracy                           0.85     13923
   macro avg       0.77      0.91      0.80     13923
weighted avg       0.92      0.85      0.87     13923

Development metrics
              precision    recall  f1-score   support

           0       0.43      0.88      0.58       230
           1       0.98      0.83      0.90      1569

    accuracy                           0.84      1799
   macro avg       0.71      0.86      0.74      1799
weighted avg       0.91      0.84      0.86      1799



In [123]:
vectorizer = CountVectorizer(tokenizer=norec_tokenizer, stop_words=None, min_df=5, max_df=1000, max_features=5000)
vectorizer.fit_transform(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.70      0.99      0.82      2326
           1       1.00      0.92      0.96     11597

    accuracy                           0.93     13923
   macro avg       0.85      0.95      0.89     13923
weighted avg       0.95      0.93      0.93     13923

Development metrics
              precision    recall  f1-score   support

           0       0.56      0.77      0.65       230
           1       0.96      0.91      0.94      1569

    accuracy                           0.89      1799
   macro avg       0.76      0.84      0.79      1799
weighted avg       0.91      0.89      0.90      1799



Same hyperparameteres as pure XGBoost seems to work best. Tokenizers with CountVectorizer improves on the old version.

Time to test with stop word removal

In [124]:
from spacy.lang.nb.stop_words import STOP_WORDS

In [125]:
vectorizer = CountVectorizer(tokenizer=norec_tokenizer, stop_words=STOP_WORDS, min_df=5, max_df=1000, max_features=10000)
vectorizer.fit_transform(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.72      0.99      0.83      2326
           1       1.00      0.92      0.96     11597

    accuracy                           0.93     13923
   macro avg       0.86      0.96      0.90     13923
weighted avg       0.95      0.93      0.94     13923

Development metrics
              precision    recall  f1-score   support

           0       0.57      0.77      0.66       230
           1       0.97      0.92      0.94      1569

    accuracy                           0.90      1799
   macro avg       0.77      0.84      0.80      1799
weighted avg       0.92      0.90      0.90      1799



Stop word removal is not helpful when using Tokenizers.

In [128]:
vectorizer = CountVectorizer(tokenizer=norec_tokenizer, stop_words=None, min_df=5, max_df=1000, max_features=10000)
vectorizer.fit_transform(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}
random_forest_model = xgb.XGBClassifier(
    booster='gbtree', 
    colsample_bynode=0.8,
    learning_rate=1,
    max_depth=5,
    num_parallell_tree=100,
    scale_pos_weight=0.2)
random_forest_model.fit(texts['train'], categories['train'])
print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Parameters: { num_parallell_tree } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Training metrics
              precision    recall  f1-score   support

           0       0.31      0.70      0.43      2326
           1       0.92      0.69      0.79     11597

    accuracy                           0.69     13923
   macro avg       0.62      0.70      0.61     13923
weighted avg       0.82      0.69      0.73     13923

Development metrics
              precision    recall  f1-score   support

           0       0.29      0.67      0.41       230
           1       0.94      0.77      0.84      1569

    accuracy                           0.75      1799
   macro avg       0.62      0.72      0.63      1799
weighted avg       0.86      0.75      0.79      1799



And that's enough Random Forest :)