In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from spacy.lang.nb.stop_words import STOP_WORDS
import xgboost as xgb

In [2]:
# Filesystem locations used by this notebook (both relative to the working directory).
# DATA_PATH holds the pickled dataset subsets that are loaded below.
DATA_PATH = Path('../data/norec')
# SAVE_PATH is presumably where model artifacts get written -- TODO confirm; nothing visible here writes to it.
SAVE_PATH = Path('model')

# Create the output directory up front; exist_ok makes re-running this cell harmless.
SAVE_PATH.mkdir(exist_ok=True)

In [3]:
# Names of the dataset splits; each one lives in its own pickle file under DATA_PATH.
subset_names = ['train', 'test', 'dev']
subset_paths = {name: DATA_PATH / f'norsk_kategori_{name}.pkl' for name in subset_names}

# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only load files from a trusted source.
subsets = {name: pd.read_pickle(path) for name, path in subset_paths.items()}

In [4]:
# Pull out the first training example's raw text for a quick manual look.
text = subsets['train']['text'].iloc[0]

In [5]:
# Display the sample text selected in the previous cell.
text

"«Poison». Som alle store artister passer Timberlake på å synliggjøre hvor han kommer fra musikalsk.. Derav denne relativt obskure new jack swing-saken fra Bell Biv DeVoe, gruppen som ble til New Edition og som sådan forløpere til N'Sync.. Fenomenalt frekk låt som skreddersydd for Justin."

In [6]:
# Class balance check: number of training rows per rating label.
subsets['train'].groupby('rating').count()

Unnamed: 0_level_0,text
rating,Unnamed: 1_level_1
0,2326
1,11597


In [7]:
# Bag-of-words features over the training texts.
# scikit-learn documents `stop_words` as 'english', a list, or None, and releases
# with parameter validation reject other types. STOP_WORDS is a set, so it is
# converted to a list first (sorted only to make the argument reproducible).
vectorizer = CountVectorizer(
    stop_words=sorted(STOP_WORDS),
    min_df=5,           # drop terms that occur in fewer than 5 documents
    max_df=1000,        # drop terms that occur in more than 1000 documents
    max_features=5000,  # keep at most the 5000 most frequent remaining terms
)
# Learn the vocabulary from the training split only; the resulting matrix is the cell output.
vectorizer.fit_transform(subsets['train']['text'])

<13923x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1007500 stored elements in Compressed Sparse Row format>

In [8]:
# Peek at the first few vocabulary entries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain strings so the displayed list looks the same with either API.
[str(feature) for feature in feature_names[:10]]

['00', '000', '10', '100', '1000', '1080p', '11', '12', '120', '1200']

In [9]:
# Vectorize every split with the vocabulary fitted on the training data,
# and keep the matching rating labels alongside, keyed by split name.
texts = {
    name: vectorizer.transform(frame['text'])
    for name, frame in subsets.items()
}
categories = {
    name: frame['rating']
    for name, frame in subsets.items()
}

In [10]:
# Grid-search the learning rate on its own; every other XGBoost setting stays at its default.
parameters = {'learning_rate': [0.1, 0.25, 0.3, 0.5, 0.75]}
boosted_model = xgb.XGBClassifier()
searcher = GridSearchCV(
    estimator=boosted_model,
    param_grid=parameters,
    scoring='roc_auc',  # candidates are ranked by cross-validated ROC AUC
    n_jobs=4,
)
searcher.fit(texts['train'], categories['train'])
print(f'Best parameters {searcher.best_params_} and best score: {searcher.best_score_}')

Best parameters {'learning_rate': 0.5} and best score: 0.9183358383665414


In [11]:
# Grid-search the weight applied to the positive class (rating 1), again starting
# from a default classifier. Rating 1 is the majority class in the training
# split, which is why values below 1 are in the grid.
parameters = {'scale_pos_weight': [0.1, 0.2, 0.3, 1]}
boosted_model = xgb.XGBClassifier()
searcher = GridSearchCV(
    estimator=boosted_model,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
)
searcher.fit(texts['train'], categories['train'])
print(f'Best parameters {searcher.best_params_} and best score: {searcher.best_score_}')

Best parameters {'scale_pos_weight': 0.2} and best score: 0.9160680599849325


In [12]:
# Grid-search the number of trees grown in parallel per boosting round,
# with all other settings left at their defaults.
parameters = {'num_parallel_tree': [1, 5, 10]}
boosted_model = xgb.XGBClassifier()
searcher = GridSearchCV(
    estimator=boosted_model,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
)
searcher.fit(texts['train'], categories['train'])
print(f'Best parameters {searcher.best_params_} and best score: {searcher.best_score_}')

Best parameters {'num_parallel_tree': 1} and best score: 0.914525300608984


In [13]:
# Grid-search the number of boosting rounds, with all other settings left at their defaults.
# NOTE(review): if the largest value wins, the optimum may lie beyond this grid.
parameters = {'n_estimators': [50, 100, 500, 1000]}
boosted_model = xgb.XGBClassifier()
searcher = GridSearchCV(
    estimator=boosted_model,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
)
searcher.fit(texts['train'], categories['train'])
print(f'Best parameters {searcher.best_params_} and best score: {searcher.best_score_}')

Best parameters {'n_estimators': 1000} and best score: 0.9319266446030696


Training with the best value found for each parameter (each one was tuned independently, not jointly) gives us:

In [15]:
# Combine the winning value from each single-parameter search into one model.
best_params = {
    'learning_rate': 0.5,
    'num_parallel_tree': 1,
    'scale_pos_weight': 0.2,
    'n_estimators': 1000,
}
boosted_model = xgb.XGBClassifier(**best_params)
boosted_model.fit(texts['train'], categories['train'])

# Report on the data the model was fitted on, then on the held-out dev split,
# so the gap between the two shows how much the model overfits.
train_predictions = boosted_model.predict(texts['train'])
print('Training metrics')
print(classification_report(categories['train'], train_predictions))

dev_predictions = boosted_model.predict(texts['dev'])
print('Development metrics')
print(classification_report(categories['dev'], dev_predictions))

Training metrics
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2326
           1       1.00      0.99      1.00     11597

    accuracy                           1.00     13923
   macro avg       0.99      1.00      0.99     13923
weighted avg       1.00      1.00      1.00     13923

Development metrics
              precision    recall  f1-score   support

           0       0.71      0.72      0.71       230
           1       0.96      0.96      0.96      1569

    accuracy                           0.93      1799
   macro avg       0.83      0.84      0.84      1799
weighted avg       0.93      0.93      0.93      1799



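These results are better than what we got before. The near-perfect training metrics show that the model is overfitting, but the dev-set numbers are still good (accuracy 0.93), so the result is acceptable. Note that the minority class 0 is clearly weaker on the dev set (F1 0.71) than class 1 (F1 0.96).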