In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.nb.stop_words import STOP_WORDS
import xgboost as xgb


In [2]:
# Location of the pickled NoReC subsets, relative to the notebook.
DATA_PATH = Path('..') / 'data' / 'norec'

# Output directory, created up front if it does not exist yet.
# NOTE(review): presumably where fitted models get saved; confirm against later cells.
SAVE_PATH = Path('model')
SAVE_PATH.mkdir(exist_ok=True)

In [3]:
# Names of the data splits; each one maps to a pickled DataFrame on disk.
subset_names = ['train', 'test', 'dev']

# NOTE: read_pickle can execute arbitrary code while loading, so only use it
# on files that come from a trusted source.
subsets = {
    split: pd.read_pickle(DATA_PATH.joinpath(f'norsk_kategori_{split}.pkl'))
    for split in subset_names
}

In [4]:
# Grab the first training review as a quick sanity check of the raw text.
text = subsets['train']['text'].iloc[0]

In [5]:
# Show the sample review picked from the training split.
text

"«Poison». Som alle store artister passer Timberlake på å synliggjøre hvor han kommer fra musikalsk.. Derav denne relativt obskure new jack swing-saken fra Bell Biv DeVoe, gruppen som ble til New Edition og som sådan forløpere til N'Sync.. Fenomenalt frekk låt som skreddersydd for Justin."

In [6]:
# Number of training rows per rating class, to check how balanced the classes are.
subsets['train'].groupby('rating').count()

Unnamed: 0_level_0,text
rating,Unnamed: 1_level_1
0,2326
1,11597


In [7]:
# Bag-of-words vocabulary learned from the training texts only.
# spaCy exposes STOP_WORDS as a set, while scikit-learn validates `stop_words`
# as 'english', a list or None (a set is rejected by recent releases), so the
# stop words are passed as a list.
vectorizer = CountVectorizer(
    stop_words=list(STOP_WORDS),
    min_df=5,            # ignore terms that occur in fewer than 5 documents
    max_df=1000,         # ignore terms that occur in more than 1000 documents
    max_features=10000,  # keep only the 10000 most frequent remaining terms
)
# Fits the vocabulary; the resulting count matrix is only displayed here.
vectorizer.fit_transform(subsets['train']['text'])

In [9]:
# Peek at the first few vocabulary entries.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on releases that lack it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
list(feature_names[:10])

['00', '000', '10', '100', '1000', '1024', '105', '1080', '1080p', '11']

In [10]:
# Term-count matrix for every split, using the vocabulary fitted on the training data.
texts = {
    split: vectorizer.transform(subsets[split]['text'])
    for split in subset_names
}

# Matching target labels (the `rating` column) for every split.
categories = {
    split: subsets[split]['rating']
    for split in subset_names
}

In [11]:
# Baseline: XGBoost with default hyperparameters.
boosted_model = xgb.XGBClassifier()
boosted_model.fit(texts['train'], categories['train'])

# Report on the training split first, then on the held-out development split.
for heading, split in (('Training metrics', 'train'), ('Development metrics', 'dev')):
    print(heading)
    predictions = boosted_model.predict(texts[split])
    print(classification_report(categories[split], predictions))

Training metrics
              precision    recall  f1-score   support

           0       0.98      0.69      0.81      2326
           1       0.94      1.00      0.97     11597

    accuracy                           0.95     13923
   macro avg       0.96      0.84      0.89     13923
weighted avg       0.95      0.95      0.94     13923

Development metrics
              precision    recall  f1-score   support

           0       0.88      0.50      0.64       230
           1       0.93      0.99      0.96      1569

    accuracy                           0.93      1799
   macro avg       0.91      0.75      0.80      1799
weighted avg       0.93      0.93      0.92      1799



In [13]:
# Down-weight the majority class (rating 1). 0.2 is roughly the ratio of
# rating-0 to rating-1 training rows (2326 / 11597).
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])

# Report on the training split first, then on the held-out development split.
for heading, split in (('Training metrics', 'train'), ('Development metrics', 'dev')):
    print(heading)
    predictions = boosted_model.predict(texts[split])
    print(classification_report(categories[split], predictions))

Training metrics
              precision    recall  f1-score   support

           0       0.67      0.98      0.80      2326
           1       1.00      0.90      0.95     11597

    accuracy                           0.92     13923
   macro avg       0.83      0.94      0.87     13923
weighted avg       0.94      0.92      0.92     13923

Development metrics
              precision    recall  f1-score   support

           0       0.56      0.79      0.65       230
           1       0.97      0.91      0.94      1569

    accuracy                           0.89      1799
   macro avg       0.76      0.85      0.80      1799
weighted avg       0.92      0.89      0.90      1799



In [17]:
# Experiment: same pipeline with the vocabulary capped at 5000 terms.
# spaCy's STOP_WORDS is a set; scikit-learn validates `stop_words` as
# 'english', a list or None, so it is converted to a list.
vectorizer = CountVectorizer(stop_words=list(STOP_WORDS), min_df=5, max_df=1000, max_features=5000)
# Only the fitted vocabulary is needed here; every split is transformed below.
vectorizer.fit(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}

# Class-weighted XGBoost (0.2 is roughly the rating-0 / rating-1 row ratio in training).
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])

print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.67      0.98      0.80      2326
           1       1.00      0.90      0.95     11597

    accuracy                           0.92     13923
   macro avg       0.83      0.94      0.87     13923
weighted avg       0.94      0.92      0.92     13923

Development metrics
              precision    recall  f1-score   support

           0       0.54      0.80      0.64       230
           1       0.97      0.90      0.93      1569

    accuracy                           0.89      1799
   macro avg       0.75      0.85      0.79      1799
weighted avg       0.91      0.89      0.90      1799



In [19]:
# Experiment: same pipeline with the vocabulary capped at 20000 terms.
# spaCy's STOP_WORDS is a set; scikit-learn validates `stop_words` as
# 'english', a list or None, so it is converted to a list.
vectorizer = CountVectorizer(stop_words=list(STOP_WORDS), min_df=5, max_df=1000, max_features=20000)
# Only the fitted vocabulary is needed here; every split is transformed below.
vectorizer.fit(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}

# Class-weighted XGBoost (0.2 is roughly the rating-0 / rating-1 row ratio in training).
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])

print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.69      0.98      0.81      2326
           1       1.00      0.91      0.95     11597

    accuracy                           0.92     13923
   macro avg       0.84      0.95      0.88     13923
weighted avg       0.94      0.92      0.93     13923

Development metrics
              precision    recall  f1-score   support

           0       0.56      0.81      0.66       230
           1       0.97      0.91      0.94      1569

    accuracy                           0.89      1799
   macro avg       0.76      0.86      0.80      1799
weighted avg       0.92      0.89      0.90      1799



In [20]:
# Experiment: same pipeline with the vocabulary capped at 1000 terms.
# spaCy's STOP_WORDS is a set; scikit-learn validates `stop_words` as
# 'english', a list or None, so it is converted to a list.
vectorizer = CountVectorizer(stop_words=list(STOP_WORDS), min_df=5, max_df=1000, max_features=1000)
# Only the fitted vocabulary is needed here; every split is transformed below.
vectorizer.fit(subsets['train']['text'])
texts = {name: vectorizer.transform(subsets[name]['text']) for name in subset_names}
categories = {name: subsets[name]['rating'] for name in subset_names}

# Class-weighted XGBoost (0.2 is roughly the rating-0 / rating-1 row ratio in training).
boosted_model = xgb.XGBClassifier(scale_pos_weight=0.2)
boosted_model.fit(texts['train'], categories['train'])

print('Training metrics')
print(classification_report(categories['train'], boosted_model.predict(texts['train'])))
print('Development metrics')
print(classification_report(categories['dev'], boosted_model.predict(texts['dev'])))

Training metrics
              precision    recall  f1-score   support

           0       0.64      0.97      0.77      2326
           1       0.99      0.89      0.94     11597

    accuracy                           0.90     13923
   macro avg       0.81      0.93      0.85     13923
weighted avg       0.93      0.90      0.91     13923

Development metrics
              precision    recall  f1-score   support

           0       0.51      0.74      0.60       230
           1       0.96      0.89      0.93      1569

    accuracy                           0.87      1799
   macro avg       0.73      0.82      0.76      1799
weighted avg       0.90      0.87      0.88      1799



There is not much difference between 5000, 10000 and 20000 features for XGBoost. With 1000 features, however, the result is noticeably worse.

While it's not overfitting as much as the Logistic Regression, going with 5000 features will make everything faster.