In [1]:
!pip install spacy scikit-learn wandb
!pip install --upgrade pandas
!python -m spacy download nb_core_news_lg

Collecting spacy
  Downloading spacy-2.3.5-cp37-cp37m-manylinux2014_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 10.1 MB/s eta 0:00:01
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting wasabi<1.1.0,>=0.4.0
  Using cached wasabi-0.8.0-py3-none-any.whl (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl (20 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (126 kB)
[K     |████████████████████████████████| 126 kB 69.8 MB/s eta 0:00:01
[?25hCollecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.5-cp37-cp37m-manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 70.8 MB/s eta 0:00:01
[?25hCollecting plac<1.2.0,>=0.9.6
  Using cached plac-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp37-cp37m-manylinux2014_x86_64.whl (35

In [1]:
from pathlib import Path
import random

import pandas as pd
from sklearn.metrics import classification_report
import spacy
from spacy.lang.nb.stop_words import STOP_WORDS
from spacy.pipeline import TextCategorizer
from spacy.util import minibatch, compounding
import wandb

In [2]:
nlp = spacy.load('nb_core_news_lg')

In [3]:
SAVE_PATH = Path('model')
SAVE_PATH.mkdir(exist_ok=True)
DATA_PATH = Path('../data/norec')

In [4]:
subset_names = ['train', 'test', 'dev']
subsets = {name: pd.read_pickle(DATA_PATH / f'norsk_kategori_4_{name}.pkl') for name in subset_names}

In [5]:
text = subsets['train'].iloc[0]['text']

In [6]:
doc = nlp(text)
doc

«Poison». Som alle store artister passer Timberlake på å synliggjøre hvor han kommer fra musikalsk.. Derav denne relativt obskure new jack swing-saken fra Bell Biv DeVoe, gruppen som ble til New Edition og som sådan forløpere til N'Sync.. Fenomenalt frekk låt som skreddersydd for Justin.

In [7]:
doc.ents

(Poison, Timberlake, Bell Biv DeVoe, New Edition, N'Sync, Justin)

In [8]:
subsets['train']['rating'][:10]

2676     1
12603    0
6845     1
2433     1
5987     1
7707     1
5319     1
5805     1
4439     1
12240    1
Name: rating, dtype: int64

In [9]:
nlp = spacy.load('nb_core_news_lg')
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
)
nlp.add_pipe(textcat, last=True)

In [10]:
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [11]:
def determine_categories(rating):
    return {"POSITIVE": bool(rating), "NEGATIVE": not bool(rating)}
for name in subset_names:
    subsets[name]['categories'] = subsets[name]['rating'].apply(determine_categories)

In [12]:
texts = {name: subsets[name]['text'] for name in subset_names}
categories = {name: subsets[name]['categories'] for name in subset_names}
data = {name: list(zip(texts[name], [{"cats": cats} for cats in categories[name]])) for name in subset_names}

In [13]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [14]:
wandb.init(project="capra_text")
NUMBER_OF_ITERATIONS=5
wandb.config.iterations=NUMBER_OF_ITERATIONS
wandb.config.framework='spacy'
wandb.config.dataset='norec'

wandb: Currently logged in as: khellan (use `wandb login --relogin` to force relogin)


In [15]:
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(NUMBER_OF_ITERATIONS):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(data['train'])
        batches = minibatch(data['train'], size=batch_sizes)
        for batch in batches:
            batch_texts, annotations = zip(*batch)
            nlp.update(batch_texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, texts['dev'], categories['dev'])
        wandb.log({'Loss': losses["textcat"], 'Precision': scores["textcat_p"], 'Recall': scores["textcat_r"], 'F1': scores["textcat_f"]})
        print(
            f'{losses["textcat"]:.3f}\t{scores["textcat_p"]:.3f}\t{scores["textcat_r"]:.3f}\t{scores["textcat_f"]:.3f}'
        )

Training the model...
LOSS 	  P  	  R  	  F  
9.240	0.871	0.966	0.916
0.185	0.871	0.936	0.902
0.063	0.873	0.921	0.896
0.052	0.872	0.912	0.892
0.047	0.873	0.901	0.887


The model seems to overtrain from the start. Precision on the testset remains constant while recall drops.