In [1]:
from __future__ import unicode_literals, print_function
import math
import os
import os.path
import random
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import spacy
from spacy.util import minibatch, compounding, decaying

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
# constants
# Upper bound on the number of training iterations (training may stop earlier).
n_iter = 20
# Local directory that holds the downloaded corpus.
data_path = './data'
# Remote location of the GermEval-2018 training file.
train_data_url = 'https://raw.githubusercontent.com/uds-lsv/GermEval-2018-Data/master/germeval2018.training.txt'
# Local path the training file is stored at and read from.
train_data_path = f'{data_path}/germeval2018.training.txt'
# Suffix shared by the vector model directory that is loaded and the trained model directory that is saved.
model_ext = '_100d_100k'

In [3]:
def fetch_data(force=False):
    """Make sure the training file is available at ``train_data_path``.

    The directory ``data_path`` is created when it does not exist yet. The
    file is downloaded from ``train_data_url`` only when no local copy is
    present (or when ``force`` is set), so re-running the notebook does not
    hit the network again.

    Args:
        force: Download the file even if a local copy already exists.

    Raises:
        FileExistsError: If ``data_path`` exists but is not a directory.
    """
    if not os.path.exists(data_path):
        # makedirs (unlike mkdir) copes with nested paths and with the
        # directory appearing between the check and the call.
        os.makedirs(data_path, exist_ok=True)
    if not os.path.isdir(data_path):
        raise FileExistsError('data path exists, but is not directory (or not accessible)')

    if force or not os.path.isfile(train_data_path):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated file at the final location.
        partial_path = train_data_path + '.part'
        urlretrieve(train_data_url, partial_path)
        os.replace(partial_path, train_data_path)
    
    
def get_data(validation_ratio=0.1, seed=None):
    """Load the training file and split it into training and validation parts.

    Reads the tab-separated, header-less file at ``train_data_path`` with the
    columns ``text``, ``bin`` and ``detail`` (``detail`` is dropped), shuffles
    the rows and holds out the first ``ceil(n_rows * validation_ratio)`` of
    the shuffled rows for validation.

    Args:
        validation_ratio: Fraction of rows held out for validation; must lie
            in ``[0, 1]``.
        seed: Optional seed. When given, the shuffle uses a dedicated
            ``RandomState`` so the split is reproducible; when ``None`` the
            global NumPy random state is used.

    Returns:
        ``((train_texts, train_labels), (val_texts, val_labels))`` where the
        texts are tuples and the labels are lists of ``{'OFFENSE': bool}``
        dicts aligned with the texts.

    Raises:
        ValueError: If ``validation_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= validation_ratio <= 1:
        raise ValueError('validation_ratio must be between 0 and 1, got {!r}'.format(validation_ratio))

    df_all = pd.read_csv(train_data_path, sep='\t', header=None, names=['text', 'bin', 'detail']).drop('detail', axis=1)

    # split into trn and validation
    idx = np.arange(len(df_all))
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(idx)
    val_size = math.ceil(len(df_all) * validation_ratio)

    def _texts_and_labels(rows):
        # Select the given row positions and convert them to (texts, labels).
        part = df_all.iloc[rows]
        labels = [{'OFFENSE': x == 'OFFENSE'} for x in part['bin'].values]
        return tuple(part['text'].values), labels

    return _texts_and_labels(idx[val_size:]), _texts_and_labels(idx[:val_size])

In [4]:
# Fetch the corpus and split it into training and evaluation parts.
fetch_data()
(train_texts, train_cats), (eval_texts, eval_cats) = get_data(validation_ratio=0.1)

print("Using examples ({} training, {} evaluation)".format(len(train_texts), len(eval_texts)))
# Training annotations are wrapped as {'cats': ...}; evaluation pairs keep the bare label dicts.
train_data = [(text, {'cats': label}) for text, label in zip(train_texts, train_cats)]
eval_data = [(text, label) for text, label in zip(eval_texts, eval_cats)]
train_data[3:5]

Using examples (4508 training, 501 evaluation)


[('@MGrosseBroemer Sprach nicht ein gewisser Herr Seehofer / #CSU davon, dass man die #Afd - Wähler zurückholen möchte? Macht sich ganz klasse, die potentiellen Wähler vorher erst mal noch gründlich zu beschimpfen...Sie haben wirklich ALLES verstanden! |LBR| @Beatrix_vStorch @cducsubt @AfDimBundestag',
  {'cats': {'OFFENSE': False}}),
 ('Der ewige Kanzler und große Europäer geht. Seine Verdienste um die Deutsche Einheit bleiben. Wir trauern um Helmut Kohl.',
  {'cats': {'OFFENSE': False}})]

Create an adapted, vectorized model for German text on the command line with:

```
wget http://4530.hostserv.eu/resources/embed_tweets_de_100D_fasttext.zip
python -m spacy init-model de data/de_vec_twitter_100d_100k --vectors-loc embed_tweets_de_100D_fasttext.zip --prune-vectors 100000
```

In [5]:
# Load the vector model whose directory name carries the configured suffix.
nlp = spacy.load(f'data/de_vec_twitter{model_ext}')

# Reuse the text categorizer if the loaded pipeline already has one;
# otherwise create it and append it as the last component.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

textcat.add_label('OFFENSE')

print(f'pipeline {nlp.pipe_names}')

pipeline ['textcat']


In [6]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F1 of a text categorizer at a 0.5 threshold.

    Args:
        tokenizer: Callable that turns one text into a doc accepted by
            ``textcat.pipe``.
        textcat: Object whose ``pipe(docs)`` yields docs exposing a ``cats``
            mapping of label -> score.
        texts: Iterable of raw texts.
        cats: Sequence of gold mappings (label -> truth value), indexed in
            the same order as ``texts``. Labels missing from a gold mapping
            are ignored.

    Returns:
        Dict with the keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.
        A metric whose denominator is zero is reported as ``0.0`` (rather
        than an artificial 0.5 produced by epsilon-initialised counters).
    """
    docs = (tokenizer(text) for text in texts)
    tp = 0  # True positives
    fp = 0  # False positives
    fn = 0  # False negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted = score >= 0.5
            actual = gold[label] >= 0.5
            if predicted and actual:
                tp += 1
            elif predicted:
                fp += 1
            elif actual:
                fn += 1
            # True negatives do not enter precision, recall or F1.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f_score = 2 * (precision * recall) / (precision + recall)
    else:
        f_score = 0.0
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}


In [7]:
%xmode Verbose
def train_model(nlp, train_data, eval_data):
    """Train the ``textcat`` component of ``nlp`` with early stopping.

    Runs at most ``n_iter`` passes over ``train_data``. After each pass the
    model is scored on ``eval_data`` with the optimizer's averaged
    parameters, and one table row (iteration, loss, P, R, F) is printed.
    Training stops early once the F-score has failed to beat the best value
    seen so far for four consecutive passes.

    Args:
        nlp: Pipeline containing a ``textcat`` component.
        train_data: Sequence of ``(text, annotations)`` pairs passed to
            ``nlp.update``.
        eval_data: Non-empty sequence of ``(text, gold_cats)`` pairs passed
            to ``evaluate``.

    Returns:
        The optimizer created by ``nlp.begin_training()``. Its state is that
        of the last completed pass, not of the best-scoring pass.
    """
    # Take the component from the pipeline being trained instead of relying
    # on a notebook-level ``textcat`` variable that may belong to another model.
    textcat = nlp.get_pipe('textcat')

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        optimizer.max_grad_norm = 0.6
        print("Training the model...")
        print('\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        best_f = 0
        n_iter_nogain = 0
        # One dropout schedule shared by all passes: it keeps decaying across iterations.
        dropout = decaying(0.45, 0.2, 1e-4)
        eval_texts, eval_cats = zip(*eval_data)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(1, 8, 1.01))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out evaluation data
                scores = evaluate(nlp.tokenizer, textcat, eval_texts, eval_cats)
            if scores['textcat_f'] > best_f:
                best_f = scores['textcat_f']
                n_iter_nogain = 0
            else:
                n_iter_nogain += 1
            print('{4}\t{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f'], i))
            if n_iter_nogain > 3:
                print('early stopping')
                break
    return optimizer


Exception reporting mode: Verbose


In [8]:
# Train the classifier, then write the pipeline to disk while the optimizer's
# averaged parameters are swapped in.
optimizer = train_model(nlp, train_data, eval_data)
with nlp.use_params(optimizer.averages):
    nlp.to_disk(f'data/de_cat{model_ext}')

Training the model...
	LOSS 	  P  	  R  	  F  
0	173.222	0.690	0.195	0.304
1	121.184	0.658	0.336	0.444
2	87.401	0.663	0.423	0.516
3	62.140	0.647	0.517	0.575
4	46.587	0.625	0.503	0.558
5	34.685	0.625	0.503	0.558
6	27.226	0.646	0.564	0.602
7	22.280	0.640	0.584	0.611
8	16.681	0.638	0.557	0.595
9	14.156	0.619	0.557	0.587
10	11.124	0.600	0.564	0.581
11	10.292	0.627	0.597	0.612
12	8.654	0.610	0.577	0.593
13	6.837	0.607	0.591	0.599
14	5.844	0.613	0.584	0.598
15	4.416	0.622	0.597	0.610
early stopping


In [9]:
# Load the saved pipeline back from disk into a separate object.
nlp2 = spacy.load(f'data/de_cat{model_ext}')


In [15]:
# Run the reloaded pipeline on a sample sentence and show its 'OFFENSE' score.
doc = nlp2("Warum sind die Geier so gierig")
doc.cats['OFFENSE']

0.7233520150184631