In [None]:
!pip install transformers
!pip install spacy
!python -m spacy download de_core_news_lg

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

import torch

import numpy as np
import pandas as pd
from tqdm import tqdm

import re
from collections import Counter

from IPython.display import display, Markdown

import spacy
from spacy.matcher import PhraseMatcher

import torch
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Enables the `progress_apply` method used on the pandas objects below.
tqdm.pandas()

# German spaCy pipeline, loaded with its 'ner' component disabled.
nlp = spacy.load('de_core_news_lg', disable=['ner'])

In [None]:
# Read the speech sample; the first CSV column is used as the index.
speeches = pd.read_csv('../data/speeches_2010_sample.csv', sep=',', encoding='utf-8', index_col=0)
print(len(speeches))

# Keep only the rows that actually carry speech text, as an independent copy.
has_content = speeches['speech_content'].notnull()
speeches = speeches.loc[has_content].copy()
print(len(speeches))
speeches.head()

In [None]:
# Run every speech through spaCy. `nlp.pipe` is a generator, so tqdm cannot
# infer its length; passing `total` gives the bar a percentage and an ETA.
speeches['spacydoc'] = list(tqdm(nlp.pipe(speeches.speech_content), total=len(speeches)))

# Plain token strings of every processed speech.
speeches['tokens'] = speeches['spacydoc'].progress_apply(lambda doc: [t.text for t in doc])

In [None]:
dkz_path = '../data/DKZ_Suchworte_Systematik_und_Berufe_gueltig.xml'
dkz_df = pd.read_xml(dkz_path)

# Restrict the search-word table to the groups 'm' and 'w' and collect the
# distinct names found there.
in_mw_group = dkz_df['suchwortGruppe'].isin(['m', 'w'])
dkz_mw_df = dkz_df.loc[in_mw_group].copy()
berufe_list = list(dkz_mw_df['name'].unique())

# Phrase matcher comparing on the LEMMA attribute; each name is registered as
# its own pattern, keyed by the name string itself.
matcher_mw = PhraseMatcher(nlp.vocab, attr='LEMMA')
occ_nlp_mw = list(nlp.pipe(berufe_list))

for position, occupation in enumerate(berufe_list):
    matcher_mw.add(occupation, [occ_nlp_mw[position]])

In [None]:
model_checkpoint = 'johannabi/german_tc_professions_debates'

# Load tokenizer and weights exactly once and hand the same objects to the
# pipeline, instead of letting `pipeline` load a second copy of the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Run inference on the device selected in the setup cell (GPU if available);
# without `device=` the pipeline would not use that selection.
# Note: `model` is the very object used by the pipeline, so it lives on `device`.
token_classifier = pipeline("token-classification", model=model, tokenizer=tokenizer,
                            aggregation_strategy="max", device=device)

In [None]:
def predict_to_list(tokens, classifier=None):
    """Mark every token that is covered by a token-classifier prediction.

    The tokens are joined with single spaces, the joined string is passed to
    the classifier, and every predicted character span is projected back onto
    the tokens it touches.

    Args:
        tokens: Sequence of token strings.
        classifier: Callable that takes the joined string and returns a list
            of predictions, each providing character offsets under the keys
            'start' and 'end'. Defaults to the notebook-level
            ``token_classifier``.

    Returns:
        A list of ints (0 or 1) with one entry per token; 1 means the token
        lies inside a predicted span.
    """
    from bisect import bisect_left, bisect_right

    if classifier is None:
        classifier = token_classifier

    string = ' '.join(tokens)
    predictions = classifier(string)
    annotations = [0] * len(tokens)
    if len(predictions) == 0:
        return annotations

    # Identify the start and end character of every token inside `string`.
    # Both lists are strictly increasing, which is what makes bisect valid.
    starts, ends = [], []
    pointer = 0
    for t in tokens:
        starts.append(pointer)
        ends.append(pointer + len(t))
        pointer += len(t) + 1  # +1 for the joining space

    # Align predictions with tokens.
    for pred in predictions:
        # Greedy annotation: nearest possible lower boundary is the last token
        # starting at or before the span; nearest possible upper boundary is
        # the first token ending at or after it.
        start_token = bisect_right(starts, pred['start']) - 1
        end_token = bisect_left(ends, pred['end'])

        if start_token < 0 or end_token >= len(tokens):
            # The span does not fit inside the joined string: report it and
            # keep going instead of aborting the whole run.
            print(pred, '\n', tokens)
            continue

        for idx in range(start_token, end_token + 1):
            annotations[idx] = 1
    return annotations


def predict_spacydoc_to_list(spacydoc):
    """Collect token-level 0/1 labels for a spaCy doc, sentence by sentence.

    Sentences whose text is empty after stripping whitespace are not sent to
    `predict_to_list`; their tokens are labelled 0 directly.

    Args:
        spacydoc: Object exposing `.sents`; each sentence must provide `.text`,
            a length, and iteration over tokens that have `.text`.

    Returns:
        A flat list with the labels of all sentences concatenated in order.
    """
    labels = []
    for sentence in spacydoc.sents:
        if sentence.text.strip():
            labels += predict_to_list([token.text for token in sentence])
        else:
            # whitespace-only sentence: nothing to classify
            labels += [0] * len(sentence)
    return labels

# Token-level classifier labels for every processed speech.
speeches['BERT'] = speeches['spacydoc'].progress_apply(predict_spacydoc_to_list)

In [None]:
def phrasematch_to_list(doc, matcher):
    """Turn the spans reported by `matcher` into a per-token 0/1 list.

    Args:
        doc: Sized object that `matcher` accepts (one entry per token).
        matcher: Callable returning an iterable of `(match_id, start, end)`
            triples, where `start` is inclusive and `end` is exclusive.

    Returns:
        A list of length `len(doc)` holding 1 at every position that falls
        inside at least one reported span and 0 elsewhere.
    """
    found = matcher(doc)
    flags = [0 for _ in range(len(doc))]

    # Flatten all spans into the individual positions they cover.
    covered = (pos for _match_id, first, stop in found for pos in range(first, stop))
    for pos in covered:
        flags[pos] = 1
    return flags

# Token-level phrase-matcher labels for every processed speech.
speeches['matcher'] = speeches['spacydoc'].progress_apply(
    lambda doc: phrasematch_to_list(doc, matcher_mw)
)

In [None]:
# Write the id, tokens, raw text and both label columns to the result file.
export_columns = ['speech_id', 'tokens', 'speech_content', 'BERT', 'matcher']
export = speeches[export_columns]
export.to_csv('../data/speeches_2010_sample_result.csv')