# NLP pipeline tests

## 1. Lemmatizer test: stanza
Performance test, speed and quality

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
import stanza
stanza.download('sv')  # Download the Swedish ('sv') models; this only needs to be run once

Compare three pipelines with different settings: the default pipeline, tokenizer + lemmatizer only, and tokenizer + POS tagger + lemmatizer.

In [2]:
# Pipeline 1: Swedish pipeline created without a `processors` argument; the load
# log below lists the processors it ends up with (tokenize, pos, lemma, depparse).
nlp1 = stanza.Pipeline('sv')

2021-12-10 19:47:44 INFO: Loading these models for language: sv (Swedish):
| Processor | Package   |
-------------------------
| tokenize  | talbanken |
| pos       | talbanken |
| lemma     | talbanken |
| depparse  | talbanken |

2021-12-10 19:47:44 INFO: Use device: cpu
2021-12-10 19:47:44 INFO: Loading: tokenize
2021-12-10 19:47:44 INFO: Loading: pos
2021-12-10 19:47:45 INFO: Loading: lemma
2021-12-10 19:47:45 INFO: Loading: depparse
2021-12-10 19:47:45 INFO: Done loading processors!


In [3]:
# Pipeline 2: tokenizer and lemmatizer only (no POS tagger).
# use_gpu=True is requested, but the load log below reports "Use device: cpu".
# NOTE(review): pos_batch_size is passed although 'pos' is not among the
# processors -- presumably it has no effect here; confirm.
nlp2 = stanza.Pipeline(
    'sv',
    processors='tokenize,lemma',
    use_gpu=True,
    pos_batch_size=3000,
)

2021-12-10 19:47:45 INFO: Loading these models for language: sv (Swedish):
| Processor | Package   |
-------------------------
| tokenize  | talbanken |
| lemma     | talbanken |

2021-12-10 19:47:45 INFO: Use device: cpu
2021-12-10 19:47:45 INFO: Loading: tokenize
2021-12-10 19:47:45 INFO: Loading: lemma
2021-12-10 19:47:46 INFO: Done loading processors!


In [4]:
# Pipeline 3: tokenizer, POS tagger and lemmatizer (pipeline 1 without depparse).
# use_gpu=True is requested, but the load log below reports "Use device: cpu".
nlp3 = stanza.Pipeline(
    'sv',
    processors='tokenize,pos,lemma',
    use_gpu=True,
    pos_batch_size=3000,
)

2021-12-10 19:47:46 INFO: Loading these models for language: sv (Swedish):
| Processor | Package   |
-------------------------
| tokenize  | talbanken |
| pos       | talbanken |
| lemma     | talbanken |

2021-12-10 19:47:46 INFO: Use device: cpu
2021-12-10 19:47:46 INFO: Loading: tokenize
2021-12-10 19:47:46 INFO: Loading: pos
2021-12-10 19:47:46 INFO: Loading: lemma
2021-12-10 19:47:46 INFO: Done loading processors!


Load test data, downloaded from https://data.riksdagen.se/

In [5]:
# Read the speech export and keep the rows at positions 1-10 as a small test sample.
speeches1718 = (
    pd.read_csv('anforande-201718.csvt', sep=',')
    .iloc[1:11, :]
)
print(speeches1718['anforandetext'])

# All sample speeches as one string, separated by blank lines.
speech_joint = "\n\n".join(speeches1718['anforandetext'].values)

1      Svar på interpellationer  Herr talman! Jeff A...
2      Herr talman! Lotta Finstorp har frågat mig vi...
3      Herr talman! Jag tackar ministern för svaret....
4      Herr talman! Jag tackar Lotta Finstorp för de...
5      Herr talman! Jag tackar ministern. Det känns ...
6      Herr talman! Tack för inlägget  Lotta Finstor...
7      Herr talman! Att den enskilde rent fysiskt si...
8      Herr talman! Tack  Lotta Finstorp  för ditt i...
9      Herr talman! Maria Stockhaus har frågat mig v...
10     Herr talman! Tack  utbildningsministern  för ...
Name: anforandetext, dtype: object


In [6]:
# Preprocessing functions for the pipelines
def lemmatize_with(pipeline, text):
    """Run ``text`` through ``pipeline`` and return its lemmas as a flat list.

    Shared implementation for ``preprocess1``/``preprocess2``/``preprocess3``,
    which differ only in the pipeline they use.

    Args:
        pipeline: Callable that takes a text and returns a document whose
            ``sentences`` each expose ``words`` carrying a ``lemma`` attribute
            (here: one of the stanza pipelines).
        text: The text to process.

    Returns:
        list: One lemma per word, in document order.
    """
    doc = pipeline(text)
    return [word.lemma for sentence in doc.sentences for word in sentence.words]


def preprocess1(text):
    """Return the lemmas of ``text`` produced by pipeline ``nlp1``."""
    return lemmatize_with(nlp1, text)


def preprocess2(text):
    """Return the lemmas of ``text`` produced by pipeline ``nlp2``."""
    return lemmatize_with(nlp2, text)


def preprocess3(text):
    """Return the lemmas of ``text`` produced by pipeline ``nlp3``."""
    return lemmatize_with(nlp3, text)


In [7]:
# Lemmatize the speech with index label 1 (the first row of the sample) with
# each of the three pipelines, in order.
p1, p2, p3 = (
    preprocess(speeches1718['anforandetext'][1])
    for preprocess in (preprocess1, preprocess2, preprocess3)
)

In [8]:
# Pipeline 2 yields worse results due to the missing POS tagger.
# The lemmas are compared position by position; a length mismatch is reported
# explicitly because the positional comparison is unreliable in that case.
if len(p1) != len(p2):
    print(f"Length mismatch: {len(p1)} vs {len(p2)} lemmas")
for lemma1, lemma2 in zip(p1, p2):
    if lemma1 != lemma2:
        print(lemma1, lemma2)

print('------')

# Compare pipeline 1 against pipeline 3 (p1 vs p3, not p3 vs itself).
# No output below the separator means both pipelines produced identical
# lemmas for this text.
if len(p1) != len(p3):
    print(f"Length mismatch: {len(p1)} vs {len(p3)} lemmas")
for lemma1, lemma3 in zip(p1, p3):
    if lemma1 != lemma3:
        print(lemma1, lemma3)

Herr her
Jeff jeff
Ahl ahl
åtgärda åtgära
Jeff jeff
Ahl ahl
sjukhusläkare Sjukhusläkaren
understryka understryk
artikel artikeln
inkoma inkom
inspektion inspektionen
personalbrist personalbri
dödsfall dödsfallen
vidta vidtaga
patientsäkerhetslag patientsäkerhetslagen
minimera minimer
vårdskada vårdskadar
socialstyrelse Socialstyrelsen
lansera lanserat
myndighet myndigheten
vårdskada vårdskadar
en den
kommun kommuner
yrkesgrupp yrkesgrupper
vårdplats vårdplatser
vidta vidtaga
avsätta avsatt
öronmärka öronmärk
barnmorska barnmorskar
socialstyrelse Socialstyrelsen
förstärka förstärk
ge gen
socialstyrelse Socialstyrelsen
myndighet myndigheten
vårdskada vårdskadar
MERGEFORMAT meRGEFORMe
rikta riktad
bemanning bemanningen
rätta rätt
kompetensförsörjningen kompetensförsörjningsproblemen
vårdplats vårdplatser
vårdplats vårdplatser
uppehålla uppehåll
utskrivningsklara utskrivningsklar
sliten slita
vårdplats vårdplatser
utskrivningsklara utskrivningsklar
prioritera prioriter
behov behova
poängte

In [9]:
# Time pipeline 1 when stanza is called once per speech through the
# vectorizer's tokenizer hook.
# NOTE(review): TfidfVectorizer lowercases its input by default before the
# tokenizer runs, so stanza presumably receives lowercased text here -- confirm
# this is intended.
t_start = time()
vectorizer1 = TfidfVectorizer(tokenizer=preprocess1)
speech_vec1 = vectorizer1.fit_transform(speeches1718['anforandetext'].values)
t = time() - t_start

print(f"Pipeline 1: {round(t, 2)} seconds.")
print(speech_vec1.shape)

Pipeline 1: 53.35 seconds.
(10, 970)


In [11]:
# Time pipeline 2 (no POS tagger) when stanza is called once per speech through
# the vectorizer's tokenizer hook.
t_start = time()
vectorizer2 = TfidfVectorizer(tokenizer=preprocess2)
speech_vec2 = vectorizer2.fit_transform(speeches1718['anforandetext'].values)
t = time() - t_start

print(f"Pipeline 2: {round(t, 2)} seconds.")
print(speech_vec2.shape)

Pipeline 2: 30.09 seconds.
(10, 1008)


In [12]:
# Time pipeline 2 on all sample speeches concatenated into one string.
# The vectorizer gets its own name (matching speech_vec21) so that `vectorizer2`,
# fitted on the individual speeches, is not silently overwritten.
t = time()
vectorizer21 = TfidfVectorizer(tokenizer=preprocess2)
speech_vec21 = vectorizer21.fit_transform([speech_joint])
t = time() - t
print(f"Pipeline 2 (with concatenated text): {round(t, 2)} seconds.")

# The matrix has a single row because the vectorizer is given exactly one
# (concatenated) document. According to
# https://stanfordnlp.github.io/stanza/pipeline.html blank lines are treated as
# breaks inside the stanza pipeline; they do not split the input into separate
# documents for the vectorizer, so per-speech vectors cannot be obtained this way.
print(speech_vec21.shape)

Pipeline 2 (with concatenated text): 9.16 seconds.
(1, 1008)


Running the stanza pipeline outside of TfidfVectorizer, on a batch of stanza Documents

In [14]:
t = time()

# Batch mode: wrap every speech in a stanza.Document and hand the whole list
# to the pipeline in a single call.
in_docs = [
    stanza.Document([], text=speech)
    for speech in speeches1718.anforandetext.values
]
out_docs = nlp2(in_docs)

# One space-separated lemma string per processed document.
tokens = [
    ' '.join(word.lemma for sentence in doc.sentences for word in sentence.words)
    for doc in out_docs
]

# The text is already lemmatized, so the vectorizer's default tokenizer is used.
vectorizer22 = TfidfVectorizer()
speech_vec22 = vectorizer22.fit_transform(tokens)

t = time() - t
print(f"Pipeline 2 (with stanza docs): {round(t, 2)} seconds.")
print(speech_vec22.shape)  # one row per speech

Pipeline 2 (with stanza docs): 9.76 seconds.
(10, 984)


In [15]:
t = time()

# Batch mode with pipeline 1. Relies on `in_docs` (the list of stanza.Document
# objects) built in the pipeline 2 batch cell.
out_docs1 = nlp1(in_docs)

# One space-separated lemma string per processed document.
tokens = [
    ' '.join(word.lemma for sentence in doc.sentences for word in sentence.words)
    for doc in out_docs1
]

# The text is already lemmatized, so the vectorizer's default tokenizer is used.
vectorizer12 = TfidfVectorizer()
speech_vec12 = vectorizer12.fit_transform(tokens)

t = time() - t
print(f"Pipeline 1 (with stanza docs): {round(t, 2)} seconds.")
print(speech_vec12.shape)  # one row per speech


Pipeline 1 (with stanza docs): 31.16 seconds.
(10, 941)


In [21]:
# The method with stanza documents provides equally good results:
# compare the lemmas pipeline 1 produced for the single text (p1) with the
# lemmas of the first batch-processed document. `tokens` must still hold the
# pipeline 1 batch output, i.e. the cell building it from `out_docs1` has to be
# the last one that assigned `tokens`.
p12 = tokens[0].split(' ')

# Compare position by position; report a length mismatch explicitly instead of
# failing with an IndexError or silently ignoring surplus lemmas.
if len(p1) != len(p12):
    print(f"Length mismatch: {len(p1)} vs {len(p12)} lemmas")
for lemma_single, lemma_batch in zip(p1, p12):
    if lemma_single != lemma_batch:
        print(lemma_single, lemma_batch)

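Passing the speeches to the pipeline as a batch of stanza Documents, instead of calling the pipeline once per speech from inside TfidfVectorizer, seems to cut processing time almost in half when the POS tagger is used (pipeline 1: 53.35 s → 31.16 s) and by more than half when it is not (pipeline 2: 30.09 s → 9.76 s).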

In [None]:
# vocab = vectorizer.vocabulary_
# inv_vocab = {val:key for key, val in vocab.items()}