In [2]:
import spacy
import stanza

import pandas as pd
import numpy as np
import re
import string

## Setting up the pipeline

In [3]:
LANG_1 = 'en'
LANG_2 = 'fr'

MODELS = {
    'en': 'en_core_web_sm',
    'fr': 'fr_core_news_sm',
    'nl': 'nl_core_news_sm',
    'fi': 'fi_core_news_sm'
}

PIPELINE = 'spacy' # 'spacy' or 'stanza'

# Load the language models
if PIPELINE == 'stanza':
    nlp1 = stanza.Pipeline(lang=LANG_1, processors='tokenize,mwt,pos,lemma')
    nlp2 = stanza.Pipeline(lang=LANG_2, processors='tokenize,mwt,pos,lemma')
else:
    nlp1 = spacy.load(MODELS[LANG_1])
    nlp2 = spacy.load(MODELS[LANG_2])

AttributeError: module 'spacy' has no attribute 'load'

## Loading the documents

In [None]:
with open(f'{LANG_1}.txt',encoding="utf8") as f:
    doc1_raw = f.readlines()

In [None]:
# Clean the text

doc1_clean = list()
pattern = re.compile('<[^>]*>')
for line in range(0,len(doc1_raw)):
    text = doc1_raw[line]      
    if not (pattern.match(text)):
        if len(text)>1:
            doc1_clean.append(text.strip())

In [None]:
with open(f'{LANG_2}.txt',encoding="utf8") as f:
    doc2_raw = f.readlines()

In [None]:
# Clean the text

doc2_clean = list()
pattern = re.compile('<[^>]*>')
for line in range(0,len(doc2_raw)):
    text = doc2_raw[line]      
    if not (pattern.match(text)):
        if len(text)>1:
            doc2_clean.append(text.strip())

In [None]:
docs = {
    LANG_1: doc1_clean,
    LANG_2: doc2_clean
}

## Loading the data into the pipeline

In [None]:
# Load the data
if PIPELINE == 'stanza':
    doc1 = [ stanza.Document(text=doc, lang=LANG_1) for doc in docs[LANG_1] ]
    doc2 = [ stanza.Document(text=doc, lang=LANG_2) for doc in docs[LANG_2] ]
else:
    doc1 = [ doc for doc in docs[LANG_1] ]
    doc2 = [ doc for doc in docs[LANG_2] ]

TypeError: Document.__init__() got an unexpected keyword argument 'lang'

In [None]:
# Process the data
if PIPELINE == 'stanza':
    doc1_processed = [ nlp1(doc) for doc in doc1 ]
    doc2_processed = [ nlp2(doc) for doc in doc2 ]
else:
    doc1_processed = [ nlp1(doc) for doc in doc1 ]
    doc2_processed = [ nlp2(doc) for doc in doc2 ]

## Restructuring data into a tree

In [None]:
doc1_tree = dict()

if PIPELINE == 'stanza':
    for line in doc1_processed:
        for sentence in line.sentences:
            for token in sentence:
                if token.pos != 'PUNCT':
                    doc1_tree[token.id] = (
                        token.xpos, 
                        [child.id if child.head == token.id else None for child in doc1_processed]
                    )

In [None]:
doc2_tree = dict()

if PIPELINE == 'stanza':
    for sentence in doc2_processed:
        for token in sentence:
            if token.pos != 'PUNCT':
                doc2_tree[token.id] = (
                    token.xpos, 
                    [child.id if child.head == token.id else None for child in doc2_processed]
                )

In [None]:
print(doc1_processed)

[]
