### Проверяем качество работы морфологического и синтаксического анализаторов SpaCy на корпусах из Universal Dependencies, размеченных в формате CoNLL-U
Подробнее: https://universaldependencies.org/

In [1]:
import conllu
import spacy
import pandas as pd

In [2]:
syntagrus = []
f1 = open('ru_syntagrus-full.conllu', 'r', encoding='UTF-8')
for tokenlist in conllu.parse_incr(f1):
    syntagrus.append(tokenlist)

In [3]:
pud = []
f2 = open('ru_pud-ud-test.conllu', 'r', encoding='UTF-8')
for tokenlist in conllu.parse_incr(f2):
    pud.append(tokenlist)

In [4]:
gsd = []
f3 = open('ru_gsd-full.conllu', 'r', encoding='UTF-8')
for tokenlist in conllu.parse_incr(f3):
    gsd.append(tokenlist)

In [5]:
taiga = []
f4 = open('ru_taiga-full.conllu', 'r', encoding='UTF-8')
for tokenlist in conllu.parse_incr(f4):
    taiga.append(tokenlist)

* предложние - объект класса conllu.models.TokenList; хранит токены
* токен - объект класса conllu.models.Token; хранит грамматическую и синтаксическую информацию

In [28]:
pud[0], type(pud[0]), pud[0][2], type(pud[0][2])

(TokenList<«, Если, передача, цифровых, технологий, сегодня, в, США, происходит, впервые, ,, то, о, мирной, передаче, власти, такого, не, скажешь, », ,, –, написала, Кори, Шульман, ,, специальный, помощник, президента, Обамы, в, своем, блоге, в, понедельник, .>,
 conllu.models.TokenList,
 {'id': 3,
  'form': 'передача',
  'lemma': 'передача',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': {'Animacy': 'Inan',
   'Case': 'Nom',
   'Gender': 'Fem',
   'Number': 'Sing'},
  'head': 9,
  'deprel': 'nsubj',
  'deps': None,
  'misc': None},
 conllu.models.Token)

In [6]:
nlp = spacy.load('ru_core_news_lg')

In [31]:
def tokens_filter(corpus):
    
    '''takes a list of conllu labeled sentences
    returns an output_list (list of lists), where output_list[x][0] is a spacy parsed sentence 
    and output_list[1] is an initial sentence from the corpus
    and their tokens are identical'''
    
    output_list = []
    
    for sent in corpus:
        actual_tokens = []
        spacy_tokens = []
        for w in sent:
            actual_tokens.append(w['form'])
            
        for w in nlp(sent.metadata['text']):
            spacy_tokens.append(w.text)
        
        if actual_tokens == spacy_tokens and len(spacy_tokens) == len(sent):
            
            output_list.append([nlp(sent.metadata['text']), sent])
    return output_list

In [30]:
def compare_morph(corpus):
    
    '''takes output of tokens_filter function
    returns accuracy metrics for lemmas, parts of speech and grammatical features '''
    
    sim_lemmas = 0
    all_lemmas = 0
    sim_pos = 0
    all_pos = 0
    sim_tags = 0
    all_tags = 0
    
    for pair in corpus:
        for i in range(len(pair[0])):
            if pair[1][i]['upos'] != 'PUNCT':
            
                if pair[0][i].lemma_.lower() == pair[1][i]['lemma'].lower():
                    sim_lemmas += 1
                all_lemmas += 1
            
                if pair[0][i].pos_ == pair[1][i]['upos']:
                    sim_pos += 1
                all_pos += 1
            
                if pair[0][i].morph.to_dict() != {} and pair[1][i]['feats'] != None:
                    
                    # consider the same feature named differently in corpus sentence and in spacy parsed sentence   
                    if 'StyleVariant' in pair[0][i].morph.to_dict() and 'Variant' in pair[1][i]['feats']:
                        if pair[0][i].morph.to_dict()['StyleVariant'] == pair[1][i]['feats']['Variant']:
                            sim_tags+=1
                            
                    # consider the same meaning named differently in corpus sentence and in spacy parsed sentence        
                    if 'Person' in pair[0][i].morph.to_dict() and 'Person' in pair[1][i]['feats']:
                    #    
                        dict = pair[0][i].morph.to_dict()
                        
                        if dict['Person'] == 'First' and pair[1][i]['feats']['Person'] == '1':
                            dict['Person'] = '1'
                            
                        elif dict['Person'] == 'Second' and pair[1][i]['feats']['Person'] == '2':
                            dict['Person'] = '2'
                        
                        elif dict['Person'] == 'Third' and pair[1][i]['feats']['Person'] == '3':
                            dict['Person'] = '3'
                        
                        if dict == pair[1][i]['feats']:
                            sim_tags+=1           
                        
                    if pair[0][i].morph.to_dict() == pair[1][i]['feats']:
                        sim_tags += 1
                    all_tags += 1
    
    acc_lemmas = sim_lemmas / all_lemmas
    acc_pos = sim_pos / all_pos
    acc_tags = sim_tags / all_tags
    
    return acc_lemmas, acc_pos, acc_tags

In [29]:
def compare_syntax(corpus):
    
    '''takes output of tokens_filter function
    returns UAS and LAS metrics'''
    
    sim_heads = 0
    sim_deprels = 0
    heads = 0
    
    for pair in corpus:
        for i in range(len(pair[0])):
            if pair[1][i]['upos'] != 'PUNCT':
                
                if pair[1][i]['head'] == pair[0][i].head.i+1 \
                or (pair[1][i]['head'] == 0 and pair[1][i]['form'] == pair[0][i].head.text):
                    sim_heads += 1
                    
                if pair[1][i]['deprel'] == pair[0][i].dep_ \
                or (pair[1][i]['deprel'] == 'root' and pair[0][i].dep_ == 'ROOT'):
                    sim_deprels += 1
                heads+=1
                
    UAS = sim_heads / heads
    LAS = sim_deprels / heads
    return UAS, LAS            

In [10]:
taiga_doc = tokens_filter(taiga)
taiga_morph, taiga_syntax = compare_morph(taiga_doc), compare_syntax(taiga_doc)

In [11]:
pud_doc = tokens_filter(pud)
pud_morph, pud_syntax = compare_morph(pud_doc), compare_syntax(pud_doc)

In [12]:
gsd_doc = tokens_filter(gsd)
gsd_morph, gsd_syntax = compare_morph(gsd_doc), compare_syntax(gsd_doc)

In [344]:
syntagrus_doc = tokens_filter(syntagrus)
syntagrus_morph, syntagrus_syntax = compare_morph(syntagrus_doc), compare_syntax(syntagrus_doc)

In [349]:
df_morph = pd.DataFrame()
df_morph['gsd'] = gsd_morph
df_morph['taiga'] = taiga_morph
df_morph['pud'] = pud_morph
df_morph['syntagrus'] = syntagrus_morph
df_morph.rename({0: 'acc_lemma', 1: 'acc_pos', 2: 'acc_features'})

Unnamed: 0,gsd,taiga,pud,syntagrus
acc_lemma,0.916095,0.898678,0.917125,0.906177
acc_pos,0.951431,0.924969,0.967212,0.964253
acc_features,0.853498,0.701178,0.905653,0.842082


In [350]:
df_syntax = pd.DataFrame()
df_syntax['gsd'] = gsd_syntax
df_syntax['taiga'] = taiga_syntax
df_syntax['pud'] = pud_syntax
df_syntax['syntagrus'] = syntagrus_syntax
df_syntax.rename({0: 'UAS', 1: 'LAS'})

Unnamed: 0,gsd,taiga,pud,syntagrus
UAS,0.891469,0.837283,0.935365,0.903223
LAS,0.860596,0.828951,0.912203,0.88822
