# HW1: Графематический и морфологический анализ текста

In [1]:
import pandas as pd
import numpy as np

import pymorphy2
import xml.etree.ElementTree as etree
import opencorpora

morph = pymorphy2.MorphAnalyzer()

Проверим работу pymorphy2

In [2]:
morph.parse('матери')

[Parse(word='матери', tag=OpencorporaTag('NOUN,anim,femn sing,gent'), normal_form='матерь', score=0.2, methods_stack=((<DictionaryAnalyzer>, 'матери', 379, 1),)),
 Parse(word='матери', tag=OpencorporaTag('NOUN,anim,femn sing,gent'), normal_form='мать', score=0.2, methods_stack=((<DictionaryAnalyzer>, 'матери', 1917, 1),)),
 Parse(word='матери', tag=OpencorporaTag('NOUN,anim,femn sing,datv'), normal_form='матерь', score=0.1, methods_stack=((<DictionaryAnalyzer>, 'матери', 379, 2),)),
 Parse(word='матери', tag=OpencorporaTag('NOUN,anim,femn sing,loct'), normal_form='матерь', score=0.1, methods_stack=((<DictionaryAnalyzer>, 'матери', 379, 5),)),
 Parse(word='матери', tag=OpencorporaTag('NOUN,anim,femn plur,nomn'), normal_form='матерь', score=0.1, methods_stack=((<DictionaryAnalyzer>, 'матери', 379, 6),)),
 Parse(word='матери', tag=OpencorporaTag('VERB,impf,tran sing,impr,excl'), normal_form='материть', score=0.1, methods_stack=((<DictionaryAnalyzer>, 'матери', 463, 11),)),
 Parse(word='ма

Загружаем корпус:

In [3]:
corpus = opencorpora.load('annot.opcorpora.no_ambig.xml') 

In [4]:
tokens = corpus.tokens

Создаем датасет:

In [5]:
frame = pd.DataFrame({'tokens':[t.source for t in tokens], 
                      'morph_POS':[morph.parse(t.source)[0].tag.POS for t in tokens], 
                      'OC_POS':[t.gramm emes[0] for t in tokens], 
                      'morph':[morph.parse(t.source)[0].tag for t in tokens],
                      'OC':[t.grammemes for t in tokens], 
                      'ambig':[True if len(morph.parse(t.source))>1 else False for t in tokens]})

Убираем знаки препинания и неопределенные в OpenCorpa слова. Создаем новую таблицу, в которой оставляем только те слова, для которых возможна омонимия. 

In [6]:
frame = frame[frame.OC_POS != 'PNCT']
frame = frame[frame.OC_POS != 'UNKN']
ambigf = frame[frame.ambig == True]
ambigf

Unnamed: 0,tokens,morph_POS,OC_POS,morph,OC,ambig
2,злословия,NOUN,NOUN,"NOUN,inan,neut sing,gent","[NOUN, inan, neut, sing, gent]",True
6,язык,NOUN,NOUN,"NOUN,inan,masc sing,accs","[NOUN, inan, masc, sing, accs]",True
8,ли,PRCL,PRCL,PRCL,[PRCL],True
9,градус,NOUN,NOUN,"NOUN,inan,masc sing,nomn","[NOUN, inan, masc, sing, nomn]",True
10,дискуссии,NOUN,NOUN,"NOUN,inan,femn sing,gent","[NOUN, inan, femn, sing, gent]",True
11,в,PREP,PREP,PREP,[PREP],True
12,новом,ADJF,ADJF,"ADJF,Qual masc,sing,loct","[ADJF, Qual, masc, sing, loct]",True
18,злословия,NOUN,NOUN,"NOUN,inan,neut sing,gent","[NOUN, inan, neut, sing, gent]",True
21,в,PREP,PREP,PREP,[PREP],True
22,эфир,NOUN,NOUN,"NOUN,inan,masc sing,accs","[NOUN, inan, masc, sing, accs]",True


In [7]:
def check_homonimy(row):
    if row['OC_POS']=='LATN':
        if 'LATN' not in row['morph']:
            return 'LATN_homonimy'
        else:
            return 'NO'
    elif row['OC_POS']=='ROMN':
        if 'ROMN' not in row['morph']:
            return 'ROMN_homonimy'
        else:
            return 'NO' 
    elif row['morph_POS']!=row['OC_POS']:
        return 'POS_homonymy'
    elif len(row['OC'])==1:
        return 'NO'
    elif row['morph_POS']=='NOUN':
        if str(row['morph'].case) not in list(row['OC']):
            return 'Morphol_homonimy_Noun_Case'
        if str(row['morph'].number) not in list(row['OC']):
            return 'Morphol_homonimy_Noun_Number'        
        if str(row['morph'].gender) not in list(row['OC']):
            return 'Morphol_homonimy_Noun_Gender'
        if str(row['morph'].animacy) not in list(row['OC']):
            return 'Morphol_homonimy_Noun_Animacy'
        else:
            return 'NO'
    elif row['morph_POS']=='ADJF':
        if str(row['morph'].case) not in list(row['OC']):
            return 'Morphol_homonimy_ADJF_Case'
        if str(row['morph'].number) not in list(row['OC']):
            return 'Morphol_homonimy_ADJF_Number'
        if (row['morph'].number != 'plur') and (str(row['morph'].gender) not in list(row['OC'])):
                return 'Morphol_homonimy_ADJF_Gender'
        else:
            return 'NO'
    elif row['morph_POS']=='PRTF':                                 # причастия
        if str(row['morph'].case) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Case'
        if str(row['morph'].number) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Number'        
        if str(row['morph'].gender) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Gender'
        if str(row['morph'].transitivity) not in list(row['OC']):   # переходность
            return 'Morphol_homonimy_PRTF_transitivity'
        if str(row['morph'].aspect) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Aspect'
        if str(row['morph'].tense) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Tense'
        if str(row['morph'].voice) not in list(row['OC']):
            return 'Morphol_homonimy_PRTF_Voice'                 # залог
        else:
            return 'NO'
    elif row['morph_POS']=='NPRO':                                # местоимение-существительное
        if str(row['morph'].case) not in list(row['OC']):
            return 'Morphol_homonimy_NPRO_Case'
        if str(row['morph'].number) not in list(row['OC']):
            return 'Morphol_homonimy_NPRO_Number'
#        if str(row['morph'].person) not in list(row['OC']):
#            return 'Morphol_homonimy_NPRO_person'                 # лицо      
        else:
            return 'NO'
    elif row['morph_POS']=='NUMR':                               # числительные                    
        if str(row['morph'].case) not in list(row['OC']):
            return 'Morphol_homonimy_NUMR_Case'    
        else:
            return 'NO'       
    try:
        if set(row['OC']) in row['morph']:
            return 'NO'
    except ValueError:
        return "UNKNOWN"
    return "UNKNOWN"

In [8]:
def check_identity(row):
    try:
        r=set(row['OC']) in row['morph']
    except ValueError:
        return 0
    return r

def check_POS_ambig(row):
    return len(set([k.tag.POS for k in morph.parse(row['tokens'])]))>1

Создадим колонку Differ, содержащую True, если совпадает часть речи

In [9]:
ambigf['Differ'] = ambigf.apply(lambda row: check_identity(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Создадим колонку POS_ambig, содержащую True в случае возможности лексико-морфологической омонимии (множественности распознавания частей речи)

In [10]:
ambigf['POS_ambig'] = ambigf.apply(lambda row: check_POS_ambig(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Создадим главную колонку Homonimy, содержащую теги соответствующей омонимии. 

In [11]:
ambigf['Homonimy'] = ambigf.apply(lambda row: check_homonimy(row),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


UNKNOWN означает, что тег не присвоен по какой-то причине. Таких слов всего 6, помотрим вручную, что ошибок нет:

In [12]:
ambigf[ambigf.Homonimy == 'UNKNOWN']

Unnamed: 0,tokens,morph_POS,OC_POS,morph,OC,ambig,Differ,POS_ambig,Homonimy
13736,честнее,COMP,COMP,COMP,"[COMP, Qual]",True,False,False,UNKNOWN
31193,кругом,ADVB,ADVB,ADVB,"[ADVB, Prdx]",True,False,True,UNKNOWN
65073,кругом,ADVB,ADVB,ADVB,"[ADVB, Prdx]",True,False,True,UNKNOWN
77541,Кругом,ADVB,ADVB,ADVB,"[ADVB, Prdx]",True,False,True,UNKNOWN
82365,кругом,ADVB,ADVB,ADVB,"[ADVB, Prdx]",True,False,True,UNKNOWN
83332,кругом,ADVB,ADVB,ADVB,"[ADVB, Prdx]",True,False,True,UNKNOWN


In [13]:
k = ambigf['Homonimy'].value_counts()
print(k)

NO                               22269
Morphol_homonimy_Noun_Case        4086
Morphol_homonimy_ADJF_Case        1270
POS_homonymy                      1024
Morphol_homonimy_ADJF_Gender       294
Morphol_homonimy_Noun_Gender       226
Morphol_homonimy_Noun_Animacy      180
Morphol_homonimy_NPRO_Case         108
Morphol_homonimy_PRTF_Gender        71
Morphol_homonimy_PRTF_Case          44
ROMN_homonimy                       15
Morphol_homonimy_Noun_Number        14
Morphol_homonimy_NUMR_Case          13
UNKNOWN                              6
LATN_homonimy                        3
Name: Homonimy, dtype: int64


In [14]:
possible_hom = ambigf.shape[0]
num_words = frame.shape[0]
no_mistakes=k[0]
mistakes = sum(k[1:-1])
print('\n Correct:', num_words-mistakes, '\n wrong:', mistakes, '\n Percent of correct:', (num_words-mistakes)/num_words )


 Correct: 59963 
 wrong: 7351 
 Percent of correct: 0.8907953768903942


In [15]:
for i in k[1:-1]:
    print(i/mistakes*100)

55.58427424840158
17.276561012107198
13.930077540470684
3.9994558563460756
3.074411644674194
2.4486464426608627
1.4691878655965174
0.9658549857162292
0.5985580193170997
0.20405387022173854
0.19045027887362262
0.17684668752550675
0.08162154808869541


In [16]:
ambigf[ambigf.Homonimy == 'POS_homonymy']

Unnamed: 0,tokens,morph_POS,OC_POS,morph,OC,ambig,Differ,POS_ambig,Homonimy
33,это,PRCL,NPRO,PRCL,"[NPRO, neut, sing, nomn]",True,False,True,POS_homonymy
51,Это,PRCL,NPRO,PRCL,"[NPRO, neut, sing, nomn]",True,False,True,POS_homonymy
86,Это,PRCL,NPRO,PRCL,"[NPRO, neut, sing, nomn]",True,False,True,POS_homonymy
291,ночному,NOUN,ADJF,"NOUN,inan,neut sing,datv","[ADJF, masc, sing, datv]",True,False,True,POS_homonymy
375,и,CONJ,PRCL,CONJ,[PRCL],True,False,True,POS_homonymy
379,Маленькой,NOUN,ADJF,"NOUN,anim,femn sing,gent","[ADJF, Qual, femn, sing, loct]",True,False,True,POS_homonymy
435,осенью,NOUN,ADVB,"NOUN,inan,femn sing,ablt",[ADVB],True,False,True,POS_homonymy
438,зимой,ADVB,NOUN,ADVB,"[NOUN, inan, femn, sing, ablt]",True,False,True,POS_homonymy
596,тоже,ADVB,PRCL,ADVB,[PRCL],True,False,True,POS_homonymy
606,устарела,ADJS,VERB,"ADJS femn,sing","[VERB, perf, intr, femn, sing, past, indc]",True,False,True,POS_homonymy


In [17]:
toc = ambigf[ambigf.Homonimy!='NO']['tokens'].value_counts()
print(toc[1:10])

России     52
тоже       45
этом       44
США        42
человек    32
мира       29
Ссылки     26
века       25
этой       25
Name: tokens, dtype: int64


In [18]:
morph.parse('арене')

[Parse(word='арене', tag=OpencorporaTag('NOUN,anim,masc,Name sing,loct'), normal_form='арен', score=0.3333333333333333, methods_stack=((<DictionaryAnalyzer>, 'арене', 26, 5),)),
 Parse(word='арене', tag=OpencorporaTag('NOUN,inan,femn sing,datv'), normal_form='арена', score=0.3333333333333333, methods_stack=((<DictionaryAnalyzer>, 'арене', 55, 2),)),
 Parse(word='арене', tag=OpencorporaTag('NOUN,inan,femn sing,loct'), normal_form='арена', score=0.3333333333333333, methods_stack=((<DictionaryAnalyzer>, 'арене', 55, 6),))]

In [26]:
ambigf[28:38]

Unnamed: 0,tokens,morph_POS,OC_POS,morph,OC,ambig,Differ,POS_ambig,Homonimy
55,в,PREP,PREP,PREP,[PREP],True,True,True,NO
57,тут,ADVB,ADVB,"ADVB,Dmns","[ADVB, Dmns]",True,True,True,NO
58,же,PRCL,PRCL,PRCL,[PRCL],True,True,True,NO
59,появившихся,PRTF,PRTF,"PRTF,perf,intr,past,actv plur,gent","[PRTF, perf, intr, past, actv, plur, gent]",True,True,False,Morphol_homonimy_PRTF_Gender
60,рекламных,ADJF,ADJF,"ADJF,Qual plur,gent","[ADJF, Qual, plur, gent]",True,True,False,NO
63,отсутствовавших,PRTF,PRTF,"PRTF,impf,intr,past,actv plur,gent","[PRTF, impf, intr, past, actv, plur, gent]",True,True,False,Morphol_homonimy_PRTF_Gender
64,на,PREP,PREP,PREP,[PREP],True,True,True,NO
66,Культуре,NOUN,NOUN,"NOUN,inan,femn sing,datv","[NOUN, inan, femn, sing, loct]",True,False,False,Morphol_homonimy_Noun_Case
71,с,PREP,PREP,PREP,[PREP],True,True,True,NO
72,одной,ADJF,ADJF,"ADJF,Apro femn,sing,gent","[ADJF, Apro, Anum, femn, sing, gent]",True,False,False,NO
