# SIGMORPHON 2018 - Hungarian data analysis

In [1]:
import pandas as pd
from collections import defaultdict
import subprocess

In [2]:
# Path of the emMorph transducer file that `run_hfst_ana` hands to `hfst-lookup`.
model = "../../../../repo/emMorph/hfst/hu.hfstol"

def parse_hfst_output(output):
    """Group the analyses in hfst-lookup's textual output by word.

    Every non-blank line is split on tabs; the first field is taken as the
    word and the second as one analysis of it. A blank line ends the group
    of the current word.

    Yields:
        (word, analyses) tuples, where `analyses` is the list of analysis
        strings collected for `word`, in output order.
    """
    analyses = []
    current_word = None
    for raw_line in output.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            # Blank line: flush whatever has been collected so far.
            if analyses:
                yield current_word, analyses
            analyses = []
            continue
        current_word, analysis = stripped.split("\t")[:2]
        analyses.append(analysis)
    # The output may end without a trailing blank line.
    if analyses:
        yield current_word, analyses
        
def run_hfst_ana(words):
    """Analyze `words` with `hfst-lookup` and collect the analyses per word.

    Parameters:
        words: iterable of strings; they are sent to the tool one per line.

    Returns:
        dict mapping each word echoed by `hfst-lookup` to the list of its
        analysis strings (see `parse_hfst_output`). A word that occurs
        several times in the input ends up with a single entry.

    Raises:
        RuntimeError: if `hfst-lookup` exits with a non-zero status. The
            tool's stderr is included in the message.
    """
    # An argument list (no shell) keeps the model path from being
    # interpreted by a shell.
    command = ["hfst-lookup", model, "--cascade=composition"]
    process = subprocess.Popen(command,
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate("\n".join(words).encode("utf8"))
    if process.returncode != 0:
        # Fail loudly instead of silently returning an empty result.
        raise RuntimeError("hfst-lookup exited with status {}: {}".format(
            process.returncode, stderr.decode("utf8", errors="replace").strip()))
    return dict(parse_hfst_output(stdout.decode("utf8")))

# Load data into DataFrames and run HFST

In [3]:
# Both files are read as three unnamed tab-separated columns: lemma, word form, tags.
hun_train = pd.read_table(
    "../../data/conll2018/task1/all/hungarian-train-high",
    names=["lemma", "word", "tags"],
)
hun_dev = pd.read_table(
    "../../data/conll2018/task1/all/hungarian-dev",
    names=["lemma", "word", "tags"],
)
# Stack train and dev and renumber the rows from 0.
hun = pd.concat([hun_train, hun_dev], ignore_index=True)

### Replace `singular` with the lemma

Nominative case is unmarked in Hungarian.

In [4]:
# Rows whose word form is the literal string 'singular' get their lemma as word form.
is_placeholder = hun['word'] == 'singular'
hun.loc[is_placeholder, 'word'] = hun['lemma']
# One-column frame indexed by word; the column holds the list of HFST analyses.
ana = pd.Series(run_hfst_ana(hun["word"])).to_frame(name="hfst")
# Inner join on the word form attaches the analyses to every row.
hun = hun.merge(ana, left_on="word", right_index=True)
hun.head()

Unnamed: 0,lemma,word,tags,hfst
0,teknős,teknősökről,N;ON+ABL;PL,"[teknő[/N]s[_Nz:s/N]ök[Pl]ről[Del], teknős[/N]..."
1,hideg,hidegekbe,N;IN+ALL;PL,"[hideg[/Adj]ek[Pl]be[Ill], hideg[/N]ek[Pl]be[I..."
2,erősebb,erősebben,N;IN+ESS;SG,"[erő[/N]s[_Adjz:s/Adj]ebb[_Comp/Adj]en[Ine], e..."
3,kormányfő,kormányfőknek,N;DAT;PL,"[kormány[/N]fő[/N]k[Pl]nek[Dat], kormányfő[/N]..."
4,belga,belgáknál,N;IN+ALL;PL,[belga[/Adj|nat]k[Pl]nál[Ade]]


## HFST fail cases

HFST prints `+?` if it cannot analyze a word form.

In [5]:
def does_hfst_fail(ana):
    """Return True if the first analysis in `ana` carries HFST's `+?` marker.

    Only the first element of `ana` is inspected.
    """
    first_analysis = ana[0]
    return "+?" in first_analysis

# Flag every row whose word HFST could not analyze and report how many there are.
hun["hfst_no_ana"] = hun["hfst"].map(does_hfst_fail)
n_unanalyzed = hun["hfst_no_ana"].sum()
print("Couldn't analyze {} words".format(n_unanalyzed))

Couldn't analyze 194 words


## POS

Just because we can.

In [6]:
# The part of speech is the first ';'-separated field of the Unimorph tag string.
hun["pos"] = hun["tags"].map(lambda tag_string: tag_string.partition(";")[0])
hun["pos"].value_counts()

N         9162
V         1711
V.PTCP      98
V.CVB       29
Name: pos, dtype: int64

## Matching Unimorph tags with one of HFST's outputs

Here I'm trying to find the best match out of HFST's multiple outputs (usually 1-5 options). All Unimorph tags are being tested against the candidates.

In [7]:
# Unimorph case tag -> the case tag that has to occur in a matching HFST analysis.
# Insertion order matters: `noun_case_incorrect` walks the values in this order
# and records only the first other case it finds in the analyses.
noun_case_mapping = {
    "AT+ABL": "[Abl]",
    "AT+ALL": "[All]",
    "AT+ESS": "[Ade]",
    "IN+ABL": "[Ela]",
    "IN+ALL": "[Ill]",
    "IN+ESS": "[Ine]",
    "ON+ABL": "[Del]",
    "ON+ALL": "[Subl]",
    "ON+ESS": "[Supe]",
    "INST": "[Ins]",
    "PRP": "[Cau]",
    "TRANS": "[Transl]",
    "TERM": "[Ter]",
    "DAT": "[Dat]",
    "ACC": "[Acc]",
    "NOM": "[Nom]",
    "FRML": "[EssFor:ként]",
}

# Unimorph tag -> substring that has to occur in a matching HFST analysis.
# Several values are only fragments of a tag (e.g. "[Cond", ".1"), so they match
# every analysis that contains the fragment anywhere.
tag_mapping = {
    "V": "/V",
    "COND": "[Cond",
    "DEF": "Def",
    "1": ".1",
    "2": ".2",
    "3": ".3",
    "FUT": "Fut",
    "SBJV": "Sbjv",
    "V.CVB": "[_AdvPtcp/Adv]",
    "V.PTCP": "Ptcp",
    "INDF": ".NDef.",
    "NFIN": "[Inf",
}

def substr_match(s):
    """Build a checker that tests whether `s` occurs in an HFST analysis.

    The returned callable takes `(hfst_ana, tags)` like every other entry of
    `correct_func`; `tags` is accepted but not used.
    """
    return lambda hfst_ana, tags: s in hfst_ana


# One substring checker per mapped tag: noun cases first, then the remaining tags.
correct_func = {
    tag: substr_match(marker)
    for mapping in (noun_case_mapping, tag_mapping)
    for tag, marker in mapping.items()
}

def correct_singular(hfst_ana, sigm_tags):
    """Check whether the HFST analysis is compatible with the Unimorph tag SG.

    For nouns (first Unimorph tag is "N") the analysis must not contain
    "[Pl"; for everything else it must contain "Sg".
    """
    is_noun = sigm_tags[0] == "N"
    return ("[Pl" not in hfst_ana) if is_noun else ("Sg" in hfst_ana)
    
    
def correct_plural(hfst_ana, sigm_tags):
    """Check whether the HFST analysis is compatible with the Unimorph tag PL.

    For nouns (first Unimorph tag is "N") the analysis must contain the
    full tag "[Pl]"; for everything else the substring "Pl" is enough.
    """
    plural_marker = "[Pl]" if sigm_tags[0] == "N" else "Pl"
    return plural_marker in hfst_ana
    
    
def past_correct(hfst_ana, sigm_tags):
    """Check whether the HFST analysis is compatible with the Unimorph tag PST.

    Participles (first Unimorph tag is 'V.PTCP') only have to be free of
    'Fut'; every other form must contain '[Pst'.
    """
    if sigm_tags[0] != 'V.PTCP':
        return '[Pst' in hfst_ana
    return not ('Fut' in hfst_ana)


# Tags that need more than a plain "substring is present" test.
correct_func.update({
    'SG': correct_singular,
    'PL': correct_plural,
    # A Unimorph noun may be analyzed by HFST as a noun or as an adjective.
    'N': lambda hfst_ana, tags: any(pos in hfst_ana for pos in ('/N', '/Adj')),
    # Present tense is checked negatively: the analysis must not be past.
    'PRS': lambda hfst_ana, tags: not ('[Pst' in hfst_ana),
    'PST': past_correct,
    # Indicative is checked negatively: the analysis must not be subjunctive.
    'IND': lambda hfst_ana, tags: not ('Sbjv' in hfst_ana),
})

def disambig_ana(row, verbose=False):
    """Pick the first HFST analysis that agrees with every Unimorph tag.

    Parameters:
        row: a DataFrame row (or any mapping with a `copy` method) holding
            'tags', the ';'-separated Unimorph tags, and 'hfst', the
            iterable of HFST analysis strings.
        verbose: if True, print every candidate analysis together with the
            per-tag verdicts.

    Returns:
        A copy of `row` with 'correct_ana' set to the first analysis for
        which every tag's checker in `correct_func` returns a truthy value,
        or to None when there is no such analysis.

    Raises:
        KeyError: if a tag has no checker in `correct_func`.
    """
    # pandas does not support functions that mutate the object passed in by
    # `DataFrame.apply`, so all changes are made on a copy.
    row = row.copy()
    tags = row['tags'].split(';')
    matching = []
    for ana in row['hfst']:
        ok = {tag: correct_func[tag](ana, tags) for tag in tags}
        if verbose:
            print(ana, ok)
        if all(ok.values()):
            matching.append(ana)
    row['correct_ana'] = matching[0] if matching else None
    return row
   

In [8]:
# Row-wise apply: `disambig_ana` returns each row with a `correct_ana` entry
# (None when no analysis matches), which becomes a new column.
hun = hun.apply(disambig_ana, axis=1)
hun.head()

Unnamed: 0,lemma,word,tags,hfst,hfst_no_ana,pos,correct_ana
0,teknős,teknősökről,N;ON+ABL;PL,"[teknő[/N]s[_Nz:s/N]ök[Pl]ről[Del], teknős[/N]...",False,N,teknő[/N]s[_Nz:s/N]ök[Pl]ről[Del]
1,hideg,hidegekbe,N;IN+ALL;PL,"[hideg[/Adj]ek[Pl]be[Ill], hideg[/N]ek[Pl]be[I...",False,N,hideg[/Adj]ek[Pl]be[Ill]
2,erősebb,erősebben,N;IN+ESS;SG,"[erő[/N]s[_Adjz:s/Adj]ebb[_Comp/Adj]en[Ine], e...",False,N,erő[/N]s[_Adjz:s/Adj]ebb[_Comp/Adj]en[Ine]
3,kormányfő,kormányfőknek,N;DAT;PL,"[kormány[/N]fő[/N]k[Pl]nek[Dat], kormányfő[/N]...",False,N,kormány[/N]fő[/N]k[Pl]nek[Dat]
4,belga,belgáknál,N;IN+ALL;PL,[belga[/Adj|nat]k[Pl]nál[Ade]],False,N,


## Failed cases

HFST is able to analyze the word but I could not match it with Unimorph tags.

In [9]:
# Rows that HFST did analyze but where no analysis agreed with all Unimorph tags.
hfst_succeeded = hun['hfst_no_ana'].eq(False)
unmatched = hun['correct_ana'].isna()
no_correct = hun[unmatched & hfst_succeeded].copy()
print("Unable to match {} out of {}".format(len(no_correct), len(hun)))

Unable to match 968 out of 11000


In [10]:
# Preview the first rows that were analyzed by HFST but could not be matched.
no_correct.head()

Unnamed: 0,lemma,word,tags,hfst,hfst_no_ana,pos,correct_ana
4,belga,belgáknál,N;IN+ALL;PL,[belga[/Adj|nat]k[Pl]nál[Ade]],False,N,
44,földcsuszamlás,földcsuszamlásból,N;ON+ABL;SG,"[föld[/N]csuszamlik[/V]ás[_Ger/N]ból[Ela], föl...",False,N,
46,hagyomány,hagyományig,N;FRML;SG,[hagyomány[/N]ig[Ter]],False,N,
54,cédé,cédére,N;AT+ALL;SG,[cédé[/N|Acronx]re[Subl]],False,N,
57,panamakalap,panamakalapokig,N;FRML;PL,"[panama[/N]kalap[/N]ok[Pl]ig[Ter], panamakalap...",False,N,


### All of them are nouns

The single verb is actually an error by HFST.

In [11]:
# Part-of-speech distribution of the unmatched rows.
no_correct['pos'].value_counts()

N        967
V.CVB      1
Name: pos, dtype: int64

## Incorrect noun cases

I look for noun case mismatches.

In [12]:
# Reverse lookup: HFST case tag -> Unimorph case tag.
inv_case_mapping = dict(zip(noun_case_mapping.values(), noun_case_mapping.keys()))

# Nested counter filled by `noun_case_incorrect`:
# confusion_matrix[case found in HFST's output][case given by Unimorph] -> count.
confusion_matrix = defaultdict(lambda: defaultdict(int))

def noun_case_incorrect(row):
    """Flag noun rows for which no HFST analysis carries the expected case.

    Parameters:
        row: a DataFrame row (or any mapping with a `copy` method) holding
            'tags' (';'-separated Unimorph tags, with the case as second
            tag for nouns), 'hfst' (iterable of analysis strings) and
            'correct_ana' (the matched analysis or a missing value).

    Returns:
        A copy of `row` with the boolean 'noun_case_incorrect' added. It is
        False for non-nouns, for rows that already have a matched analysis
        and for rows where some analysis contains the expected case tag;
        otherwise True.

    Side effects:
        For every row flagged True, the first other case of
        `noun_case_mapping` found in any analysis is counted in the
        module-level `confusion_matrix`.
    """
    # pandas does not support functions that mutate the object passed in by
    # `DataFrame.apply`, so all changes are made on a copy.
    row = row.copy()
    tags = row['tags'].split(';')
    if tags[0] != 'N':
        row['noun_case_incorrect'] = False
        return row

    case = tags[1]
    hfst_case = noun_case_mapping[case]
    # `pd.notnull` treats NaN like None, matching the `.isnull()` filters
    # used elsewhere in the notebook.
    if pd.notnull(row['correct_ana']):
        assert hfst_case in row['correct_ana']
        row['noun_case_incorrect'] = False
        return row

    if any(hfst_case in ana for ana in row['hfst']):
        row['noun_case_incorrect'] = False
        return row

    # The expected case is absent everywhere: record which case HFST found instead.
    for other_case in noun_case_mapping.values():
        if other_case == hfst_case:
            continue
        if any(other_case in ana for ana in row['hfst']):
            confusion_matrix[inv_case_mapping[other_case]][case] += 1
            break
    row['noun_case_incorrect'] = True
    return row

# Adds the `noun_case_incorrect` column; as a side effect the calls also fill `confusion_matrix`.
hun = hun.apply(noun_case_incorrect, axis=1)

### Incorrect noun cases

Listing unanalyzed words as well.

In [13]:
# Nouns only: count rows per (case mismatch, HFST lookup failure) combination.
noun_rows = hun[hun['pos'] == 'N']
noun_rows.groupby(['noun_case_incorrect', 'hfst_no_ana']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
noun_case_incorrect,hfst_no_ana,Unnamed: 2_level_1
False,False,8105
True,False,885
True,True,172


### Confusion matrix

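It's clear that there is a misalignment: each Unimorph case (rows) is almost always analyzed by HFST as one particular other case (columns).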

In [14]:
# Outer keys of `confusion_matrix` (the case found in HFST's output) become columns,
# inner keys (the Unimorph case) become rows; combinations never seen are shown as 0.
pd.DataFrame(confusion_matrix).fillna(0).astype(int)

Unnamed: 0,ACC,AT+ALL,AT+ESS,DAT,IN+ABL,IN+ALL,IN+ESS,INST,NOM,ON+ABL,ON+ALL,ON+ESS,PRP,TERM,TRANS
ACC,0,0,0,0,0,0,0,0,60,0,0,0,0,0,0
AT+ABL,0,0,0,0,0,0,0,0,0,56,0,0,0,0,0
AT+ALL,0,0,0,0,0,0,0,0,0,0,59,0,0,0,0
AT+ESS,0,0,0,0,0,0,0,0,0,0,0,63,0,0,0
DAT,43,0,0,0,0,0,0,0,0,0,0,0,0,0,0
FRML,0,0,0,0,0,0,0,0,0,0,0,0,0,67,0
IN+ABL,0,64,0,0,0,0,0,0,0,0,0,0,0,0,0
IN+ALL,0,0,72,0,0,0,0,0,1,0,0,0,0,0,0
INST,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0
ON+ABL,0,0,0,0,62,0,0,0,0,0,0,0,0,0,0


## Incorrect number in nouns

In [15]:
def noun_number_incorrect(row):
    """Flag noun rows for which no HFST analysis has the expected number.

    Parameters:
        row: a DataFrame row (or any mapping with a `copy` method) holding
            'tags' (';'-separated Unimorph tags, with the number as third
            tag for nouns), 'hfst' (iterable of analysis strings) and
            'correct_ana' (the matched analysis or a missing value).

    Returns:
        A copy of `row` with the boolean 'noun_number_incorrect' added. It
        is False for non-nouns and for rows that already have a matched
        analysis; otherwise it is True exactly when no analysis passes
        `correct_singular` (for SG) or `correct_plural` (for PL). For any
        other number tag the entry is left unset.
    """
    # pandas does not support functions that mutate the object passed in by
    # `DataFrame.apply`, so all changes are made on a copy.
    row = row.copy()
    tags = row['tags'].split(';')
    if tags[0] != 'N':
        row['noun_number_incorrect'] = False
        return row

    number = tags[2]
    # `pd.notnull` treats NaN like None, matching the `.isnull()` filters
    # used elsewhere in the notebook.
    if pd.notnull(row['correct_ana']):
        row['noun_number_incorrect'] = False
        return row

    if number == "SG":
        number_ok = correct_singular
    elif number == "PL":
        number_ok = correct_plural
    else:
        # Unknown number tag: nothing to check.
        return row
    row['noun_number_incorrect'] = not any(number_ok(ana, tags) for ana in row['hfst'])
    return row

In [16]:
# Add the number-mismatch flag to every row, then compare flagged nouns with all nouns.
hun = hun.apply(noun_number_incorrect, axis=1)
nouns = hun[hun['pos'] == 'N']
(nouns['noun_number_incorrect'].sum(), len(nouns))

(188, 9162)

## Error co-occurrence matrix

In [17]:
# Nouns only: count rows per (number mismatch, case mismatch) combination.
noun_rows = hun[hun['pos'] == 'N']
noun_rows.groupby(['noun_number_incorrect', 'noun_case_incorrect']).size().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
noun_number_incorrect,noun_case_incorrect,Unnamed: 2_level_1
False,False,8023
False,True,951
True,False,82
True,True,106


## Other errors

These two errors cover all errors (excluding words not analyzed by HFST). The only one left is the incorrect verb.

In [18]:
# Analyzed rows that neither the case check nor the number check flags,
# yet that still have no matching analysis.
has_ana = hun[hun.hfst_no_ana == False]
has_ana[(has_ana.noun_number_incorrect == False) & (has_ana.noun_case_incorrect == False)
        & (has_ana.correct_ana.isnull())]

Unnamed: 0,lemma,word,tags,hfst,hfst_no_ana,pos,correct_ana,noun_case_incorrect,noun_number_incorrect
5139,távvezérel,távvezérelve,V.CVB,[táv[/N]vezér[/N]elv[/N]e[Poss.3Sg][Nom]],False,V.CVB,,False,False
