In [156]:
import pandas as pd
import os
import re
import numpy as np
from collections import defaultdict

%matplotlib inline

DATA_DIR = "../data/conll2018/task1/all"

In [53]:
column_names = ["lemma", "inflected", "msd"]

hun_df = pd.read_table(os.path.join(DATA_DIR, "hungarian-train-high"), names=column_names)
hun_df.describe()

Unnamed: 0,lemma,inflected,msd
count,10000,10000,10000
unique,7123,9954,93
top,hámoz,plural,N;ON+ESS;SG
freq,6,24,286


In [157]:
unimorph_schema = pd.read_csv('https://docs.google.com/spreadsheets/d/1yEqZk13qK_E8CjL1pqIu2k89pRrq503smEY85DFFYEI/export?format=csv')

In [158]:
unimorph_schema['Label'] = unimorph_schema['Label'].str.upper()
unimorph_schema['Alternatives'] = unimorph_schema['Alternatives'].str.upper()
# unimorph_schema = unimorph_schema.set_index('Label')
unimorph_schema.describe()

Unnamed: 0,Dimension,Feature,Label,Alternatives
count,247,247,247,7
unique,25,237,246,7
top,Case,Proximate,PROX,NEGATIVE
freq,42,3,2,1


In [161]:
unimorph_update = [
    {'Dimension': 'Case', 'Label': 'LOC'},
]

for u in unimorph_update:
    unimorph_schema = unimorph_schema.append(u, ignore_index=True)

In [195]:
known_tags = set(unimorph_schema['Alternatives'].dropna().str.split(',', expand=True).values.flatten())
known_tags.discard(None)
known_tags |= set(unimorph_schema['Label'])

tag_freqs = defaultdict(int)
missing_tag_freqs = defaultdict(int)
missing_per_lang = defaultdict(int)

languages = {fn.split("-")[0] for fn in os.listdir(DATA_DIR)}
size = ["high", "medium", "low"]

for fn in os.listdir(DATA_DIR):
    lang = fn.split("-")[0]
    if lang == 'basque':
        continue
    if not "train" in fn:
        continue
    if "low" in fn and "{}-train-medium".format(lang) in os.listdir(DATA_DIR):
        continue
    if "medium" in fn and "{}-train-high".format(lang) in os.listdir(DATA_DIR):
        continue
    df = pd.read_table(os.path.join(DATA_DIR, fn), names=column_names)
    for tags in df['msd']:
        for tag_ in tags.split(";"):
            for tag in re.split(r'[:+/]', tag_):
                if tag.startswith('LGSPEC'):
                    continue
                if tag in known_tags:
                    tag_freqs[tag] += 1
                else:
                    missing_per_lang[lang] += 1
                    missing_tag_freqs[tag] += 1

In [196]:
print(len(missing_tag_freqs), sum(missing_tag_freqs.values()))
sorted(missing_tag_freqs.items(), key=lambda x: -x[1])[:20]

106 54896


[('ARG3', 4115),
 ('3rd_PERSON_PLURAL', 3356),
 ('ARGDU', 3306),
 ('ARGPL', 3222),
 ('ARG1', 3077),
 ('PRES', 2911),
 ('ARGSG', 2646),
 ('INTERROGATIVE', 2019),
 ('FUTURE', 2014),
 ('ARG2', 1982),
 ('ARGEXCL', 1729),
 ('HABITUAL', 1242),
 ('BIASED', 929),
 ('(non)NOM', 780),
 ('CERTAIN', 777),
 ('ARGINCL', 680),
 ('SIMPLE', 627),
 ('NON_FACTIVE', 623),
 ('RELATIVE', 621),
 ('FREQ', 562)]

In [197]:
missing_per_lang

defaultdict(int,
            {'armenian': 329,
             'bengali': 83,
             'faroese': 248,
             'finnish': 57,
             'greek': 41,
             'haida': 15630,
             'hebrew': 413,
             'irish': 332,
             'khaling': 20757,
             'latin': 103,
             'latvian': 28,
             'lithuanian': 562,
             'livonian': 10,
             'murrinhpatha': 420,
             'navajo': 587,
             'old': 41,
             'pashto': 780,
             'quechua': 1865,
             'swahili': 2212,
             'turkish': 787,
             'yiddish': 1435,
             'zulu': 8176})

In [217]:
hun_df.head()

# hun_df.msd.str.split(';').apply(lambda x: x[0])

nouns = hun_df[hun_df.msd.str[0] == 'N']
noun_cases = set(nouns.msd.str.split(';').apply(lambda x: x[1]))
noun_number = set(['SG', 'PL'])
for n in nouns.msd:
    for tag in n.split(';')[1:]:
        if tag not in noun_cases and tag not in noun_number:
            print("Unknown tag", tag)
            

noun_lemmas = set(nouns['lemma'])
generated = []
for lemma in noun_lemmas:
    for case in noun_cases:
        for number in noun_number:
            msd = 'N;{};{}'.format(case, number)
            generated.append({'lemma': lemma, 'msd': msd})

hun_gen = pd.DataFrame(generated)
hun_gen['inflected'] = None

In [222]:
hun_gen = hun_gen.iloc[:, [0, 2, 1]]

In [224]:
hun_gen.to_csv("hun_generated", index=False, header=False, sep="\t")

In [121]:
def find_dimensions(fn):
    cooc = defaultdict(lambda: defaultdict(int))
    with open(fn) as f:
        for line in f:
            tags = line.strip().split("\t")[2].split(';')
            for i in range(len(tags)-1):
                for j in range(i+1, len(tags)):
                    cooc[tags[i]][tags[j]] += 1
                    cooc[tags[j]][tags[i]] += 1
    cooc_mtx = np.zeros((len(cooc), len(cooc)), dtype=np.int32)
    tag_mapping = {}
    for tag1, co in cooc.items():
        i = tag_mapping.setdefault(tag1, len(tag_mapping))
        for tag2, fr in co.items():
            j = tag_mapping.setdefault(tag2, len(tag_mapping))
            cooc_mtx[i, j] += fr
            cooc_mtx[j, i] += fr
    return cooc_mtx, tag_mapping

m, mapping = find_dimensions("../data/conll2018/task1/all/hungarian-train-medium")

In [135]:
inv_mapping = {v: k for k, v in mapping.items()}
for i, j in zip(*np.where(m == 0)):
    if i == j:
        continue
    print(inv_mapping[i], inv_mapping[j])

N V
N COND
N PRS
N DEF
N 1
N SBJV
N INDF
N 2
N 3
N IND
N PST
N NFIN
N V.PTCP
N FUT
ON+ABL IN+ALL
ON+ABL IN+ESS
ON+ABL DAT
ON+ABL TERM
ON+ABL INST
ON+ABL AT+ALL
ON+ABL FRML
ON+ABL ACC
ON+ABL AT+ESS
ON+ABL TRANS
ON+ABL IN+ABL
ON+ABL ON+ALL
ON+ABL AT+ABL
ON+ABL ON+ESS
ON+ABL PRP
ON+ABL NOM
ON+ABL V
ON+ABL COND
ON+ABL PRS
ON+ABL DEF
ON+ABL 1
ON+ABL SBJV
ON+ABL INDF
ON+ABL 2
ON+ABL 3
ON+ABL IND
ON+ABL PST
ON+ABL NFIN
ON+ABL V.PTCP
ON+ABL FUT
PL SG
PL V.PTCP
PL FUT
IN+ALL ON+ABL
IN+ALL IN+ESS
IN+ALL DAT
IN+ALL TERM
IN+ALL INST
IN+ALL AT+ALL
IN+ALL FRML
IN+ALL ACC
IN+ALL AT+ESS
IN+ALL TRANS
IN+ALL IN+ABL
IN+ALL ON+ALL
IN+ALL AT+ABL
IN+ALL ON+ESS
IN+ALL PRP
IN+ALL NOM
IN+ALL V
IN+ALL COND
IN+ALL PRS
IN+ALL DEF
IN+ALL 1
IN+ALL SBJV
IN+ALL INDF
IN+ALL 2
IN+ALL 3
IN+ALL IND
IN+ALL PST
IN+ALL NFIN
IN+ALL V.PTCP
IN+ALL FUT
IN+ESS ON+ABL
IN+ESS IN+ALL
IN+ESS DAT
IN+ESS TERM
IN+ESS INST
IN+ESS AT+ALL
IN+ESS FRML
IN+ESS ACC
IN+ESS AT+ESS
IN+ESS TRANS
IN+ESS IN+ABL
IN+ESS ON+ALL
IN+ESS AT+ABL
IN+ESS ON