In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append("..")
from data_preparation.data_preparation_pos import read_conll
import utils.utils as utils

### Multiword count per language and dataset

In [2]:
def count_multiwords(file_path, lang_name, lang_code, dataset, table):
    """Record the multiword-token count of one data file in ``table``.

    Every "_" entry in the tag lists returned by ``read_conll`` is counted
    as one multiword token. The absolute count is written to the ``dataset``
    column and its share of all tags (in percent) to ``dataset + " (%)"``.

    Args:
        file_path: Path handed to ``read_conll``.
        lang_name: Value identifying the row in the "language" column.
        lang_code: Unused here; kept so the signature stays unchanged for
            callers that pass it.
        dataset: Name of the count column (the notebook uses "train",
            "dev" and "test").
        table: DataFrame with a "language" column; modified in place.

    Returns:
        The same ``table`` object, with the row for ``lang_name`` updated
        or appended.
    """
    conllu_data = read_conll(file_path)
    # Index 2 is presumably the per-sentence list of POS tags -- confirm
    # against read_conll. It is walked exactly once, so any iterable works.
    multiwords = 0
    total_tags = 0
    for taglist in conllu_data[2]:
        multiwords += taglist.count("_")
        total_tags += len(taglist)

    # A file without any tags would otherwise raise ZeroDivisionError;
    # report 0 % for it instead of aborting the whole run.
    percentage = multiwords / total_tags * 100 if total_tags else 0.0

    stat_columns = [dataset, dataset + " (%)"]
    if lang_name in table["language"].values.tolist():
        # Language already has a row: fill in the columns of this dataset.
        table.loc[table["language"] == lang_name, stat_columns] = multiwords, percentage
    else:
        # First file of this language: append a row under the next integer label.
        table.loc[table.shape[0], ["language"] + stat_columns] = lang_name, multiwords, percentage
    return table

In [7]:
# Empty result table: one count column and one percentage column per split.
table = pd.DataFrame(
    {
        column: []
        for column in ["language", "train", "train (%)", "dev", "dev (%)", "test", "test (%)"]
    }
)
# Presumably run_through_data calls count_multiwords for every data file
# below the given directory and hands back the filled table -- confirm in utils.
table = utils.run_through_data("../data/ud/", count_multiwords, table)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




Export to Excel

In [8]:
# Order the rows, then store the counts with pandas' nullable integer dtype
# (Int64) so that they are integers even where a split has no value.
table = utils.order_table(table).astype(
    {count_column: pd.Int64Dtype() for count_column in ("train", "dev", "test")}
)
table.to_excel("multiword_stats.xlsx", index=False)

Export to LaTeX

In [9]:
# NOTE: this cell overwrites `table` with formatted strings, so it is not
# idempotent -- running it a second time without re-running the cells above
# fails, because "{:.2f}" cannot format the strings it produced.
percent_columns = ["train (%)", "dev (%)", "test (%)"]
table = table.round(dict.fromkeys(percent_columns, 2))  # Round
# Render every percentage with exactly two decimals. Column-wise Series.map
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
table[percent_columns] = table[percent_columns].apply(
    lambda column: column.map("{:.2f}".format)
)
# Missing values are shown as "-": NaN in the untouched columns, and the
# string "nan" that the formatting above produces for missing percentages.
table = table.replace(np.nan, "-")
table = table.replace("nan", "-")
table

Unnamed: 0,language,train,train (%),dev,dev (%),test,test (%)
0,Bulgarian,0,0.00,0,0.00,0,0.0
1,English,0,0.00,0,0.00,0,0.0
2,Russian,0,0.00,0,0.00,0,0.0
3,Slovak,0,0.00,0,0.00,0,0.0
4,Croatian,0,0.00,0,0.00,0,0.0
5,Chinese,0,0.00,0,0.00,0,0.0
6,Vietnamese,0,0.00,0,0.00,0,0.0
7,Thai,-,-,-,-,0,0.0
8,Finnish,199,0.12,18,0.10,6,0.04
9,Basque,0,0.00,0,0.00,0,0.0


In [10]:
# Judging by the saved output, this prints one LaTeX table row per language
# and returns the table with a `group` column prepended, which the notebook
# then displays -- confirm against utils.convert_table_to_latex.
utils.convert_table_to_latex(table)

\fusional{Fusional} & Bulgarian & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & English & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Russian & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Slovak & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Croatian & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Chinese & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Vietnamese & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Thai & - & - & - & - & 0 & 0.00\\
\agglutinative{Agglutinative} & Finnish & 199 & 0.12 & 18 & 0.10 & 6 & 0.04\\
\agglutinative{Agglutinative} & Basque & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Japanese & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Korean & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Turkish & 949 & 2.45 & 264 & 2.56 & 351 & 2.04\\
\introflexive{Introflexive} & Arabic & 30519 & 12.00 & 4022 & 11.74 & 0 & 0.00\\
\introflexive{

Unnamed: 0,group,language,train,train (%),dev,dev (%),test,test (%)
0,\fusional{Fusional},Bulgarian,0,0.00,0,0.00,0,0.0
1,\fusional{Fusional},English,0,0.00,0,0.00,0,0.0
2,\fusional{Fusional},Russian,0,0.00,0,0.00,0,0.0
3,\fusional{Fusional},Slovak,0,0.00,0,0.00,0,0.0
4,\fusional{Fusional},Croatian,0,0.00,0,0.00,0,0.0
5,\isolating{Isolating},Chinese,0,0.00,0,0.00,0,0.0
6,\isolating{Isolating},Vietnamese,0,0.00,0,0.00,0,0.0
7,\isolating{Isolating},Thai,-,-,-,-,0,0.0
8,\agglutinative{Agglutinative},Finnish,199,0.12,18,0.10,6,0.04
9,\agglutinative{Agglutinative},Basque,0,0.00,0,0.00,0,0.0


### Multiword example

In [32]:
# Tab-separated CoNLL-U fragment with a header row. The range line "1-2"
# holds the surface form "Ellei" and has "_" in every annotation column;
# lines 1 and 2 are the words "Jos" and "ei" it is split into.
example = """ID	FORM	LEMMA	UPOS	XPOS	FEATS	HEAD	DEPREL	DEPS	MISC
1-2	Ellei	_	_	_	_	_	_	_	_
1	Jos	jos	SCONJ	C	_	4	mark	4:mark	_
2	ei	ei	AUX	V	Number=Sing|Person=3|Polarity=Neg|VerbForm=Fin|Voice=Act	4	aux	4:aux	_"""

In [38]:
# Fixed-width layout with eight slots: five of width 8, one of width 60 and
# two of width 7. Each row has ten fields; str.format ignores the surplus
# positional arguments, so the last two columns (DEPS, MISC) are not printed.
row_template = "{:<8}" * 5 + "{:<60}" + "{:<7}" * 2
for row in example.split("\n"):
    fields = row.split("\t")
    print(row_template.format(*fields))

ID      FORM    LEMMA   UPOS    XPOS    FEATS                                                       HEAD   DEPREL 
1-2     Ellei   _       _       _       _                                                           _      _      
1       Jos     jos     SCONJ   C       _                                                           4      mark   
2       ei      ei      AUX     V       Number=Sing|Person=3|Polarity=Neg|VerbForm=Fin|Voice=Act    4      aux    
