In [1]:
import numpy as np
import pandas as pd
import re

import sys
sys.path.append("..")
import utils.utils as utils
from data_preparation.data_preparation_pos import read_conll, ABSATokenizer

In [2]:
import logging
# Only records of level ERROR and above are emitted by the
# "transformers.tokenization_utils_base" logger from here on.
# NOTE(review): presumably this silences the tokenizer's warnings about over-long
# sequences, which is exactly what this notebook measures — confirm.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

### PoS

In [3]:
# Tokenizer used to measure example lengths in subwords. ABSATokenizer is the
# project class imported from data_preparation.data_preparation_pos.
model_name = "bert-base-multilingual-cased"
tokenizer = ABSATokenizer.from_pretrained(model_name)

In [4]:
def pos_examples_above_ntokens(info, stats, N, tokenizer):
    """Count the examples of one CoNLL file that are longer than each threshold.

    Parameters
    ----------
    info : dict
        Must provide the keys "lang_name", "file_path" and "dataset". The
        "dataset" value is used as the prefix of the generated statistic names.
    stats : dict
        Accumulator keyed by language name. It is updated in place and also
        returned, so repeated calls add new entries to the same language.
    N : iterable of int
        Length thresholds. An example is counted when its length is strictly
        greater than the threshold.
    tokenizer
        Object exposing ``subword_tokenize(tokens, tags)``. The length of the
        first element of its return value is used as the example length.

    Returns
    -------
    dict
        ``stats``, where ``stats[lang_name]`` gains, for every threshold ``n``,
        the entry ``"<dataset> >n"`` (number of examples above ``n``) and
        ``"<dataset> >n(%)"`` (the same as a percentage of the file's examples).
    """
    lang_name = info["lang_name"]
    file_path = info["file_path"]
    dataset = info["dataset"]

    # The first three sequences returned by read_conll are consumed in parallel
    # as (sentence id, tokens, tags); only tokens and tags are needed here.
    conllu_data = read_conll(file_path)
    # Built once: the lengths do not depend on the threshold, so converting them
    # to an array inside the loop below would be repeated work.
    lengths = np.array([
        len(tokenizer.subword_tokenize(tokens, tags)[0])
        for _sent_id, tokens, tags in zip(conllu_data[0], conllu_data[1], conllu_data[2])
    ])

    lang_stats = stats.setdefault(lang_name, {})
    for n in N:
        above = lengths > n
        label = dataset + " >" + str(n)
        lang_stats[label] = above.sum()
        lang_stats[label + "(%)"] = above.mean() * 100

    return stats

In [65]:
# Collect the PoS statistics for the thresholds 128 and 256, starting from an empty accumulator.
# NOTE(review): utils.run_through_data presumably calls pos_examples_above_ntokens once per
# data file found under ../data/ud/ and threads the accumulator through — confirm in utils.
pos_stats = utils.run_through_data("../data/ud/", pos_examples_above_ntokens, {}, N=(128, 256), tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [66]:
# One row per language, one column per collected statistic.
pos_above_length = pd.DataFrame.from_dict(pos_stats, orient="index")

# Column order: every "train" column first, then the remaining ones in their original order.
first = [col for col in pos_above_length.columns if "train" in col]
last = [col for col in pos_above_length.columns if "train" not in col]
pos_above_length = (
    pos_above_length[first + last]
    .reset_index()
    .rename(columns={"index": "language"})
)
pos_above_length = utils.order_table(pos_above_length)

# Count columns (a ">" in the name but no "%") are cast to the nullable integer dtype,
# which keeps missing values for languages that lack a split.
count_cols = [col for col in pos_above_length.columns if ">" in col and "%" not in col]
pos_above_length = pos_above_length.astype(dict.fromkeys(count_cols, pd.Int64Dtype()))

In [67]:
# Display the per-split counts and percentages.
pos_above_length

Unnamed: 0,language,train >128,train >128(%),train >256,train >256(%),dev >128,dev >128(%),dev >256,dev >256(%),test >128,test >128(%),test >256,test >256(%)
0,Bulgarian,0.0,0.0,0.0,0.0,1.0,0.089686,0.0,0.0,1,0.089606,0,0.0
1,English,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
2,Russian,28.0,0.727273,0.0,0.0,6.0,1.036269,1.0,0.172712,0,0.0,0,0.0
3,Slovak,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
4,Croatian,12.0,0.173561,1.0,0.014463,2.0,0.208333,0.0,0.0,0,0.0,0,0.0
5,Chinese,5.0,0.125094,0.0,0.0,1.0,0.2,0.0,0.0,0,0.0,0,0.0
6,Vietnamese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
7,Thai,,,,,,,,,28,2.8,0,0.0
8,Finnish,72.0,0.589343,12.0,0.098224,10.0,0.733138,1.0,0.073314,0,0.0,0,0.0
9,Basque,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0


Join train and dev to save space

In [8]:
# Per-language example counts; the "train_examples" and "dev_examples" columns are read from it below.
# NOTE(review): pos_basic_stats.xlsx must already exist in the working directory
# (presumably written by a companion notebook) — confirm.
example_lengths = pd.read_excel("pos_basic_stats.xlsx")

In [68]:
# Merge the train and dev statistics into "train+dev" columns, one pair per threshold.
# NOTE(review): the division aligns `total` with the table by row index, so it assumes the
# rows of pos_basic_stats.xlsx are in the same order as pos_above_length — confirm.
total = example_lengths["train_examples"] + example_lengths["dev_examples"]
for threshold in (128, 256):
    joint = "train+dev >{}".format(threshold)
    pos_above_length[joint] = (
        pos_above_length["train >{}".format(threshold)]
        + pos_above_length["dev >{}".format(threshold)]
    )
    pos_above_length[joint + "(%)"] = pos_above_length[joint] / total * 100

In [69]:
# Display the table again, now including the train+dev columns.
pos_above_length

Unnamed: 0,language,train >128,train >128(%),train >256,train >256(%),dev >128,dev >128(%),dev >256,dev >256(%),test >128,test >128(%),test >256,test >256(%),train+dev >128,train+dev >128(%),train+dev >256,train+dev >256(%)
0,Bulgarian,0.0,0.0,0.0,0.0,1.0,0.089686,0.0,0.0,1,0.089606,0,0.0,1.0,0.009978,0.0,0.0
1,English,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
2,Russian,28.0,0.727273,0.0,0.0,6.0,1.036269,1.0,0.172712,0,0.0,0,0.0,34.0,0.767668,1.0,0.022578
3,Slovak,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,Croatian,12.0,0.173561,1.0,0.014463,2.0,0.208333,0.0,0.0,0,0.0,0,0.0,14.0,0.1778,1.0,0.0127
5,Chinese,5.0,0.125094,0.0,0.0,1.0,0.2,0.0,0.0,0,0.0,0,0.0,6.0,0.133422,0.0,0.0
6,Vietnamese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
7,Thai,,,,,,,,,28,2.8,0,0.0,,,,
8,Finnish,72.0,0.589343,12.0,0.098224,10.0,0.733138,1.0,0.073314,0,0.0,0,0.0,82.0,0.603785,13.0,0.095722
9,Basque,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0


Export to Excel

In [70]:
# Written before the percentage columns are turned into strings for the LaTeX export,
# so the spreadsheet keeps numeric values. index=False leaves the row index out of the file.
pos_above_length.to_excel("pos_above_length.xlsx", index=False)

Export to LaTeX

In [71]:
# Render the percentage columns as strings with two decimals for the LaTeX table.
# The formatter leaves existing strings untouched, so re-running this cell on the
# already formatted table no longer raises ValueError, and it maps every missing
# value (NaN or pd.NA) to "-" instead of relying on the literal text "nan".
float_cols = [col for col in pos_above_length.columns if "%" in col]
pos_above_length[float_cols] = pos_above_length[float_cols].apply(
    lambda column: column.map(
        lambda x: x if isinstance(x, str) else ("-" if pd.isna(x) else "{:.2f}".format(x))
    )
)
# Missing counts in the remaining columns are shown as "-" as well.
pos_above_length = pos_above_length.replace(np.nan, "-")
pos_above_length = pos_above_length.replace("nan", "-")
pos_above_length

Unnamed: 0,language,train >128,train >128(%),train >256,train >256(%),dev >128,dev >128(%),dev >256,dev >256(%),test >128,test >128(%),test >256,test >256(%),train+dev >128,train+dev >128(%),train+dev >256,train+dev >256(%)
0,Bulgarian,0,0.00,0,0.00,1,0.09,0,0.00,1,0.09,0,0.0,1,0.01,0,0.00
1,English,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0.0,0,0.00,0,0.00
2,Russian,28,0.73,0,0.00,6,1.04,1,0.17,0,0.0,0,0.0,34,0.77,1,0.02
3,Slovak,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0.0,0,0.00,0,0.00
4,Croatian,12,0.17,1,0.01,2,0.21,0,0.00,0,0.0,0,0.0,14,0.18,1,0.01
5,Chinese,5,0.13,0,0.00,1,0.20,0,0.00,0,0.0,0,0.0,6,0.13,0,0.00
6,Vietnamese,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0.0,0,0.00,0,0.00
7,Thai,-,-,-,-,-,-,-,-,28,2.8,0,0.0,-,-,-,-
8,Finnish,72,0.59,12,0.10,10,0.73,1,0.07,0,0.0,0,0.0,82,0.60,13,0.10
9,Basque,0,0.00,0,0.00,0,0.00,0,0.00,0,0.0,0,0.0,0,0.00,0,0.00


In [75]:
# Column names are matched against a newline-joined listing of the header.
column_listing = "\n".join(pos_above_length.columns)
train_dev_cols = re.findall(r"train\+dev >\d+(?:\(%\))?", column_listing)
test_cols = re.findall(r"test.*", column_listing)

# Layout of the LaTeX table: language, then for each threshold the train+dev pair
# (count, %) followed by the test pair (count, %).
latex_columns = (
    ["language"]
    + train_dev_cols[:2] + test_cols[:2]
    + train_dev_cols[2:] + test_cols[2:]
)
utils.convert_table_to_latex(pos_above_length[latex_columns])

\fusional{Fusional} & Bulgarian & 1 & 0.01 & 1 & 0.09 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & English & 0 & 0.00 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Russian & 34 & 0.77 & 0 & 0.00 & 1 & 0.02 & 0 & 0.00\\
\fusional{Fusional} & Slovak & 0 & 0.00 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Croatian & 14 & 0.18 & 0 & 0.00 & 1 & 0.01 & 0 & 0.00\\
\isolating{Isolating} & Chinese & 6 & 0.13 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Vietnamese & 0 & 0.00 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Thai & - & - & 28 & 2.80 & - & - & 0 & 0.00\\
\agglutinative{Agglutinative} & Finnish & 82 & 0.60 & 0 & 0.00 & 13 & 0.10 & 0 & 0.00\\
\agglutinative{Agglutinative} & Basque & 0 & 0.00 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Japanese & 22 & 0.29 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Korean & 0 & 0.00 & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Turkish &

Unnamed: 0,group,language,train+dev >128,train+dev >128(%),test >128,test >128(%),train+dev >256,train+dev >256(%),test >256,test >256(%)
0,\fusional{Fusional},Bulgarian,1,0.01,1,0.09,0,0.00,0,0.0
1,\fusional{Fusional},English,0,0.00,0,0.0,0,0.00,0,0.0
2,\fusional{Fusional},Russian,34,0.77,0,0.0,1,0.02,0,0.0
3,\fusional{Fusional},Slovak,0,0.00,0,0.0,0,0.00,0,0.0
4,\fusional{Fusional},Croatian,14,0.18,0,0.0,1,0.01,0,0.0
5,\isolating{Isolating},Chinese,6,0.13,0,0.0,0,0.00,0,0.0
6,\isolating{Isolating},Vietnamese,0,0.00,0,0.0,0,0.00,0,0.0
7,\isolating{Isolating},Thai,-,-,28,2.8,-,-,0,0.0
8,\agglutinative{Agglutinative},Finnish,82,0.60,0,0.0,13,0.10,0,0.0
9,\agglutinative{Agglutinative},Basque,0,0.00,0,0.0,0,0.00,0,0.0


### Sentiment

In [3]:
# Re-creates the tokenizer with the same model name as in the PoS section;
# its encode() method is what measures the review lengths in this section.
model_name = "bert-base-multilingual-cased"
tokenizer = ABSATokenizer.from_pretrained(model_name)

In [6]:
def sentiment_examples_above_ntokens(info, stats, N, tokenizer):
    """Count the reviews of one CSV file that are longer than each threshold.

    Parameters
    ----------
    info : dict
        Must provide the keys "lang_name", "file_path" and "dataset". The
        "dataset" value is used as the prefix of the generated statistic names.
    stats : dict
        Accumulator keyed by language name. It is updated in place and also
        returned, so repeated calls add new entries to the same language.
    N : iterable of int
        Length thresholds. A review is counted when its length is strictly
        greater than the threshold.
    tokenizer
        Object exposing ``encode(text)``. The length of its return value is
        used as the review length.

    Returns
    -------
    dict
        ``stats``, where ``stats[lang_name]`` gains, for every threshold ``n``,
        the entry ``"<dataset> >n"`` (number of reviews above ``n``) and
        ``"<dataset> >n(%)"`` (the same as a percentage of the file's reviews).
    """
    lang_name = info["lang_name"]
    file_path = info["file_path"]
    dataset = info["dataset"]

    # The file is read without a header row and must have exactly two columns.
    data = pd.read_csv(file_path, header=None)
    data.columns = ["sentiment", "review"]
    # Built once: the lengths do not depend on the threshold, so converting them
    # to an array inside the loop below would be repeated work.
    lengths = np.array([len(tokenizer.encode(review)) for review in data["review"]])

    lang_stats = stats.setdefault(lang_name, {})
    for n in N:
        above = lengths > n
        label = dataset + " >" + str(n)
        lang_stats[label] = above.sum()
        lang_stats[label + "(%)"] = above.mean() * 100

    return stats

In [18]:
# Collect the sentiment statistics for the thresholds 256 and 512, starting from an empty accumulator.
# NOTE(review): utils.run_through_data presumably calls sentiment_examples_above_ntokens once per
# data file found under ../data/sentiment/ and threads the accumulator through — confirm in utils.
sentiment_stats = utils.run_through_data("../data/sentiment/", sentiment_examples_above_ntokens, 
                                         {}, N=(256, 512), tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))

../data/sentiment\ko\kosac-corpus-130808.csv is not a valid data path, skipping



In [19]:
# One row per language, one column per collected statistic.
sentiment_above_length = pd.DataFrame.from_dict(sentiment_stats, orient="index")

# Column order: every "train" column first, then the remaining ones in their original order.
first = [col for col in sentiment_above_length.columns if "train" in col]
last = [col for col in sentiment_above_length.columns if "train" not in col]
sentiment_above_length = (
    sentiment_above_length[first + last]
    .reset_index()
    .rename(columns={"index": "language"})
)
sentiment_above_length = utils.order_table(sentiment_above_length)

# Count columns (a ">" in the name but no "%") are cast to the nullable integer dtype.
count_cols = [col for col in sentiment_above_length.columns if ">" in col and "%" not in col]
sentiment_above_length = sentiment_above_length.astype(dict.fromkeys(count_cols, pd.Int64Dtype()))

In [20]:
# Display the per-split counts and percentages.
sentiment_above_length

Unnamed: 0,language,train >256,train >256(%),train >512,train >512(%),dev >256,dev >256(%),dev >512,dev >512(%),test >256,test >256(%),test >512,test >512(%)
0,Bulgarian,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,English,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,Russian,2559,87.100068,1718,58.475153,361,85.141509,249,58.726415,750,86.50519,496,57.208766
3,Slovak,20,0.537057,0,0.0,1,0.18797,0,0.0,9,0.845865,0,0.0
4,Croatian,17,1.128069,3,0.199071,1,0.46729,1,0.46729,1,0.228833,0,0.0
5,Chinese,381,1.974912,63,0.32656,49,1.777939,5,0.181422,106,1.922728,11,0.199528
6,Vietnamese,6,0.251678,0,0.0,3,0.906344,0,0.0,3,0.437956,1,0.145985
7,Thai,121,1.493274,17,0.209799,22,1.908066,5,0.433651,48,2.047782,8,0.341297
8,Finnish,81,5.97786,14,1.03321,13,6.532663,1,0.502513,26,6.549118,5,1.259446
9,Basque,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


Join train, dev and test to save space

In [21]:
# Per-language example counts; the "train_examples", "dev_examples" and "test_examples"
# columns are read from it below.
# NOTE(review): sentiment_basic_stats.xlsx must already exist in the working directory
# (presumably written by a companion notebook) — confirm.
example_lengths = pd.read_excel("sentiment_basic_stats.xlsx")

In [22]:
# Merge the train, dev and test statistics into overall columns, one pair per threshold.
# NOTE(review): the division aligns `total` with the table by row index, so it assumes the
# rows of sentiment_basic_stats.xlsx are in the same order as sentiment_above_length — confirm.
total = example_lengths["train_examples"] + example_lengths["dev_examples"] + example_lengths["test_examples"]
for threshold in (256, 512):
    overall = ">{}".format(threshold)
    sentiment_above_length[overall] = (
        sentiment_above_length["train " + overall]
        + sentiment_above_length["dev " + overall]
        + sentiment_above_length["test " + overall]
    )
    sentiment_above_length[overall + "(%)"] = sentiment_above_length[overall] / total * 100

In [23]:
# Display the table again, now including the overall columns.
sentiment_above_length

Unnamed: 0,language,train >256,train >256(%),train >512,train >512(%),dev >256,dev >256(%),dev >512,dev >512(%),test >256,test >256(%),test >512,test >512(%),>256,>256(%),>512,>512(%)
0,Bulgarian,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,English,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,Russian,2559,87.100068,1718,58.475153,361,85.141509,249,58.726415,750,86.50519,496,57.208766,3670,86.781745,2463,58.240719
3,Slovak,20,0.537057,0,0.0,1,0.18797,0,0.0,9,0.845865,0,0.0,30,0.56391,0,0.0
4,Croatian,17,1.128069,3,0.199071,1,0.46729,1,0.46729,1,0.228833,0,0.0,19,0.880445,4,0.185357
5,Chinese,381,1.974912,63,0.32656,49,1.777939,5,0.181422,106,1.922728,11,0.199528,536,1.944777,79,0.286637
6,Vietnamese,6,0.251678,0,0.0,3,0.906344,0,0.0,3,0.437956,1,0.145985,12,0.352941,1,0.029412
7,Thai,121,1.493274,17,0.209799,22,1.908066,5,0.433651,48,2.047782,8,0.341297,191,1.646552,30,0.258621
8,Finnish,81,5.97786,14,1.03321,13,6.532663,1,0.502513,26,6.549118,5,1.259446,120,6.150692,20,1.025115
9,Basque,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


Export to Excel

In [24]:
# Written before the percentage columns are turned into strings for the LaTeX export,
# so the spreadsheet keeps numeric values. index=False leaves the row index out of the file.
sentiment_above_length.to_excel("sentiment_above_length.xlsx", index=False)

Export to LaTeX

In [29]:
# Render the percentage columns as strings with two decimals for the LaTeX table.
# The formatter leaves existing strings untouched, so re-running this cell on the
# already formatted table no longer raises ValueError, and it maps every missing
# value (NaN or pd.NA) to "-" instead of relying on the literal text "nan".
float_cols = [col for col in sentiment_above_length.columns if "%" in col]
sentiment_above_length[float_cols] = sentiment_above_length[float_cols].apply(
    lambda column: column.map(
        lambda x: x if isinstance(x, str) else ("-" if pd.isna(x) else "{:.2f}".format(x))
    )
)
# Missing counts in the remaining columns are shown as "-" as well.
sentiment_above_length = sentiment_above_length.replace(np.nan, "-")
sentiment_above_length = sentiment_above_length.replace("nan", "-")
sentiment_above_length

Unnamed: 0,language,train >256,train >256(%),train >512,train >512(%),dev >256,dev >256(%),dev >512,dev >512(%),test >256,test >256(%),test >512,test >512(%),>256,>256(%),>512,>512(%)
0,Bulgarian,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,English,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,Russian,2559,87.1,1718,58.48,361,85.14,249,58.73,750,86.51,496,57.21,3670,86.78,2463,58.24
3,Slovak,20,0.54,0,0.0,1,0.19,0,0.0,9,0.85,0,0.0,30,0.56,0,0.0
4,Croatian,17,1.13,3,0.2,1,0.47,1,0.47,1,0.23,0,0.0,19,0.88,4,0.19
5,Chinese,381,1.97,63,0.33,49,1.78,5,0.18,106,1.92,11,0.2,536,1.94,79,0.29
6,Vietnamese,6,0.25,0,0.0,3,0.91,0,0.0,3,0.44,1,0.15,12,0.35,1,0.03
7,Thai,121,1.49,17,0.21,22,1.91,5,0.43,48,2.05,8,0.34,191,1.65,30,0.26
8,Finnish,81,5.98,14,1.03,13,6.53,1,0.5,26,6.55,5,1.26,120,6.15,20,1.03
9,Basque,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [30]:
# Only the overall (train+dev+test) columns go into the LaTeX table. They are selected
# by name rather than by position, so the export does not silently pick other columns
# if the table layout changes. The regex-based column lists from the PoS export are not
# recomputed here: this table has no "train+dev" columns and the lists were never used.
overall_cols = [">256", ">256(%)", ">512", ">512(%)"]
utils.convert_table_to_latex(sentiment_above_length[["language"] + overall_cols])

\fusional{Fusional} & Bulgarian & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & English & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & Russian & 3670 & 86.78 & 2463 & 58.24\\
\fusional{Fusional} & Slovak & 30 & 0.56 & 0 & 0.00\\
\fusional{Fusional} & Croatian & 19 & 0.88 & 4 & 0.19\\
\isolating{Isolating} & Chinese & 536 & 1.94 & 79 & 0.29\\
\isolating{Isolating} & Vietnamese & 12 & 0.35 & 1 & 0.03\\
\isolating{Isolating} & Thai & 191 & 1.65 & 30 & 0.26\\
\agglutinative{Agglutinative} & Finnish & 120 & 6.15 & 20 & 1.03\\
\agglutinative{Agglutinative} & Basque & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Japanese & 10613 & 81.64 & 4422 & 34.02\\
\agglutinative{Agglutinative} & Korean & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Turkish & 935 & 100.00 & 934 & 99.89\\
\introflexive{Introflexive} & Arabic & 6435 & 12.61 & 2182 & 4.27\\
\introflexive{Introflexive} & Hebrew & 219 & 1.76 & 63 & 0.51\\


Unnamed: 0,group,language,>256,>256(%),>512,>512(%)
0,\fusional{Fusional},Bulgarian,0,0.0,0,0.0
1,\fusional{Fusional},English,0,0.0,0,0.0
2,\fusional{Fusional},Russian,3670,86.78,2463,58.24
3,\fusional{Fusional},Slovak,30,0.56,0,0.0
4,\fusional{Fusional},Croatian,19,0.88,4,0.19
5,\isolating{Isolating},Chinese,536,1.94,79,0.29
6,\isolating{Isolating},Vietnamese,12,0.35,1,0.03
7,\isolating{Isolating},Thai,191,1.65,30,0.26
8,\agglutinative{Agglutinative},Finnish,120,6.15,20,1.03
9,\agglutinative{Agglutinative},Basque,0,0.0,0,0.0
