In [39]:
import ast
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import orjson
import pandas
import seaborn as sns
from mosestokenizer import MosesTokenizer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# Register tqdm with pandas so that Series/DataFrame expose `progress_apply`,
# which the later cells use to show a progress bar during row-wise work.
tqdm.pandas()

import logging
# Suppress every log record of severity INFO and below, for all loggers.
logging.disable(logging.INFO)

In [25]:
def get_dict_of_df(df_dict, read_directory='../log_calculations/unified_outputs/', fname_to_read='filtered_df.csv'):
    """Load one data frame per model from an experiment directory tree.

    Each file is expected at ``<read_directory>/<experiment>/<model>/<fname_to_read>``
    and is parsed with ``pandas.read_csv`` using its default settings.

    Args:
        df_dict: Mapping of experiment name to an iterable of model names.
        read_directory: Root directory that contains the experiment folders.
        fname_to_read: Name of the file to read inside every model folder.

    Returns:
        A ``(frames, directories)`` tuple. ``frames`` maps model name to its
        DataFrame; ``directories`` lists the folder each file was read from,
        in the order the files were read.

    Note:
        Frames are keyed by model name only, so a model listed under several
        experiments keeps just the frame that was read last, while
        ``directories`` still receives one entry per read.
    """
    frames_by_model = {}
    source_dirs = []
    for experiment, model_names in df_dict.items():
        for model_name in model_names:
            model_dir = os.path.join(read_directory, experiment, model_name)
            source_dirs.append(model_dir)
            frames_by_model[model_name] = pandas.read_csv(os.path.join(model_dir, fname_to_read))
    return frames_by_model, source_dirs

In [26]:
# Experiment folder -> model identifiers whose frames should be loaded.
models = {
    'every_5_5000': [
        'allenai/scibert_scivocab_cased',
        'bert-base-cased',
        'roberta-base',
        'xlm-roberta-base',
    ],
}

# Load the per-model frames and remember the folder each one came from.
dfs, read_dirs = get_dict_of_df(
    df_dict=models,
    fname_to_read='filtered_df_with_source.tsv',
)

In [27]:
# Distinct `langdetect` values across all loaded frames; the first occurrence
# of each value is kept together with its original row index.
languages = pandas.concat(
    [frame['langdetect'] for frame in dfs.values()]
).drop_duplicates()
languages

0           en
19          de
206         fr
211         es
286         nl
638         id
966         af
1427        ca
1497        pt
1756        no
1896        fi
2083        it
2151        ja
2281        ru
2561        pl
3751        sv
4647        ko
5458        cy
8278        cs
9107        tr
9123     zh-cn
11499       vi
13825       hr
15169       sl
16519       et
20484       ar
22367       fa
23589       bg
24231       hu
24657       uk
29646       ro
30590    zh-tw
32546       da
34920       lv
37522       sk
43195       tl
45572       el
49076       lt
49187       mk
51561       th
56058       so
66114       he
68487       hi
76982       sw
96362       sq
84817       bn
Name: langdetect, dtype: object

In [50]:
# One MosesTokenizer per detected language code, keyed by that code.
tokenizers = {lang: MosesTokenizer(lang) for lang in languages}

In [None]:
# Add two derived columns to every loaded frame (the frames are modified in place):
#   abstract_input   - text obtained by decoding the stored token ids with the
#                      model's own tokenizer
#   num_moses_tokens - number of tokens the language-specific Moses tokenizer
#                      produces for that text
for model, df in dfs.items():
    tokenizer = AutoTokenizer.from_pretrained(model)
    # SECURITY: the 'tokens' cells are strings read from a file on disk. They were
    # previously passed to eval(), which would execute arbitrary code embedded in
    # the file. ast.literal_eval only accepts Python literals and raises
    # ValueError/SyntaxError on anything else.
    # The [1:-1] slice drops the first and last id before decoding
    # (presumably the special start/end tokens - TODO confirm).
    df['abstract_input'] = df['tokens'].progress_apply(
        lambda token_ids: tokenizer.decode(ast.literal_eval(token_ids)[1:-1])
    )
    # Row-wise: pick the tokenizer matching the row's detected language and
    # count the tokens it yields for the decoded text.
    df['num_moses_tokens'] = df[['langdetect', 'abstract_input']].progress_apply(
        lambda row: len(tokenizers[row.langdetect](row.abstract_input)),
        axis=1,
    )

In [60]:
# Write every augmented frame back into the folder it was loaded from.
# `dfs` and `read_dirs` are filled in the same order by get_dict_of_df, so zip
# pairs each frame with its own folder (this holds as long as model names are
# unique across experiments).
# The loop variable is called `out_dir` rather than `dir` so that the builtin
# dir() is not shadowed in the notebook namespace after this cell has run.
for (model, df), out_dir in zip(dfs.items(), read_dirs):
    print(out_dir)
    print(model)
    # NOTE(review): to_csv is called with its defaults, so the file is
    # comma-separated despite the .tsv extension and includes the index column.
    # Left unchanged because the input file is read the same way.
    df.to_csv(os.path.join(out_dir, 'filtered_df_with_moses_tokens.tsv'))

../log_calculations/unified_outputs/every_5_5000/allenai/scibert_scivocab_cased
allenai/scibert_scivocab_cased
../log_calculations/unified_outputs/every_5_5000/bert-base-cased
bert-base-cased
../log_calculations/unified_outputs/every_5_5000/roberta-base
roberta-base
../log_calculations/unified_outputs/every_5_5000/xlm-roberta-base
xlm-roberta-base
