In [22]:
import pandas, numpy as np, seaborn as sns
from collections import Counter
import os, sys

In [23]:
def get_dict_of_df(df_dict, read_directory='../log_calculations/unified_outputs/', fname_to_read='filtered_df.csv'):
    """Load one CSV per model into a dict of DataFrames keyed by model name.

    Parameters
    ----------
    df_dict : mapping
        Maps an experiment name to an iterable of model names.
    read_directory : str
        Root directory that contains one sub-directory per experiment.
    fname_to_read : str
        File name read from each ``<read_directory>/<experiment>/<model>/`` folder.

    Returns
    -------
    dict
        ``{model_name: DataFrame}`` in iteration order of ``df_dict``. The
        experiment name is not part of the key, so a model listed under more
        than one experiment keeps only the frame read last.
    """
    return {
        model_name: pandas.read_csv(
            os.path.join(read_directory, experiment, model_name, fname_to_read)
        )
        for experiment, model_names in df_dict.items()
        for model_name in model_names
    }





In [39]:
# Experiment sub-directory -> model names whose filtered CSV is stored beneath it.
d = dict(
    multilingual=['bert-base-multilingual-cased', 'xlm-roberta-base'],
    filter_newline=['bert-large-cased', 'bert-base-cased', 'allenai/scibert_scivocab_cased'],
)

# One DataFrame per model, keyed by model name.
dfs = get_dict_of_df(d)

# Show one of the loaded frames as the cell output.
dfs['bert-base-cased']

Unnamed: 0.1,Unnamed: 0,bin,local_id,corpusid,abstract,tokens,num_unique_tokens,num_total_tokens,num_unk_tokens,langdetect
0,0,0,0,82212221,"The action of adenosine, muscle and yeast aden...","[101, 1109, 2168, 1104, 8050, 26601, 10606, 11...",39,128,0,en
1,1,0,1,31685335,Introduction. Since the observation by Koch 1 ...,"[101, 13813, 119, 1967, 1103, 8310, 1118, 1616...",41,128,0,en
2,2,0,2,37429768,The following paper is based on the results ob...,"[101, 1109, 1378, 2526, 1110, 1359, 1113, 1103...",49,128,0,en
3,4,0,4,71292698,If the oral treatment for syphilis were as eff...,"[101, 1409, 1103, 9619, 3252, 1111, 188, 1183,...",43,128,0,en
4,5,0,5,43937120,me by an invitation to contribute on the subje...,"[101, 1143, 1118, 1126, 8727, 1106, 8681, 1113...",44,128,0,en
...,...,...,...,...,...,...,...,...,...,...
8672,10995,99,995,256901121,Consider the integer best approximations of a ...,"[101, 25515, 1103, 18157, 1436, 22519, 1116, 1...",48,128,0,en
8673,10996,99,996,257171538,Abstract Salivary duct carcinoma (SDC) is a ra...,"[101, 138, 4832, 15017, 18613, 12416, 1616, 26...",45,128,0,en
8674,10997,99,997,256932495,Penelitian ini bertujuan untuk mengetahui stra...,"[101, 23544, 21091, 11969, 1107, 1182, 1129, 3...",46,128,0,id
8675,10998,99,998,257990226,Particulate matter in Computer Numerical Contr...,"[101, 4539, 26748, 2187, 1107, 6701, 151, 1544...",41,128,0,en


In [59]:
# Percent English per model. The value differs across models because we filter out
# rows with more than 2 pad values.
# XLM-R has a better vocabulary for multilingual abstracts, so their representation is
# shorter and more non-English rows are filtered out.

# First pass: print the full language histogram for each model, keeping each Counter
# so the percentage pass below does not have to rebuild it.
lang_counts = {}
for k, v in dfs.items():
    c = Counter(v['langdetect'])
    lang_counts[k] = c
    print(k, c, '', sep='\n')

# Second pass: one summary line per model with the share of rows detected as 'en'.
for k, v in dfs.items():
    c = lang_counts[k]
    english_pct = (c['en'] * 100) / c.total()
    print(f"{k:30s}:\t Percent English: {english_pct:5.2f}%")

bert-base-multilingual-cased
Counter({'en': 7479, 'id': 201, 'fr': 197, 'zh-cn': 188, 'es': 168, 'pt': 151, 'de': 94, 'ko': 71, 'tr': 47, 'ru': 35, 'ja': 28, 'fa': 19, 'pl': 11, 'uk': 11, 'nl': 10, 'it': 10, 'hr': 10, 'ar': 9, 'sl': 7, 'cs': 7, 'hu': 7, 'ca': 6, 'sv': 5, 'vi': 4, 'sk': 4, 'el': 3, 'sw': 1, 'zh-tw': 1, 'th': 1, 'lt': 1, 'ro': 1, 'no': 1, 'fi': 1, 'so': 1})

xlm-roberta-base
Counter({'en': 7611, 'fr': 199, 'id': 197, 'es': 169, 'pt': 150, 'zh-cn': 109, 'de': 92, 'ko': 68, 'tr': 47, 'ru': 34, 'ja': 27, 'fa': 18, 'nl': 10, 'it': 10, 'hr': 10, 'ar': 10, 'uk': 10, 'pl': 8, 'hu': 7, 'ca': 6, 'sv': 5, 'vi': 5, 'sl': 5, 'cs': 5, 'sk': 4, 'th': 1, 'lt': 1, 'el': 1, 'no': 1, 'fi': 1, 'so': 1, 'zh-tw': 1, 'et': 1})

bert-large-cased
Counter({'en': 7291, 'fr': 221, 'id': 205, 'es': 194, 'zh-cn': 188, 'pt': 158, 'de': 107, 'tr': 48, 'ru': 47, 'ko': 46, 'ja': 27, 'fa': 18, 'hr': 13, 'uk': 13, 'nl': 12, 'it': 12, 'pl': 12, 'cs': 11, 'ar': 10, 'ca': 7, 'sl': 7, 'hu': 7, 'sv': 5, 'vi': 

In [60]:
# For every model, report two views of how often tokens are unknown.
for k, v in dfs.items():
    # Corpus-level ratio: all unknown tokens over all tokens in the frame.
    unk_total = v['num_unk_tokens'].sum()
    tok_total = v['num_total_tokens'].sum()
    print(f"{k:50s} Ratio of unknown/total tokens (total): {unk_total / tok_total:10.2f}")

    # Row-level ratio, averaged only over rows with at least one unknown token.
    temp = v.loc[v['num_unk_tokens'] > 0]
    per_row_ratio = temp['num_unk_tokens'] / temp['num_total_tokens']
    print(f"{'':50s} Average ratio of unk/total: {per_row_ratio.mean():21.2f}\t(Across {len(temp):5d} rows which have [UNK] tokens)")
    print()

bert-base-multilingual-cased                       Ratio of unknown/total tokens (total):       0.00
                                                   Average ratio of unk/total:                  0.02	(Across  1629 rows which have [UNK] tokens)

xlm-roberta-base                                   Ratio of unknown/total tokens (total):       0.00
                                                   Average ratio of unk/total:                  0.01	(Across    21 rows which have [UNK] tokens)

bert-large-cased                                   Ratio of unknown/total tokens (total):       0.01
                                                   Average ratio of unk/total:                  0.28	(Across   398 rows which have [UNK] tokens)

bert-base-cased                                    Ratio of unknown/total tokens (total):       0.01
                                                   Average ratio of unk/total:                  0.28	(Across   398 rows which have [UNK] tokens)

allenai/scib