In [1]:
# general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import random
import os

# Seed both the stdlib and the NumPy global RNGs so that repeated runs of the
# notebook draw the same random numbers.
random.seed(42)
np.random.seed(42)

In [2]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (requires the Colab runtime).
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Project root on the mounted drive; every path read or written below is built from it.
baseDir = '/content/gdrive/My Drive/Colab Notebooks/AA'

In [4]:
# Load every document of every dataset and give each row a positional id
# (`row_index`) that the prediction files are joined on later.
# NOTE(review): the file is named *.zip but is read with compression='gzip' --
# confirm the archive really is gzip-compressed.
datasets = (
    pd.read_json(baseDir+'/data/AllDS.json.zip', orient='records', compression='gzip')
    .assign(row_index=lambda frame: np.arange(len(frame)))
    [['row_index','dataset','problem','language','set','filename','text','label']]
)

## Reading ensemble outputs

In [5]:
fields = ['dataset','problem','model','row_index','pred']

# Stack the prediction files of the four datasets into one long frame:
# first the regular `_predictions` files, then the `_predictions_parcial` ones,
# keeping only the columns listed in `fields`.
output_models = pd.concat([
    pd.read_csv(baseDir+'/ensemble/output_ensemble/'+f+suffix, compression='zip')[fields]
    for suffix in ['_predictions.csv.zip', '_predictions_parcial.csv.zip']
    for f in ['lyrics','socialaa','pan18_eval','pan18_train']
])
output_models.head()

Unnamed: 0,dataset,problem,model,row_index,pred
0,lyrics,problem00001,dymAA,4561,candidate00001
1,lyrics,problem00001,dymAA,4562,candidate00001
2,lyrics,problem00001,dymAA,4563,candidate00001
3,lyrics,problem00001,dymAA,4564,candidate00001
4,lyrics,problem00001,dymAA,4565,candidate00001


In [6]:
# Long -> wide: one row per document, one column per model holding its
# predicted author. The misspelled model name 'dymAA' is renamed to 'dynAA'.
output_pvt = (
    output_models
    .pivot(index=['row_index','dataset','problem'], columns='model', values='pred')
    .reset_index()
    .rename(columns={'dymAA':'dynAA'})
)
output_pvt

model,row_index,dataset,problem,char,dep,dist,dynAA,pos,tag,w2v,word
0,0,pan18_train,problem00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
1,1,pan18_train,problem00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
2,2,pan18_train,problem00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
3,3,pan18_train,problem00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
4,4,pan18_train,problem00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
...,...,...,...,...,...,...,...,...,...,...,...
30961,30961,socialaa,problem00032,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046
30962,30962,socialaa,problem00032,candidate00047,candidate00040,candidate00047,candidate00047,candidate00040,candidate00047,candidate00027,candidate00047
30963,30963,socialaa,problem00032,candidate00048,candidate00048,candidate00048,candidate00048,candidate00045,candidate00048,candidate00048,candidate00048
30964,30964,socialaa,problem00032,candidate00049,candidate00049,candidate00049,candidate00049,candidate00049,candidate00024,candidate00049,candidate00049


In [7]:
# Row counts of the long predictions, the pivoted predictions and the dataset table.
len(output_models), len(output_pvt),len(datasets)

(244899, 30966, 30966)

In [8]:
# Number of rows per dataset, split by the value of `set`.
datasets.pivot_table(index='dataset',columns='set',values='row_index', aggfunc='count' )

set,known,unknown
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
lyrics,3688,1637
pan18_eval,1750,1315
pan18_train,875,621
socialaa,20400,680


In [9]:
# Number of prediction rows per dataset (no filtering by model).
output_models[['dataset','row_index']].groupby('dataset').count()

Unnamed: 0_level_0,row_index
dataset,Unnamed: 1_level_1
lyrics,42600
pan18_eval,22570
pan18_train,11089
socialaa,168640


# Reading BERT output

In [10]:
# Load the BERT prediction file, keeping only the columns listed in `fields`.
bert_output = pd.read_csv(baseDir+'/BERT/output_bert_truncated/bert_predictions.csv.zip', compression='zip')[fields]

In [11]:
# Keep only the rows whose model is 'LR', pivot them to one row per document
# and expose that column under the name 'BERT'.
bert_lr_rows = bert_output[bert_output['model'] == 'LR']
output_bert = (
    bert_lr_rows
    .pivot(index=['row_index','dataset','problem'], columns='model', values='pred')
    .reset_index()
    .rename(columns={'LR':'BERT'})
)

# Reading CNN output

In [12]:
# Load the CNN predictions and pivot them to one row per document
# (one column per value of `model`).
cnn_long = pd.read_csv(baseDir+'/DL/output_dl/cnn_predictions.csv.zip', compression='zip')[fields]
cnn_output = (
    cnn_long
    .pivot(index=['row_index','dataset','problem'], columns='model', values='pred')
    .reset_index()
)

In [13]:
# Display the pivoted CNN predictions.
cnn_output

model,row_index,dataset,problem,CNN
0,0,pan18_train,problem00001,candidate00001
1,1,pan18_train,problem00001,candidate00001
2,2,pan18_train,problem00001,candidate00001
3,3,pan18_train,problem00001,candidate00001
4,4,pan18_train,problem00001,candidate00001
...,...,...,...,...
30961,30961,socialaa,problem00032,candidate00022
30962,30962,socialaa,problem00032,candidate00030
30963,30963,socialaa,problem00032,candidate00022
30964,30964,socialaa,problem00032,candidate00013


In [14]:
# Attach every model's prediction to the document table. All joins are left
# joins on `row_index`, so documents without a prediction keep NaN.
ensemble_cols = ['row_index','dynAA','char','dist','word','tag','pos','dep','w2v']
valds = datasets[['row_index','dataset','set','problem','language','label']]
for predictions in (
    output_pvt[ensemble_cols],
    output_bert[['row_index','BERT']],
    cnn_output[['row_index','CNN']],
):
    valds = valds.merge(predictions, on='row_index', how='left')

# Statistical test

In [15]:
# Evaluation subset: rows of the "unknown" set, leaving out the pan18_train dataset.
is_unknown = valds['set'] == 'unknown'
not_pan18_train = valds['dataset'] != 'pan18_train'
valds_unknown = valds[is_unknown & not_pan18_train]

In [16]:
from sklearn import metrics

In [17]:
# Display the evaluation subset (true label plus one prediction column per model).
valds_unknown

Unnamed: 0,row_index,dataset,set,problem,language,label,dynAA,char,dist,word,tag,pos,dep,w2v,BERT,CNN
1636,1636,pan18_eval,unknown,problem00001,en,candidate00020,candidate00020,candidate00020,candidate00020,candidate00020,candidate00020,candidate00020,candidate00001,candidate00020,candidate00013,candidate00016
1637,1637,pan18_eval,unknown,problem00001,en,candidate00003,candidate00005,candidate00005,candidate00016,candidate00016,candidate00005,candidate00017,candidate00012,candidate00001,candidate00016,candidate00001
1638,1638,pan18_eval,unknown,problem00001,en,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005
1639,1639,pan18_eval,unknown,problem00001,en,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001,candidate00001
1640,1640,pan18_eval,unknown,problem00001,en,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00005,candidate00001,candidate00014,candidate00005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30961,30961,socialaa,unknown,problem00032,en,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00046,candidate00021,candidate00022
30962,30962,socialaa,unknown,problem00032,en,candidate00047,candidate00047,candidate00047,candidate00047,candidate00047,candidate00047,candidate00040,candidate00040,candidate00027,candidate00012,candidate00030
30963,30963,socialaa,unknown,problem00032,en,candidate00048,candidate00048,candidate00048,candidate00048,candidate00048,candidate00048,candidate00045,candidate00048,candidate00048,candidate00048,candidate00022
30964,30964,socialaa,unknown,problem00032,en,candidate00049,candidate00049,candidate00049,candidate00049,candidate00049,candidate00024,candidate00049,candidate00049,candidate00049,candidate00049,candidate00013


In [18]:
models = ['dynAA','char','BERT','CNN']
raters = ['label'] + models

# Slice each dataset once; the original re-ran the same query twice for every
# (a, b, dataset) combination.
rows_by_dataset = {
    d: valds_unknown[valds_unknown['dataset'] == d]
    for d in valds_unknown['dataset'].unique()
}

# Cohen's kappa between every pair of "raters" (true label and each model),
# computed per dataset. The numeric prefix keeps rows/columns in `raters`
# order after pivoting, and pivot_table's default aggregation (mean) averages
# the per-dataset kappas into a single matrix.
kappa = pd.DataFrame([
    {
        'a': str(i)+'_'+a,
        'b': str(j)+'_'+b,
        'kappa': metrics.cohen_kappa_score(rows[a], rows[b]),
    }
    for i, a in enumerate(raters)
    for j, b in enumerate(raters)
    for rows in rows_by_dataset.values()
]).pivot_table(index='a', columns='b', values='kappa')


with open(baseDir+'/kappa_report.txt','w') as f:
    # Keep the index in the LaTeX output: it holds the row labels of the
    # agreement matrix, which index=False silently dropped.
    f.write(kappa.round(2).to_latex())
kappa.round(2)

b,0_label,1_dynAA,2_char,3_BERT,4_CNN
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_label,1.0,0.64,0.59,0.24,0.24
1_dynAA,0.64,1.0,0.84,0.25,0.28
2_char,0.59,0.84,1.0,0.24,0.29
3_BERT,0.24,0.25,0.24,1.0,0.14
4_CNN,0.24,0.28,0.29,0.14,1.0


In [19]:
import numpy as np;

In [20]:
# Repeated random sub-sampling: for every (dataset, problem) draw N_RUNS random
# subsets holding SAMPLE_FRACTION of the unknown documents (without
# replacement) and record the macro-F1 of each model on each subset.
N_RUNS = 100
SAMPLE_FRACTION = 0.7

performances = []
np.random.seed(42)  # re-seed so this cell gives the same samples when re-run on its own
for dataset in valds_unknown['dataset'].unique():
    dataset_rows = valds_unknown.query(f"dataset == '{dataset}'")
    for problem in dataset_rows['problem'].unique():
        instance = dataset_rows.query(f"problem == '{problem}' ")
        sample_size = int(len(instance)*SAMPLE_FRACTION)

        for i in range(N_RUNS):
            indexes = np.arange(len(instance))
            np.random.shuffle(indexes)
            indexes = indexes[0:sample_size]
            # Take the sampled rows once instead of once per model and column.
            sample = instance.iloc[indexes]

            performances.append({
                'dataset': dataset,
                'problem': problem,
                'run': i,
                **{
                    m: metrics.f1_score(sample['label'], sample[m], average='macro')
                    for m in models
                },
            })

In [35]:
from scipy import stats;
def _stats(x):
    result = {}

    for m in ['char', 'CNN','BERT']:
        result[m+'_mean'] = round(x[m].mean(),4)
    for m in ['char', 'CNN','BERT']:
        result[m+'_ttest'], result[m+'_pvalue'] = stats.ttest_rel(x['dynAA'], x[m]);
        result[m+'_ttest'] = round(result[m+'_ttest'],2)

        #result[m+'_std'] = x[m].std()
        #result[m+'_ks_normal_pvalue'] = stats.kstest(x[m], 'norm')[1]
        #result[m+'_ks_d'] = stats.kstest(x[m], 'norm')[0]
        #result[m+'_shapiro'] = round(stats.shapiro(x[m])[0],4)


    return pd.Series(result)

# Apply `_stats` to the recorded runs of every (dataset, problem) pair.
pd.DataFrame(performances).groupby(['dataset','problem']).apply(_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_mean,CNN_mean,BERT_mean,char_ttest,char_pvalue,CNN_ttest,CNN_pvalue,BERT_ttest,BERT_pvalue
dataset,problem,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lyrics,problem00001,0.6023,0.5224,0.4282,0.72,4.720796e-01,18.09,3.798753e-33,28.87,5.122074e-50
lyrics,problem00002,0.5225,0.2337,0.1540,26.44,1.162746e-46,100.02,2.947105e-101,103.85,7.366123e-103
lyrics,problem00003,0.3528,0.1639,0.1315,21.75,1.716257e-39,68.80,2.127610e-85,71.32,6.453799e-87
lyrics,problem00004,0.3587,0.0936,0.1050,11.60,3.785142e-20,108.50,1.005503e-104,98.30,1.610453e-100
lyrics,problem00005,0.3580,0.0699,0.0766,-0.99,3.239148e-01,129.55,2.701280e-112,129.69,2.427158e-112
...,...,...,...,...,...,...,...,...,...,...
socialaa,problem00028,0.6634,0.0372,0.2204,7.31,7.027657e-11,139.73,1.567606e-115,79.74,1.243375e-91
socialaa,problem00029,0.6415,0.0474,0.3319,38.52,2.116590e-61,148.27,4.544984e-118,95.83,1.952656e-99
socialaa,problem00030,0.4009,0.0156,0.1014,29.93,2.103419e-51,93.02,3.597472e-98,69.97,4.103611e-86
socialaa,problem00031,0.6167,0.0300,0.2626,28.71,8.518775e-50,156.80,1.826189e-120,87.51,1.405636e-95


In [36]:
import re
def statistics(x):
    """Describe one group of documents.

    Parameters
    ----------
    x : pandas.DataFrame
        Needs the columns 'set', 'label', 'filename' and 'text'.

    Returns
    -------
    pandas.Series
        ndocs    -- mean number of distinct filenames per label among the
                    "known" rows, truncated to an integer;
        nauthors -- number of distinct labels in the whole group;
        nchar    -- mean length of the "unknown" texts, rounded down to a
                    multiple of 10;
        nword    -- mean number of word tokens of the "unknown" texts (a text
                    without any token counts as 1), rounded up to a multiple
                    of 5.
    """
    known_rows = x[x['set'] == 'known']
    unknown_texts = x[x['set'] == 'unknown']['text']

    docs = int(known_rows.groupby('label')['filename'].nunique().mean())

    nchar = int(unknown_texts.map(len).mean())
    # Never count fewer than one word per text.
    nword = unknown_texts.map(lambda doc: max(len(re.findall(r'\b\w+\b', doc)), 1)).mean()

    nauthors = len(x['label'].unique())
    return pd.Series({
        'ndocs': docs,
        'nauthors': nauthors,
        'nchar': (nchar // 10) * 10,
        'nword': int(np.ceil(nword/5)*5),
    })

# One row of descriptive statistics per (dataset, problem, language) group.
metadata = datasets.groupby(['dataset','problem','language']).apply(statistics).reset_index()

In [37]:
perf = pd.DataFrame(performances)
# Attach the per-problem metadata (language, nauthors, ndocs, ...) to every
# recorded run; the join keys are spelled out instead of being inferred.
temp2 = perf.merge(metadata, on=['dataset','problem'])

with open(baseDir+'/statistical_test_report.txt','w') as f:
    # Overall comparison per dataset.
    f.write(perf.groupby(['dataset']).apply(_stats).round(2).to_latex())
    f.write("\n\n")

    # Same comparison broken down by each metadata variable, first within
    # each dataset and then across datasets. The index is kept in the LaTeX
    # output because it carries the group keys; with index=False the rows of
    # these tables could not be told apart.
    for v in ['language','nauthors','ndocs','nchar','nword']:
        f.write(temp2.groupby(['dataset',v]).apply(_stats).round(2).to_latex())
        f.write("\n\n")

        f.write(temp2.groupby([v]).apply(_stats).round(2).to_latex())
        f.write("\n\n")

# Other reports

In [41]:
from IPython.core.display import HTML

In [87]:
# Column name -> label used when tables are displayed and exported.
model_renames = {'CNN':'CNN','BERT':'BERT','dynAA':'DynAA',
                 'char':'Char','dist':'Dist','word':'Word',
                 'tag':'TAG','pos':'POS','dep':'DEP','w2v':'W2V',
                 'language':'Lang'}

In [96]:
def f1(x,m):
    """Macro-averaged F1 of prediction column `m` against `x['label']`.

    Returns 0 as soon as column `m` holds at least one missing value, i.e. a
    model that did not predict every row of `x` gets no credit.
    """
    has_missing_prediction = x[m].isna().any()
    if has_missing_prediction:
        return 0
    return metrics.f1_score(x['label'], x[m], average='macro')


In [99]:
def a():
    """Build the per-dataset macro-F1 report.

    For every dataset in `valds_unknown` the function displays, and appends to
    a LaTeX string, the following tables (scores are computed per problem
    with `f1` and then averaged):

    * an overview stacking the scores by number of authors and by language;
    * one table each for (language, nauthors), (language, nchar) and
      (language, ndocs).

    Returns
    -------
    str
        The LaTeX source of all tables, one section per dataset.
    """
    models = [ 'CNN','BERT','dynAA','char','dist','word','tag','pos','dep','w2v']

    temp = valds_unknown.merge(metadata)
    temp['language'] = temp['language'].str.upper()

    latex = ""

    def agg(x):
        # One macro-F1 per model for the rows of a single group.
        return pd.Series({m: f1(x, m) for m in models})

    def mean_by(rows, keys):
        # Score each problem separately, then average the problems sharing `keys`.
        # numeric_only=True leaves the string column `problem` out of the mean;
        # without it recent pandas versions raise a TypeError here.
        return rows.groupby(['problem'] + keys).apply(agg)\
                   .reset_index().groupby(keys).mean(numeric_only=True)

    for dataset in valds_unknown['dataset'].unique():
        display(HTML(f"<hr/><h1>{dataset}</h1>"))
        # '\\section' spells the backslash explicitly; the former '\s' was an
        # invalid escape sequence that only happened to produce the same text.
        latex += '\n\\section{Tables for '+dataset+'}\n'
        rows = temp.query(f'dataset == "{dataset}"')

        temp1 = mean_by(rows, ['nauthors']).reset_index().round(2).rename(columns={'nauthors':dataset})
        # Cast to str so the column can be stacked with the language codes below.
        temp1[dataset] = temp1[dataset].astype(str)
        temp2 = mean_by(rows, ['language']).reset_index().round(2).rename(columns={'language':dataset})
        overview = pd.concat([temp1,temp2]).rename(columns=model_renames)
        display(overview)
        latex += overview.to_latex(index=False)

        for t in [
            ['language','nauthors'],
            ['language','nchar'],
            ['language','ndocs'],
        ]:
            latex += "\n\n\n\n"
            temp2 = mean_by(rows, t).round(2)
            display(temp2)
            # The group keys live in the index; turn them into columns so they
            # are not lost when exporting with index=False.
            latex += temp2.reset_index().to_latex(index=False)

    return latex

# Build the report (this also displays every table) and save its LaTeX source.
report_path = baseDir+'/report_general_v2.txt'
with open(report_path,'w') as f:
    f.write(a())

Unnamed: 0,pan18_eval,CNN,BERT,DynAA,Char,Dist,Word,TAG,POS,DEP,W2V
0,5,0.43,0.3,0.71,0.63,0.57,0.56,0.55,0.54,0.43,0.23
1,10,0.36,0.27,0.76,0.65,0.56,0.58,0.5,0.47,0.43,0.33
2,15,0.36,0.18,0.7,0.67,0.52,0.51,0.42,0.37,0.32,0.24
3,20,0.25,0.17,0.7,0.63,0.5,0.49,0.38,0.37,0.3,0.2
0,EN,0.37,0.32,0.82,0.78,0.5,0.61,0.68,0.59,0.55,0.41
1,FR,0.31,0.18,0.7,0.64,0.5,0.53,0.54,0.56,0.4,0.14
2,IT,0.36,0.33,0.71,0.67,0.59,0.53,0.46,0.38,0.32,0.28
3,PL,0.32,0.1,0.47,0.44,0.34,0.37,0.0,0.0,0.0,0.21
4,SP,0.39,0.21,0.89,0.7,0.75,0.62,0.64,0.67,0.59,0.23


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nauthors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,5,0.6,0.4,0.92,0.83,0.54,0.64,0.92,0.88,0.71,0.43
EN,10,0.27,0.43,0.87,0.81,0.55,0.65,0.69,0.53,0.63,0.39
EN,15,0.31,0.25,0.74,0.73,0.49,0.63,0.63,0.5,0.43,0.46
EN,20,0.28,0.21,0.76,0.76,0.43,0.55,0.48,0.45,0.42,0.34
FR,5,0.34,0.34,0.71,0.71,0.57,0.65,0.61,0.6,0.35,0.13
FR,10,0.37,0.16,0.73,0.61,0.5,0.56,0.59,0.64,0.63,0.18
FR,15,0.43,0.11,0.68,0.68,0.47,0.46,0.44,0.46,0.3,0.12
FR,20,0.12,0.11,0.69,0.56,0.47,0.46,0.51,0.52,0.33,0.11
IT,5,0.27,0.13,0.58,0.54,0.49,0.45,0.42,0.32,0.23,0.2
IT,10,0.38,0.51,0.88,0.78,0.72,0.71,0.62,0.51,0.4,0.38


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nchar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,4430,0.31,0.25,0.74,0.73,0.49,0.63,0.63,0.5,0.43,0.46
EN,4440,0.28,0.21,0.76,0.76,0.43,0.55,0.48,0.45,0.42,0.34
EN,4470,0.27,0.43,0.87,0.81,0.55,0.65,0.69,0.53,0.63,0.39
EN,4660,0.6,0.4,0.92,0.83,0.54,0.64,0.92,0.88,0.71,0.43
FR,4310,0.34,0.34,0.71,0.71,0.57,0.65,0.61,0.6,0.35,0.13
FR,4390,0.37,0.16,0.73,0.61,0.5,0.56,0.59,0.64,0.63,0.18
FR,4410,0.12,0.11,0.69,0.56,0.47,0.46,0.51,0.52,0.33,0.11
FR,4420,0.43,0.11,0.68,0.68,0.47,0.46,0.44,0.46,0.3,0.12
IT,4700,0.27,0.13,0.58,0.54,0.49,0.45,0.42,0.32,0.23,0.2
IT,4740,0.42,0.32,0.71,0.68,0.59,0.48,0.41,0.34,0.34,0.28


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,ndocs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,7,0.37,0.32,0.82,0.78,0.5,0.61,0.68,0.59,0.55,0.41
FR,7,0.31,0.18,0.7,0.64,0.5,0.53,0.54,0.56,0.4,0.14
IT,7,0.36,0.33,0.71,0.67,0.59,0.53,0.46,0.38,0.32,0.28
PL,7,0.32,0.1,0.47,0.44,0.34,0.37,0.0,0.0,0.0,0.21
SP,7,0.39,0.21,0.89,0.7,0.75,0.62,0.64,0.67,0.59,0.23


Unnamed: 0,lyrics,CNN,BERT,DynAA,Char,Dist,Word,TAG,POS,DEP,W2V
0,5,0.41,0.43,0.6,0.6,0.37,0.55,0.47,0.38,0.34,0.53
1,10,0.22,0.2,0.58,0.56,0.35,0.49,0.39,0.3,0.29,0.39
2,15,0.17,0.16,0.47,0.42,0.21,0.33,0.28,0.23,0.21,0.33
3,20,0.13,0.15,0.43,0.42,0.17,0.36,0.29,0.19,0.17,0.29
4,25,0.1,0.09,0.36,0.37,0.16,0.27,0.21,0.19,0.14,0.21
0,EN,0.2,0.23,0.51,0.5,0.29,0.43,0.4,0.31,0.25,0.35
1,PT,0.22,0.18,0.46,0.45,0.22,0.37,0.25,0.21,0.21,0.35


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nauthors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,5,0.3,0.42,0.58,0.58,0.4,0.55,0.54,0.46,0.37,0.44
EN,10,0.21,0.25,0.58,0.59,0.39,0.52,0.5,0.38,0.32,0.41
EN,15,0.18,0.19,0.55,0.47,0.26,0.33,0.36,0.24,0.24,0.36
EN,20,0.17,0.2,0.48,0.48,0.2,0.43,0.37,0.24,0.19,0.29
EN,25,0.12,0.11,0.35,0.38,0.18,0.3,0.24,0.22,0.15,0.22
PT,5,0.52,0.43,0.61,0.62,0.35,0.55,0.39,0.29,0.3,0.63
PT,10,0.23,0.16,0.57,0.53,0.3,0.46,0.29,0.22,0.27,0.36
PT,15,0.16,0.13,0.39,0.36,0.15,0.32,0.21,0.22,0.19,0.3
PT,20,0.09,0.11,0.38,0.36,0.14,0.29,0.21,0.15,0.16,0.28
PT,25,0.07,0.08,0.36,0.36,0.14,0.24,0.18,0.16,0.13,0.2


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nchar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,1280,0.3,0.42,0.58,0.58,0.4,0.55,0.54,0.46,0.37,0.44
EN,1340,0.12,0.11,0.35,0.38,0.18,0.3,0.24,0.22,0.15,0.22
EN,1430,0.19,0.22,0.53,0.53,0.3,0.48,0.44,0.31,0.25,0.35
EN,1450,0.18,0.19,0.55,0.47,0.26,0.33,0.36,0.24,0.24,0.36
PT,1050,0.16,0.13,0.39,0.36,0.15,0.32,0.21,0.22,0.19,0.3
PT,1060,0.52,0.43,0.61,0.62,0.35,0.55,0.39,0.29,0.3,0.63
PT,1080,0.23,0.16,0.57,0.53,0.3,0.46,0.29,0.22,0.27,0.36
PT,1120,0.09,0.11,0.38,0.36,0.14,0.29,0.21,0.15,0.16,0.28
PT,1170,0.07,0.08,0.36,0.36,0.14,0.24,0.18,0.16,0.13,0.2


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,ndocs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,23,0.12,0.11,0.35,0.38,0.18,0.3,0.24,0.22,0.15,0.22
EN,25,0.18,0.19,0.51,0.48,0.23,0.38,0.37,0.24,0.21,0.33
EN,26,0.21,0.25,0.58,0.59,0.39,0.52,0.5,0.38,0.32,0.41
EN,29,0.3,0.42,0.58,0.58,0.4,0.55,0.54,0.46,0.37,0.44
PT,23,0.12,0.11,0.38,0.36,0.15,0.28,0.19,0.19,0.16,0.25
PT,24,0.09,0.11,0.38,0.36,0.14,0.29,0.21,0.15,0.16,0.28
PT,25,0.52,0.43,0.61,0.62,0.35,0.55,0.39,0.29,0.3,0.63
PT,26,0.23,0.16,0.57,0.53,0.3,0.46,0.29,0.22,0.27,0.36


Unnamed: 0,socialaa,CNN,BERT,DynAA,Char,Dist,Word,TAG,POS,DEP,W2V
0,5,0.25,0.48,0.85,0.63,0.75,0.51,0.5,0.52,0.51,0.52
1,10,0.28,0.43,0.73,0.62,0.65,0.48,0.42,0.45,0.3,0.43
2,20,0.1,0.34,0.68,0.61,0.55,0.45,0.4,0.28,0.23,0.36
3,50,0.02,0.22,0.54,0.48,0.43,0.4,0.28,0.18,0.16,0.27
0,EN,0.16,0.36,0.7,0.58,0.6,0.46,0.4,0.36,0.3,0.4


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nauthors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,5,0.25,0.48,0.85,0.63,0.75,0.51,0.5,0.52,0.51,0.52
EN,10,0.28,0.43,0.73,0.62,0.65,0.48,0.42,0.45,0.3,0.43
EN,20,0.1,0.34,0.68,0.61,0.55,0.45,0.4,0.28,0.23,0.36
EN,50,0.02,0.22,0.54,0.48,0.43,0.4,0.28,0.18,0.16,0.27


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,nchar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,40,0.02,0.35,0.31,0.2,0.65,0.35,0.22,0.07,0.27,0.23
EN,50,0.05,0.29,0.52,0.15,0.33,0.12,0.08,0.11,0.21,0.42
EN,60,0.01,0.18,0.22,0.23,0.16,0.18,0.07,0.08,0.07,0.06
EN,70,0.03,0.12,0.3,0.24,0.25,0.13,0.08,0.07,0.12,0.08
EN,130,0.07,0.29,0.61,0.5,0.47,0.31,0.23,0.27,0.14,0.29
EN,140,0.04,0.24,0.38,0.27,0.26,0.2,0.2,0.03,0.15,0.1
EN,150,0.07,0.73,0.73,0.73,0.73,0.73,0.73,0.73,0.33,0.47
EN,170,0.28,0.23,1.0,0.73,0.73,1.0,0.33,0.47,0.47,0.73
EN,270,0.04,0.36,0.88,0.57,0.8,0.45,0.49,0.23,0.13,0.38
EN,280,0.12,0.38,0.82,0.78,0.66,0.65,0.4,0.42,0.33,0.5


Unnamed: 0_level_0,Unnamed: 1_level_0,CNN,BERT,dynAA,char,dist,word,tag,pos,dep,w2v
language,ndocs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EN,10,0.17,0.34,0.63,0.52,0.54,0.35,0.33,0.31,0.2,0.3
EN,50,0.15,0.39,0.77,0.65,0.65,0.57,0.47,0.41,0.4,0.49
