In [2]:
from pathlib import Path

import conllu
import plotly.express as px
import pandas as pd
import numpy as np


porttinari_gold_sents = conllu.parse(open("../data/UD_Portuguese-Porttinari/pt_porttinari-ud-test.conllu", encoding="utf-8").read())
dante_gold_sents = conllu.parse(open("../data/UD_Portuguese-DANTE/pt_dante-ud-test.conllu", encoding="utf-8").read())
petrogold_gold_sents = conllu.parse(open("../data/UD_Portuguese-PetroGold/pt_petrogold-ud-test.conllu", encoding="utf-8").read())

models_preds = {}
for setting_name in Path("../multigenre/outputs").iterdir():
    full_setting_name = setting_name.stem.replace("porttinari", "Porttinari")
    full_setting_name = full_setting_name.replace("dante", "DANTEStocks")
    full_setting_name = full_setting_name.replace("petrogold", "PetroGold")
    full_setting_name = full_setting_name.replace("_", " ")
    for exp_id in setting_name.iterdir():
        data = {}
        print("Processing ", full_setting_name, exp_id.stem)
        for exp_filename in exp_id.glob("*pred.conllu"):
            if "dante" in exp_filename.stem:
                data["DANTEStocks"] = conllu.parse(open(exp_filename, encoding="utf-8").read())
            elif "petrogold" in exp_filename.stem:
                data["PetroGold"] = conllu.parse(open(exp_filename, encoding="utf-8").read())
            else:
                data["Porttinari"] = conllu.parse(open(exp_filename, encoding="utf-8").read())
        if full_setting_name not in models_preds.keys():
            models_preds[full_setting_name] = []    
        models_preds[full_setting_name].append( data )

Processing  DANTEStocks model-19fciu72-v0
Processing  DANTEStocks model-1b7kowrx-v0
Processing  DANTEStocks model-2akpbr05-v0
Processing  DANTEStocks model-2l1q1co8-v0
Processing  DANTEStocks model-2vl5c8as-v0
Processing  DANTEStocks model-66omv4vd-v0
Processing  DANTEStocks model-hfcre0cn-v0
Processing  DANTEStocks model-o0ojeh3x-v0
Processing  DANTEStocks model-sm3hnlkk-v0
Processing  DANTEStocks model-tv4yte30-v0
Processing  DANTEStocks PetroGold model-1vwxg7v2-v0
Processing  DANTEStocks PetroGold model-216u2b9d-v0
Processing  DANTEStocks PetroGold model-25rt674h-v0
Processing  DANTEStocks PetroGold model-2kyu5jev-v0
Processing  DANTEStocks PetroGold model-3evjf8e7-v0
Processing  DANTEStocks PetroGold model-3h6mbbp2-v0
Processing  DANTEStocks PetroGold model-3lesa1b2-v0
Processing  DANTEStocks PetroGold model-bnm5t8fw-v0
Processing  DANTEStocks PetroGold model-suxb490l-v0
Processing  DANTEStocks PetroGold model-vlmx70y6-v0
Processing  PetroGold model-19jxdvv7-v0
Processing  PetroGol

In [1]:
for model_name in models_preds.keys():
    assert len(models_preds[model_name]) == 10, f"Error on {model_name} with {len(models_preds[model_name])} files."
    for exp in models_preds[model_name]:
        assert len(exp.keys()) == 3, f"Error, data object has {len(exp.keys())} keys, it was expected 3 keys"

NameError: name 'models_preds' is not defined

In [3]:
from sklearn.metrics import accuracy_score


def get_tags(sents, min_tokens = 0, max_tokens = 60, get_n_sents = False):
    true_tags = []
    n_sents = 0
    for sent in sents:
        tags = []
        n_tokens = 0
        for token in sent:
            if isinstance(token["id"], int):
                tags.append(token["upos"])
                n_tokens += 1
        if n_tokens > min_tokens and n_tokens <= max_tokens:
            true_tags += tags
            n_sents += 1
    if get_n_sents:
        return true_tags, n_sents
    return true_tags

def get_acc_sent_divided(gold_sents, pred_sents, step_size = 10):
    acc_res = {
        "Grupo": [],
        "Acurácia": [],
        "Qtde de sentenças": [],
    }
    for i in range(6):
        min_tokens = i*step_size
        max_tokens = (i+1)*step_size
        true_tags, n_sents = get_tags(gold_sents, min_tokens=min_tokens, max_tokens=max_tokens, get_n_sents=True)
        pred_tags = get_tags(pred_sents, min_tokens=min_tokens, max_tokens=max_tokens)

        acc = accuracy_score(true_tags, pred_tags)
        acc_res["Grupo"].append(f"De {(i) * step_size + 1} a {(i+1) * step_size}")
        acc_res["Acurácia"].append(acc)
        acc_res["Qtde de sentenças"].append(n_sents)

    return acc_res

header = False
results = {
    "model_name": [],
    "De 1 a 10": [],
    "De 11 a 20": [],
    "De 21 a 30": [],
    "De 31 a 40": [],
    "De 41 a 50": [],
    "De 51 a 60": [],
}
for model_name in models_preds.keys():
    accs = []
    for i, experiment in enumerate(models_preds[model_name]):
        acc_df = get_acc_sent_divided(porttinari_gold_sents, experiment["Porttinari"])
        accs.append(acc_df["Acurácia"])
        groups = acc_df["Grupo"]
        groups_n_sents = acc_df["Qtde de sentenças"]
    
    if not header:
        header = True
        results["model_name"].append("Qtde de sentenças")
        for i in range(6):
            results[f"De {(i) * 10 + 1} a {(i+1) * 10}"].append(groups_n_sents[i])

    results["model_name"].append(model_name)
    mean, std = np.mean(accs, axis=0), np.std(accs, axis=0)
    for k in range(6):
        results[f"De {(k) * 10 + 1} a {(k+1) * 10}"].append(f"{100*mean[k]:.4f}  {100*std[k]:.4f}")

results_df = pd.DataFrame(results)

In [4]:
results_df

Unnamed: 0,model_name,De 1 a 10,De 11 a 20,De 21 a 30,De 31 a 40,De 41 a 50,De 51 a 60
0,Qtde de sentenças,137,926,388,206,8,3
1,DANTEStocks,95.0089 0.5974,96.7737 0.1716,96.3156 0.2386,96.7111 0.2663,95.1923 0.3072,96.1765 0.2941
2,DANTEStocks PetroGold,96.1525 0.2545,97.8724 0.0669,97.8159 0.0605,98.1994 0.0814,97.3077 0.4564,97.8235 0.2696
3,PetroGold,94.5124 0.2899,97.0070 0.1196,97.0865 0.0708,97.2539 0.1320,96.2363 0.7063,96.5882 0.4402
4,Porttinari,97.6950 0.2718,99.0847 0.0402,99.1689 0.0426,99.1406 0.0543,98.5714 0.3204,99.5294 0.3529
5,Porttinari DANTEStocks,97.5532 0.2068,99.0701 0.0509,99.2115 0.0374,99.0726 0.0416,98.1044 0.2868,99.0000 0.5294
6,Porttinari DANTEStocks PetroGold,97.8103 0.1775,99.0030 0.0441,99.1181 0.0655,99.0907 0.0542,98.1593 0.1259,98.7647 0.1765
7,Porttinari PetroGold,97.6507 0.1694,98.9485 0.0504,98.9936 0.0989,99.1032 0.0706,98.5989 0.1923,98.6471 0.6471


In [6]:
def get_tags(sents, min_tokens = 0, max_tokens = 60, get_n_sents = False):
    true_tags = []
    n_sents = 0
    for sent in sents:
        tags = []
        n_tokens = 0
        for token in sent:
            if isinstance(token["id"], int):
                tags.append(token["upos"])
                n_tokens += 1
        if n_tokens > min_tokens and n_tokens <= max_tokens:
            true_tags += tags
            n_sents += 1
    if get_n_sents:
        return true_tags, n_sents
    return true_tags

def get_acc_sent_divided(gold_sents, pred_sents, step_size = 10):
    acc_res = {
        "Grupo": [],
        "Acurácia": [],
        "Qtde de sentenças": [],
    }
    for i in range(6):
        min_tokens = i*step_size
        max_tokens = (i+1)*step_size
        true_tags, n_sents = get_tags(gold_sents, min_tokens=min_tokens, max_tokens=max_tokens, get_n_sents=True)
        pred_tags = get_tags(pred_sents, min_tokens=min_tokens, max_tokens=max_tokens)

        acc = accuracy_score(true_tags, pred_tags)
        acc_res["Grupo"].append(f"De {(i) * step_size + 1} a {(i+1) * step_size}")
        acc_res["Acurácia"].append(acc)
        acc_res["Qtde de sentenças"].append(n_sents)

    return acc_res

header = False
results = {
    "model_name": [],
    "De 1 a 10": [],
    "De 11 a 20": [],
    "De 21 a 30": [],
    "De 31 a 40": [],
    "De 41 a 50": [],
    "De 51 a 60": [],
}
for model_name in models_preds.keys():
    accs = []
    for i, experiment in enumerate(models_preds[model_name]):
        acc_df = get_acc_sent_divided(dante_gold_sents, experiment["DANTEStocks"])
        accs.append(acc_df["Acurácia"])
        groups = acc_df["Grupo"]
        groups_n_sents = acc_df["Qtde de sentenças"]
    
    if not header:
        header = True
        results["model_name"].append("Qtde de sentenças")
        for i in range(6):
            results[f"De {(i) * 10 + 1} a {(i+1) * 10}"].append(groups_n_sents[i])

    results["model_name"].append(model_name)
    mean, std = np.mean(accs, axis=0), np.std(accs, axis=0)
    for k in range(6):
        results[f"De {(k) * 10 + 1} a {(k+1) * 10}"].append(f"{100*mean[k]:.4f}  {100*std[k]:.4f}")

dante_results_df = pd.DataFrame(results)
dante_results_df

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dty

Unnamed: 0,model_name,De 1 a 10,De 11 a 20,De 21 a 30,De 31 a 40,De 41 a 50,De 51 a 60
0,Qtde de sentenças,122,313,296,66,4,0
1,DANTEStocks,96.6150 0.2935,97.9818 0.0847,98.1555 0.0497,97.8138 0.4648,100.0000 0.0000,nan nan
2,DANTEStocks PetroGold,96.8363 0.1656,97.9213 0.1009,98.1703 0.0571,97.8183 0.3558,100.0000 0.0000,nan nan
3,PetroGold,77.4558 0.8201,80.0505 0.6120,87.6777 0.4145,89.5938 0.4546,88.7135 1.3089,nan nan
4,Porttinari,80.2987 0.6382,84.3814 0.8658,89.0497 0.4964,89.5436 0.5222,88.8304 0.6105,nan nan
5,Porttinari DANTEStocks,96.6261 0.1731,98.0061 0.1458,98.1863 0.1039,97.1931 0.1418,99.3567 1.1534,nan nan
6,Porttinari DANTEStocks PetroGold,96.5044 0.2673,97.9213 0.0807,98.1662 0.1456,97.4806 0.5203,100.0000 0.0000,nan nan
7,Porttinari PetroGold,77.7544 0.8993,80.3653 0.4773,87.8758 0.3107,90.4929 0.3948,88.5965 1.0862,nan nan


In [12]:
def get_tags(sents, min_tokens = 0, max_tokens = 60, get_n_sents = False):
    true_tags = []
    n_sents = 0
    for sent in sents:
        tags = []
        n_tokens = 0
        for token in sent:
            if isinstance(token["id"], int):
                tags.append(token["upos"])
                n_tokens += 1
        if n_tokens > min_tokens and n_tokens <= max_tokens:
            true_tags += tags
            n_sents += 1
    if get_n_sents:
        return true_tags, n_sents
    return true_tags

def get_acc_sent_divided(gold_sents, pred_sents, step_size = 10):
    acc_res = {
        "Grupo": [],
        "Acurácia": [],
        "Qtde de sentenças": [],
    }
    for i in range(7):
        min_tokens = i*step_size
        max_tokens = (i+1)*step_size if i < 6 else 120
        true_tags, n_sents = get_tags(gold_sents, min_tokens=min_tokens, max_tokens=max_tokens, get_n_sents=True)
        pred_tags = get_tags(pred_sents, min_tokens=min_tokens, max_tokens=max_tokens)

        acc = accuracy_score(true_tags, pred_tags)
        acc_res["Grupo"].append(f"De {(i) * step_size + 1} a {max_tokens}")
        acc_res["Acurácia"].append(acc)
        acc_res["Qtde de sentenças"].append(n_sents)

    return acc_res

header = False
results = {
    "model_name": [],
    "De 1 a 10": [],
    "De 11 a 20": [],
    "De 21 a 30": [],
    "De 31 a 40": [],
    "De 41 a 50": [],
    "De 51 a 60": [],
    "De 61 a 120": [],
}
for model_name in models_preds.keys():
    accs = []
    for i, experiment in enumerate(models_preds[model_name]):
        acc_df = get_acc_sent_divided(petrogold_gold_sents, experiment["PetroGold"])
        accs.append(acc_df["Acurácia"])
        groups = acc_df["Grupo"]
        groups_n_sents = acc_df["Qtde de sentenças"]
    
    if not header:
        header = True
        results["model_name"].append("Qtde de sentenças")
        for i in range(7):
            max_tokens = (i+1)*10 if i < 6 else 120
            results[f"De {(i) * 10 + 1} a {max_tokens}"].append(groups_n_sents[i])

    results["model_name"].append(model_name)
    mean, std = np.mean(accs, axis=0), np.std(accs, axis=0)
    for k in range(7):
        max_tokens = (k+1)*10 if k < 6 else 120
        results[f"De {(k) * 10 + 1} a {max_tokens}"].append(f"{100*mean[k]:.4f}  {100*std[k]:.4f}")

petrogold_results_df = pd.DataFrame(results)
petrogold_results_df

Unnamed: 0,model_name,De 1 a 10,De 11 a 20,De 21 a 30,De 31 a 40,De 41 a 50,De 51 a 60,De 61 a 120
0,Qtde de sentenças,74,87,120,78,51,18,16
1,DANTEStocks,87.5414 0.6607,94.6368 0.1886,95.7039 0.2043,95.0988 0.1821,95.1605 0.2296,95.4051 0.4872,95.3061 0.2585
2,DANTEStocks PetroGold,92.8369 0.6348,98.7892 0.1624,99.4746 0.0478,98.9173 0.0899,99.3148 0.1023,99.1282 0.1468,98.8465 0.1316
3,PetroGold,92.9078 0.7697,98.9174 0.1381,99.3853 0.0516,99.0819 0.0577,99.1847 0.0694,99.1590 0.1281,98.8820 0.1267
4,Porttinari,87.8487 1.3043,95.8476 0.2796,97.0687 0.1106,96.8764 0.1845,96.6956 0.1059,97.4667 0.1838,96.9920 0.3753
5,Porttinari DANTEStocks,89.4090 0.4584,95.8405 0.3536,97.2637 0.1408,96.8544 0.1481,96.7910 0.1809,97.3846 0.2438,97.3469 0.1283
6,Porttinari DANTEStocks PetroGold,93.4988 0.7267,98.7251 0.1575,99.4878 0.0743,98.8808 0.1215,99.1500 0.0893,99.0769 0.1654,98.9352 0.1587
7,Porttinari PetroGold,92.8842 0.6470,98.5541 0.1594,99.4514 0.0664,98.9612 0.1182,99.0503 0.1793,99.1897 0.1410,99.0240 0.1316


In [11]:
results["De 61 a 120"]

[]

In [35]:
train_sents = conllu.parse(open("../data/UD_Portuguese-Porttinari/pt_porttinari-ud-train.conllu", encoding="utf-8").read())
vocab = set()
for sent in train_sents:
    for token in sent:
        if isinstance(token["id"], int):
            vocab.add(token["form"])
len(vocab)

17117

In [46]:
sents = models_preds["BERTimbau"][0]
min_tokens = 50
max_tokens = 60
get_n_sents = False
true_tags = []
n_sents = 0
sent_ids = []
preds = []
oov_cnt_total = 0
total_tokens = 0
for sent in sents:
    tags = []
    n_tokens = 0
    oov_cnt = 0
    for token in sent:
        if isinstance(token["id"], int):
            tags.append(token["upos"])
            if token["form"] not in vocab:
                oov_cnt += 1
            n_tokens += 1
            
    if n_tokens > min_tokens and n_tokens <= max_tokens:
        true_tags += tags
        n_sents += 1
        sent_ids.append(sent.metadata["sent_id"])
        preds.append(tags)
        total_tokens += n_tokens
        oov_cnt_total += oov_cnt
oov_cnt_total / total_tokens

0.07647058823529412

In [None]:
9,840425531914894,
8,155533399800598,
7,989209379539323,
7,122032486463974,
8,241758241758242
7,647058823529412

In [75]:
results = {
    "model_name": [],
    "1 a 10": [],
    "11 a 20": [],
    "21 a 30": [],
    "31 a 40": [],
    "41 a 50": [],
    "51 a 60": [],
}
for model_name in models_preds.keys(): 
    results["model_name"].append(model_name)
    total_accs = []
    for sents in models_preds[model_name]: # Para cada experimento
        accs = []
        for k in range(6): # Para cada intervalo de senteças (10 em 10)
            min_tokens = k * 10
            max_tokens = (k + 1) * 10
            get_n_sents = False
            total_pred_tags = []
            total_true_tags = []
            n_sents = 0
            sent_ids = []
            preds = []
            oov_cnt_total = 0
            total_tokens = 0
            max_size = 0
            for i, sent in enumerate(sents):
                pred_tags = []
                true_tags = []
                n_tokens = 0
                oov_cnt = 0
                for j, token in enumerate(sent):
                    if isinstance(token["id"], int):
                        if token["form"] not in vocab:
                            oov_cnt += 1
                            pred_tags.append(token["upos"])
                            true_tags.append(porttinari_gold_sents[i][j]["upos"])
                        n_tokens += 1
                        
                if n_tokens > min_tokens and n_tokens <= max_tokens:
                    total_pred_tags += pred_tags
                    total_true_tags += true_tags
                    n_sents += 1
                    preds.append(tags)
                    total_tokens += n_tokens
                    oov_cnt_total += oov_cnt
            curr_acc = accuracy_score(total_true_tags, total_pred_tags)
            accs.append(curr_acc)
        total_accs.append(accs)
    mean, std = np.mean(total_accs, axis=0), np.std(total_accs, axis=0)
    # print(mean, std)
    for k in range(6):
        results[f"{(k) * 10 + 1} a {(k+1) * 10}"].append(f"{100*mean[k]:.4f}  {100*std[k]:.4f}")
pd.DataFrame(results)

Unnamed: 0,model_name,1 a 10,11 a 20,21 a 30,31 a 40,41 a 50,51 a 60
0,BERTimbau,91.5315 0.8257,96.4548 0.2452,96.9740 0.3130,96.4717 0.3642,97.0000 3.7859,100.0000 0.0000
1,CNCSR,88.0180 0.7036,92.9340 0.4553,92.4286 0.7114,92.5341 0.3268,90.6667 2.4944,99.2308 2.3077
2,DeBERTa-v3,92.7928 0.9009,96.3733 0.2452,97.0390 0.3664,95.9649 0.3268,97.0000 2.7689,100.0000 0.0000
3,Meta-BiLSTM,88.6486 0.9187,94.5314 0.3166,94.8701 0.3863,95.0097 0.3285,97.0000 1.7951,100.0000 0.0000
4,Stanza,90.7207 0.9054,94.8492 0.2023,94.9351 0.4928,94.9318 0.4613,99.6667 1.0000,100.0000 0.0000
5,UDPipe,84.6847 1.0660,91.2795 0.4250,92.0130 0.5072,91.2281 0.5008,93.0000 1.0000,95.3846 3.7684
6,XLM-R,91.3514 1.6216,96.1369 0.1901,97.0000 0.2987,96.4133 0.2917,99.0000 2.1344,93.0769 2.3077


In [72]:
len(results["model_name"]), len(results["1 a 10"])

(7, 7)

In [50]:
total_true_tags[0]

{'id': 17,
 'form': 'afetações',
 'lemma': 'afetação',
 'upos': 'NOUN',
 'xpos': None,
 'feats': {'Gender': 'Fem', 'Number': 'Plur'},
 'head': None,
 'deprel': '_',
 'deps': None,
 'misc': None}

In [32]:
preds[6]

['DET', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT']

In [28]:
 results_df.groupby("model_name")["acc"].std()

model_name
BERTimbau      0.006829
CNCSR          0.006598
DeBERTa-v3     0.007161
Meta-BiLSTM    0.010007
Stanza         0.006968
UDPipe         0.005064
XLM-R          0.007054
Name: acc, dtype: float64

In [14]:
results_df.to_csv("accuracy_level_porttinari.csv", index=False)

In [3]:
porttinari_gold_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_porttinari-ud-test.conllu", encoding="utf-8").read())
porttinari_pred_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
dante_pred_sents = conllu.parse(open("../tmp/dante/model-1b7kowrx-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
petrogold_pred_sents = conllu.parse(open("../tmp/petrogold/model-2v9qux4u-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
multi_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())

porttinari_df = get_sent_errors(porttinari_gold_sents, porttinari_pred_sents)
dante_df = get_sent_errors(porttinari_gold_sents, dante_pred_sents)
petrogold_df = get_sent_errors(porttinari_gold_sents, petrogold_pred_sents)
multi_df = get_sent_errors(porttinari_gold_sents, multi_pred_sents)

porttinari_df["Modelo"] = "porttinari"
dante_df["Modelo"] = "dante"
petrogold_df["Modelo"] = "petrogold"
multi_df["Modelo"] = "Multigênero"


for name, df in zip([
    "Porttinari", "DANTE", "PetroGold", "Multigênero"
], [porttinari_df, dante_df, petrogold_df, multi_df]):
    display(name)
    display(df["errors"].value_counts()/df.shape[0])

final_df = porttinari_df.append(dante_df, ignore_index=True).append(petrogold_df, ignore_index=True).append(multi_df, ignore_index=True)

# new_df = 
fig = px.histogram(final_df, x="errors", barmode="group", color="Modelo", width=800, height=400)
fig.write_image("porttinari_errors.png", scale=2)
fig.show()

'Porttinari'

0    0.845324
1    0.132494
2    0.020384
3    0.001799
Name: errors, dtype: float64

'DANTE'

0     0.583333
1     0.281175
2     0.094125
3     0.027578
4     0.006595
5     0.004796
6     0.001199
8     0.000600
13    0.000600
Name: errors, dtype: float64

'PetroGold'

0    0.552158
1    0.314748
2    0.099520
3    0.022782
4    0.007194
5    0.003597
Name: errors, dtype: float64

'Multigênero'

0    0.832734
1    0.142686
2    0.023381
3    0.001199
Name: errors, dtype: float64

  final_df = porttinari_df.append(dante_df, ignore_index=True).append(petrogold_df, ignore_index=True).append(multi_df, ignore_index=True)


In [4]:
for sent_id in dante_df[dante_df["errors"] > 6]["sent_id"]:
    for true_sent in porttinari_gold_sents:
        if true_sent.metadata["sent_id"] == sent_id:
            for sent in dante_pred_sents:
                if sent.metadata["sent_id"] == sent_id:
                    print(sent.metadata["sent_id"])
                    for pred_token, gold_token in zip(sent, true_sent):
                        print("{}\t{}\t{}".format(pred_token["form"], gold_token["upos"], pred_token["upos"]))
    print()

FOLHA_DOC003071_SENT011
Em	ADP	ADP
um	NUM	DET
dos	_	_
de	ADP	ADP
os	DET	DET
momentos	NOUN	NOUN
dignos	ADJ	ADJ
de	ADP	ADP
menção	NOUN	NOUN
,	PUNCT	PUNCT
a	DET	DET
backing	X	NOUN
vocal	X	X
Whitney	PROPN	PROPN
se	PRON	PRON
esgoelou	VERB	VERB
ao	_	_
a	ADP	ADP
o	DET	DET
cantar	VERB	VERB
"	PUNCT	PUNCT
How	PROPN	INTJ
Come	PROPN	VERB
You	PROPN	PRON
Dont	PROPN	X
Call	PROPN	X
Me	PROPN	PRON
"	PUNCT	PUNCT
e	CCONJ	CCONJ
foi	AUX	AUX
ovacionada	VERB	VERB
.	PUNCT	PUNCT

FOLHA_DOC003084_SENT009
Abriu	VERB	VERB
com	ADP	ADP
"	PUNCT	PUNCT
I've	PROPN	SYM
Got	PROPN	X
You	PROPN	PRON
Under	PROPN	ADP
My	PROPN	DET
Skin	PROPN	PROPN
"	PUNCT	PUNCT
e	CCONJ	CCONJ
teve	VERB	VERB
ótimos	ADJ	ADJ
momentos	NOUN	NOUN
,	PUNCT	PUNCT
como	ADP	ADP
"	PUNCT	PUNCT
The	PROPN	DET
Lady	PROPN	NOUN
is	X	AUX
a	X	DET
Tramp	PROPN	NOUN
"	PUNCT	PUNCT
e	CCONJ	CCONJ
"	PUNCT	PUNCT
They	PROPN	X
Can't	PROPN	X
Take	PROPN	VERB
"	PUNCT	PUNCT
.	PUNCT	PUNCT



In [6]:
dante_gold_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_dante-ud-test.conllu", encoding="utf-8").read())
porttinari_pred_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_dante-ud-test_pred.conllu", encoding="utf-8").read())
# dante_gold_sents = conllu.parse(open("../tmp/dante/model-1b7kowrx-v0/pt_dante-ud-test.conllu", encoding="utf-8").read())
dante_pred_sents = conllu.parse(open("../tmp/dante/model-1b7kowrx-v0/pt_dante-ud-test_pred.conllu", encoding="utf-8").read())
# petrogold_gold_sents = conllu.parse(open("../tmp/petrogold/model-2v9qux4u-v0/pt_petrogold-ud-test.conllu", encoding="utf-8").read())
petrogold_pred_sents = conllu.parse(open("../tmp/petrogold/model-2v9qux4u-v0/pt_dante-ud-test_pred.conllu", encoding="utf-8").read())
multi_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_dante-ud-test_pred.conllu", encoding="utf-8").read())

porttinari_df = get_sent_errors(dante_gold_sents, porttinari_pred_sents)
dante_df = get_sent_errors(dante_gold_sents, dante_pred_sents)
petrogold_df = get_sent_errors(dante_gold_sents, petrogold_pred_sents)
multi_df = get_sent_errors(dante_gold_sents, multi_pred_sents)

porttinari_df["Modelo"] = "porttinari"
dante_df["Modelo"] = "dante"
petrogold_df["Modelo"] = "petrogold"
multi_df["Modelo"] = "Multigênero"
dante_final_df = porttinari_df.append(dante_df, ignore_index=True).append(petrogold_df, ignore_index=True).append(multi_df, ignore_index=True)

for name, df in zip([
    "Porttinari", "DANTE", "PetroGold", "Multigênero"
], [porttinari_df, dante_df, petrogold_df, multi_df]):
    display(name)
    display(df["errors"].value_counts()/df.shape[0])


# new_df = 
fig = px.histogram(dante_final_df, x="errors", barmode="group", color="Modelo", width=800, height=400)
fig.write_image("dante_errors.png", scale=2)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



'Porttinari'

1     0.273067
2     0.238155
3     0.167082
0     0.107232
4     0.098504
5     0.063591
6     0.019950
7     0.013716
8     0.008728
9     0.004988
10    0.002494
11    0.001247
12    0.001247
Name: errors, dtype: float64

'DANTE'

0    0.734414
1    0.189526
2    0.058603
3    0.012469
4    0.002494
5    0.001247
7    0.001247
Name: errors, dtype: float64

'PetroGold'

2     0.240648
1     0.185786
3     0.158354
4     0.120948
5     0.115960
0     0.062344
6     0.059850
7     0.031172
8     0.009975
9     0.008728
10    0.006234
Name: errors, dtype: float64

'Multigênero'

0    0.724439
1    0.194514
2    0.067332
4    0.006234
3    0.006234
8    0.001247
Name: errors, dtype: float64

In [9]:
petrogold_gold_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_petrogold-ud-test.conllu", encoding="utf-8").read())
porttinari_pred_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_petrogold-ud-test_pred.conllu", encoding="utf-8").read())
dante_pred_sents = conllu.parse(open("../tmp/dante/model-1b7kowrx-v0/pt_petrogold-ud-test_pred.conllu", encoding="utf-8").read())
petrogold_pred_sents = conllu.parse(open("../tmp/petrogold/model-2v9qux4u-v0/pt_petrogold-ud-test_pred.conllu", encoding="utf-8").read())
multi_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_petrogold-ud-test_pred.conllu", encoding="utf-8").read())

porttinari_df = get_sent_errors(petrogold_gold_sents, porttinari_pred_sents)
dante_df = get_sent_errors(petrogold_gold_sents, dante_pred_sents)
petrogold_df = get_sent_errors(petrogold_gold_sents, petrogold_pred_sents)
multi_df = get_sent_errors(petrogold_gold_sents, multi_pred_sents)

porttinari_df["Modelo"] = "porttinari"
dante_df["Modelo"] = "dante"
petrogold_df["Modelo"] = "petrogold"
multi_df["Modelo"] = "Multigênero"
petrogold_final_df = porttinari_df.append(dante_df, ignore_index=True).append(petrogold_df, ignore_index=True).append(multi_df, ignore_index=True)

for name, df in zip([
    "Porttinari", "DANTE", "PetroGold", "Multigênero"
], [porttinari_df, dante_df, petrogold_df, multi_df]):
    display(name)
    display(df["errors"].value_counts()/df.shape[0])


# new_df = 
fig = px.histogram(petrogold_final_df, x="errors", barmode="group", color="Modelo", width=800, height=400)
fig.write_image("petrogold_errors.png", scale=2)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



'Porttinari'

0     0.483146
1     0.269663
2     0.152809
3     0.056180
4     0.024719
5     0.011236
17    0.002247
Name: errors, dtype: float64

'DANTE'

0     0.366292
1     0.289888
2     0.188764
3     0.078652
4     0.035955
5     0.022472
7     0.011236
6     0.004494
17    0.002247
Name: errors, dtype: float64

'PetroGold'

0     0.802247
1     0.141573
2     0.042697
3     0.006742
4     0.004494
14    0.002247
Name: errors, dtype: float64

'Multigênero'

0     0.782022
1     0.150562
2     0.044944
3     0.015730
4     0.004494
13    0.002247
Name: errors, dtype: float64

In [36]:
petrogold_gold_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_porttinari-ud-test.conllu", encoding="utf-8").read())
porttinari_pred_sents = conllu.parse(open("../tmp/porttinari/model-31361v0k-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
dante_pred_sents = conllu.parse(open("../tmp/dante/model-1b7kowrx-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
petrogold_pred_sents = conllu.parse(open("../tmp/petrogold/model-2v9qux4u-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
multi_pred_sents = conllu.parse(open("../tmp/dante_petrogold/model-3h6mbbp2-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())

porttinari_df = get_sent_errors(petrogold_gold_sents, porttinari_pred_sents)
dante_df = get_sent_errors(petrogold_gold_sents, dante_pred_sents)
petrogold_df = get_sent_errors(petrogold_gold_sents, petrogold_pred_sents)
multi_df = get_sent_errors(petrogold_gold_sents, multi_pred_sents)

porttinari_df["Modelo"] = "porttinari"
dante_df["Modelo"] = "dante"
petrogold_df["Modelo"] = "petrogold"
multi_df["Modelo"] = "Multigênero"
petrogold_final_df = porttinari_df.append(dante_df, ignore_index=True).append(petrogold_df, ignore_index=True).append(multi_df, ignore_index=True)

for name, df in zip([
    "Porttinari", "DANTE", "PetroGold", "Multigênero"
], [porttinari_df, dante_df, petrogold_df, multi_df]):
    display(name)
    display(df["errors"].value_counts()/df.shape[0])


# new_df = 
px.histogram(petrogold_final_df, x="errors", barmode="group", color="Modelo")


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



'Porttinari'

0    0.845324
1    0.132494
2    0.020384
3    0.001799
Name: errors, dtype: float64

'DANTE'

0     0.583333
1     0.281175
2     0.094125
3     0.027578
4     0.006595
5     0.004796
6     0.001199
8     0.000600
13    0.000600
Name: errors, dtype: float64

'PetroGold'

0    0.552158
1    0.314748
2    0.099520
3    0.022782
4    0.007194
5    0.003597
Name: errors, dtype: float64

'Multigênero'

0    0.684652
1    0.233213
2    0.065947
3    0.013789
4    0.002398
Name: errors, dtype: float64

### Porttinari

In [3]:
porttinari_total_correct = porttinari_df[porttinari_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.10f}%".format(100*porttinari_total_correct/porttinari_df.shape[0]))
display(porttinari_df["errors"].value_counts() / porttinari_df.shape[0])
px.histogram(porttinari_df, x="errors")

Total percentage of totally correct sentences: 84.5323741007%


0    0.845324
1    0.132494
2    0.020384
3    0.001799
Name: errors, dtype: float64

### DANTE

In [36]:
dante_total_correct = dante_df[dante_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.4f}%".format(100*dante_total_correct/dante_df.shape[0]))
display(dante_df["errors"].value_counts() / dante_df.shape[0])
px.histogram(dante_df, x="errors")

Total percentage of totally correct sentences: 73.4414%


0    0.734414
1    0.189526
2    0.058603
3    0.012469
4    0.002494
5    0.001247
7    0.001247
Name: errors, dtype: float64

### PetroGold

In [37]:
petrogold_total_correct = petrogold_df[petrogold_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.4f}%".format(100*petrogold_total_correct/petrogold_df.shape[0]))
display(petrogold_df["errors"].value_counts() / petrogold_df.shape[0])
px.histogram(petrogold_df, x="errors")

Total percentage of totally correct sentences: 80.2247%


0     0.802247
1     0.141573
2     0.042697
3     0.006742
4     0.004494
14    0.002247
Name: errors, dtype: float64

## Repetindo análise com melhor modelo multigênero

In [38]:
porttinari_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_porttinari-ud-test_pred.conllu", encoding="utf-8").read())
dante_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_dante-ud-test_pred.conllu", encoding="utf-8").read())
petrogold_pred_sents = conllu.parse(open("../tmp/porttinari_dante_petrogold/model-3nr9mytc-v0/pt_petrogold-ud-test_pred.conllu", encoding="utf-8").read())

porttinari_df = get_sent_errors(porttinari_gold_sents, porttinari_pred_sents)
dante_df = get_sent_errors(dante_gold_sents, dante_pred_sents)
petrogold_df = get_sent_errors(petrogold_gold_sents, petrogold_pred_sents)

### Porttinari

In [39]:
porttinari_total_correct = porttinari_df[porttinari_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.10f}%".format(100*porttinari_total_correct/porttinari_df.shape[0]))
display(porttinari_df["errors"].value_counts() / porttinari_df.shape[0])
px.histogram(porttinari_df, x="errors")

Total percentage of totally correct sentences: 83.2733812950%


0    0.832734
1    0.142686
2    0.023381
3    0.001199
Name: errors, dtype: float64

### DANTE

In [40]:
dante_total_correct = dante_df[dante_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.4f}%".format(100*dante_total_correct/dante_df.shape[0]))
display(dante_df["errors"].value_counts() / dante_df.shape[0])
px.histogram(dante_df, x="errors")

Total percentage of totally correct sentences: 72.4439%


0    0.724439
1    0.194514
2    0.067332
4    0.006234
3    0.006234
8    0.001247
Name: errors, dtype: float64

### PetroGold

In [41]:
petrogold_total_correct = petrogold_df[petrogold_df["errors"] == 0].shape[0]
print("Total percentage of totally correct sentences: {:.4f}%".format(100*petrogold_total_correct/petrogold_df.shape[0]))
display(petrogold_df["errors"].value_counts() / petrogold_df.shape[0])
px.histogram(petrogold_df, x="errors")

Total percentage of totally correct sentences: 78.2022%


0     0.782022
1     0.150562
2     0.044944
3     0.015730
4     0.004494
13    0.002247
Name: errors, dtype: float64