In [1]:
import json
import re
from datetime import datetime
from pathlib import Path
from pprint import pprint
from statistics import mean

import numpy as np
import pandas as pd
import torch.multiprocessing
from simpletransformers.t5 import T5Model
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1


def f1(truths, preds):
    """Return the mean `compute_f1` score over paired references and predictions.

    Args:
        truths: Iterable of reference strings.
        preds: Iterable of predicted strings, paired positionally with `truths`
            (pairing stops at the shorter of the two).

    Returns:
        The arithmetic mean of the per-pair scores.

    Raises:
        statistics.StatisticsError: If there are no pairs to average.
    """
    pair_scores = []
    for reference, prediction in zip(truths, preds):
        pair_scores.append(compute_f1(reference, prediction))
    return mean(pair_scores)


def exact(truths, preds):
    """Return the mean `compute_exact` score over paired references and predictions.

    Args:
        truths: Iterable of reference strings.
        preds: Iterable of predicted strings, paired positionally with `truths`
            (pairing stops at the shorter of the two).

    Returns:
        The arithmetic mean of the per-pair exact-match scores.

    Raises:
        statistics.StatisticsError: If there are no pairs to average.
    """
    pair_scores = []
    for reference, prediction in zip(truths, preds):
        pair_scores.append(compute_exact(reference, prediction))
    return mean(pair_scores)



In [2]:
def clean_text(string):
    """Normalise a line of text before it is fed to the model.

    Surrounding whitespace is trimmed; typographic quotes (“ ” « »), the
    sequence "//", commas, semicolons and full stops are deleted; every run of
    whitespace is collapsed into a single space; the result is lowercased.
    Question/exclamation marks and hyphens are left untouched.

    Args:
        string: The raw text.

    Returns:
        The cleaned, lowercased text.
    """
    removable = ("“", "”", "//", "«", "»", ",", ";", ".")
    text = string.strip()
    for fragment in removable:
        text = text.replace(fragment, "")
    # Collapse any run of whitespace (spaces, tabs, newlines) into one space.
    text = re.sub(r'(?is)\s+', ' ', text)
    return text.strip().lower()


def metric2binary(meter, pad=11):
    """Convert a metrical pattern into a fixed-length list of 0/1 flags.

    Each "+" symbol becomes 1 and any other symbol becomes 0. The result is
    right-padded with zeros, or truncated, so that it has exactly `pad` items.

    Args:
        meter: Iterable of pattern symbols, e.g. the string "+--+--+---+".
        pad: Length of the returned list (defaults to 11).

    Returns:
        A list of `pad` integers, each 0 or 1.
    """
    flags = [1 if syllable == "+" else 0 for syllable in meter]
    # Pad relative to `pad` (not a fixed 11) so any requested length is honoured;
    # a negative count simply adds nothing and the slice below truncates.
    flags += [0] * (pad - len(flags))
    return flags[:pad]


def label2metric(label):
    """Render a sequence of truthy/falsy flags as a "+"/"-" pattern string.

    Args:
        label: Iterable of flags; truthy items map to "+", falsy ones to "-".

    Returns:
        The pattern as a single string, one symbol per flag.
    """
    symbols = []
    for flag in label:
        symbols.append("+" if flag else "-")
    return "".join(symbols)


def label2indexed(label):
    """List the 1-based positions of the "+" symbols in a pattern.

    Args:
        label: Iterable of pattern symbols, e.g. the string "+--+".

    Returns:
        The positions joined with ", " (e.g. "1, 4"); an empty string when the
        pattern contains no "+".
    """
    positions = [
        str(position)
        for position, symbol in enumerate(label, start=1)
        if symbol == "+"
    ]
    return ", ".join(positions)


def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: Array of scores; the arg-max is taken along axis 1
            (assumes shape (n_samples, n_classes) — confirm with callers).
        labels: Array of gold class indices; flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold_classes = labels.flatten()
    hits = predicted_classes == gold_classes
    return np.sum(hits) / len(gold_classes)

In [3]:
def load_checked_lines(path, length=11):
    """Load the manually checked lines of a corpus file as a DataFrame.

    The JSON file is expected to provide the columns "line_text",
    "metrical_pattern" and "manually_checked". Only rows flagged as manually
    checked are kept; the text is normalised with `clean_text`, duplicated
    texts are dropped, and only lines whose metrical pattern has exactly
    `length` symbols are returned.

    Args:
        path: Path of the JSON corpus file.
        length: Required number of symbols in the metrical pattern.

    Returns:
        A new DataFrame with the columns "input_text", "meter" and "length".
    """
    # Open explicitly so the handle is closed again and the encoding is fixed.
    with open(path, encoding="utf-8") as handle:
        raw = pd.read_json(handle)
    lines = (raw
        .query("manually_checked == True")[["line_text", "metrical_pattern"]]
        .assign(
            line_text=lambda x: x["line_text"].apply(clean_text),
            length=lambda x: x["metrical_pattern"].str.len()
        )
        .drop_duplicates("line_text")
        .rename(columns={"line_text": "input_text", "metrical_pattern": "meter"})
    )
    # .copy() gives an independent frame, so adding columns later does not
    # trigger pandas' SettingWithCopyWarning.
    return lines[lines["length"] == length].copy()


# Test corpus: one binary column per position plus the full label list.
es_test = load_checked_lines("adso100.json")
pos_names = [f"pos{i}" for i in range(1, 12)]
pos_labels = es_test.meter.apply(metric2binary)
es_test["labels"] = pos_labels
es_test[pos_names] = pos_labels.tolist()

# Training corpus: same preparation, minus every line that is in the test set.
es = load_checked_lines("adso.json")
es = es[~es["input_text"].isin(es_test["input_text"])].copy()
pos_labels = es.meter.apply(metric2binary)
es["labels"] = pos_labels
es[pos_names] = pos_labels.tolist()



In [4]:
# Add the columns used for T5 training and prediction to both corpora:
# the task prefix, the prefixed prediction input, and the target string
# listing the 1-based positions of "+" in the meter.
for frame in (es, es_test):
    frame["prefix"] = "scansion"
    frame["predict"] = frame.input_text.apply(lambda x: f"scansion: {x}")
    frame["target_text"] = frame.meter.apply(label2indexed)

In [5]:
# Hold out a quarter of the training corpus for evaluation during training;
# the fixed random_state keeps the split reproducible.
t5_frame = es[["prefix", "input_text", "target_text"]]
es_train, es_eval = train_test_split(t5_frame, test_size=0.25, random_state=42)

In [7]:
# Arguments for fine-tuning, grouped by concern.
model_args = {
    # Output location (overwriting allowed) and input re-processing.
    "output_dir": "./bertsification-mt5-google-mt5-large",
    "overwrite_output_dir": True,
    "reprocess_input_data": True,

    # Sequence length and batch sizes.
    "max_seq_length": 32,
    "train_batch_size": 8,
    "eval_batch_size": 32,

    # Training schedule and evaluation during training.
    # NOTE(review): 10000 is larger than the 4100 total steps reported for the
    # 5-epoch run, so the step-based trigger presumably never fires — confirm.
    "num_train_epochs": 5,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 10000,
    "evaluate_during_training_verbose": True,

    # Runtime switches.
    "use_multiprocessing": False,
    "fp16": False,

    # Checkpoint-saving options.
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,

    #"wandb_project": "mT5 Scansion for Spanish",
}

# Build an mT5 model from the "google/mt5-large" checkpoint with the training
# arguments above, then fine-tune it on the training split while evaluating on
# the held-out split. `train_model` stays the last expression so that its
# return value is shown as the cell output.
model = T5Model("mt5", "google/mt5-large", args=model_args)
model.train_model(es_train, eval_data=es_eval)

HBox(children=(FloatProgress(value=0.0, max=6555.0), HTML(value='')))


Using Adafactor for T5


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=820.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=2185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=820.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=2185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=820.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=2185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=820.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=2185.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=820.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=2185.0), HTML(value='')))





(4100,
 {'global_step': [820, 1640, 2460, 3280, 4100],
  'eval_loss': [0.888180130633755,
   0.8088074032811151,
   0.8646798099296681,
   0.7684073059455209,
   0.7771830144135848],
  'train_loss': [0.7398898005485535,
   1.0607621669769287,
   1.1390413045883179,
   0.5957548022270203,
   0.9672317504882812]})

5 Epochs
```
(4100,
 {'global_step': [820, 1640, 2460, 3280, 4100],
  'eval_loss': [0.888180130633755,
   0.8088074032811151,
   0.8646798099296681,
   0.7684073059455209,
   0.7771830144135848],
  'train_loss': [0.7398898005485535,
   1.0607621669769287,
   1.1390413045883179,
   0.5957548022270203,
   0.9672317504882812]})
```

1 Epoch
```
(820,
 {'global_step': [820],
  'eval_loss': [0.9217365530953892],
  'train_loss': [0.720869243144989]})
```

---

In [6]:
# Arguments for prediction with the fine-tuned model, grouped by concern.
model_args = {
    # Sequence lengths and batching.
    "max_seq_length": 32,
    "max_length": 50,
    "eval_batch_size": 8,

    # Runtime switches.
    "use_multiprocessing": False,
    "num_workers": 1,
    "overwrite_output_dir": True,
    "num_train_epochs": 1,

    # Decoding: sampling (top-k / top-p) with no beam count, returning three
    # candidates per input.
    # NOTE(review): sampling presumably makes predictions vary between runs
    # unless a seed is fixed — confirm.
    "num_beams": None,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

In [7]:
# Load the trained model
# The path is the directory the training arguments use as "output_dir".
# `model_args` is presumably the prediction-time dictionary defined in the
# preceding cell — confirm the cells were run in that order.
model = T5Model("mt5", "./bertsification-mt5-google-mt5-large", args=model_args)

In [8]:
# Plain Python lists of the test set: prefixed model inputs, reference
# targets, and task names (same order, one entry per test line).
to_predict, truth, tasks = (
    es_test[column].tolist() for column in ("predict", "target_text", "prefix")
)

In [17]:
# `torch.multiprocessing` is imported in the notebook's import cell.
# Switch PyTorch's tensor-sharing strategy to the file system before predicting.
# NOTE(review): presumably a workaround for "too many open files" errors with
# the default file-descriptor strategy — confirm.
torch.multiprocessing.set_sharing_strategy('file_system')

# Get the model predictions
predictions = model.predict(to_predict)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=176.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=4203.0, style=ProgressStyle(descri…




In [18]:
# Saving the predictions if needed
preds_dir = Path("bertsification-mt5-google-mt5-large_preds")
# Create the output folder on first use instead of failing with FileNotFoundError.
preds_dir.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 so accented characters are written the same on every platform.
with open(preds_dir / f"predictions_{datetime.now()}.txt", "w", encoding="utf-8") as f:
    # One record per test line: the input text, its reference target, and every
    # candidate the model returned for it.
    for text, truth_value, candidates in zip(es_test["input_text"].tolist(), truth, predictions):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(truth_value + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

In [19]:
# Taking only the first prediction
preds = [pred[0] for pred in predictions]
es_test["predicted"] = preds

# Evaluating the tasks separately
output_dict = {
    "scansion": {"truth": [], "preds": []}
}

results_dict = {}

for task, truth_value, pred in zip(tasks, truth, preds):
    output_dict[task]["truth"].append(truth_value)
    output_dict[task]["preds"].append(pred)

print("-----------------------------------")
print("Results: ")
for task, outputs in output_dict.items():
    if task != "scansion":
        continue
    task_truth = outputs["truth"]
    task_preds = outputs["preds"]
    try:
        # Compute every metric once; the same values are stored and printed.
        scores = {
            "F1 Score": f1(task_truth, task_preds),
            "Exact matches": exact(task_truth, task_preds),
            "Accuracy Score": accuracy_score(task_truth, task_preds),
        }
    except Exception as error:
        # Keep the cell running, but report the failure instead of hiding it.
        print(f"Could not score {task}: {error!r}")
        continue
    results_dict[task] = scores
    print(f"Scores for {task}:")
    print(f"F1 score: {scores['F1 Score']}")
    print(f"Exact matches: {scores['Exact matches']}")
    print(f"Accuracy Score: {scores['Accuracy Score']}")
    print()

results_dir = Path("bertsification-mt5-google-mt5-large_preds")
# Create the output folder on first use instead of failing with FileNotFoundError.
results_dir.mkdir(parents=True, exist_ok=True)
with open(results_dir / f"result_{datetime.now()}.json", "w") as f:
    json.dump(results_dict, f)

-----------------------------------
Results: 
Scores for scansion:
F1 score: 0.6626517189686354
Exact matches: 0.03997144896502498
Accuracy Score: 0.03997144896502498



In [23]:
# Show the per-task scores collected by the evaluation cell as the cell output.
results_dict

{'scansion': {'F1 Score': 0.6626517189686354,
  'Exact matches': 0.03997144896502498,
  'Accuracy Score': 0.03997144896502498}}

In [25]:
# Keep every returned candidate per test line, then report how many lines have
# their reference target among those candidates, next to the number of lines.
es_test["all_predicted"] = predictions
reference_among_candidates = es_test.apply(
    lambda row: row["target_text"] in row["all_predicted"], axis=1
)
sum(reference_among_candidates), len(es_test)

(171, 1401)

In [24]:
# Share of test lines whose reference target appears among the candidates
# stored in "all_predicted".
reference_among_candidates = es_test.apply(
    lambda row: row["target_text"] in row["all_predicted"], axis=1
)
sum(reference_among_candidates) / len(es_test)

0.12205567451820129

In [26]:
# Share of test lines whose first stored candidate equals the reference target.
first_candidate_matches = es_test.apply(
    lambda row: row["target_text"] == row["all_predicted"][0], axis=1
)
sum(first_candidate_matches) / len(es_test)

0.03997144896502498