In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

In [2]:
from control_simp.models.end_to_end import BartFinetuner
from control_simp.data.bart import BartDataModule

# Checkpoint under evaluation. The commented-out paths are the other trained
# runs; uncomment exactly one `model_loc` line to switch between them.
# model_loc = "/media/liam/data2/control_simp_ckps/3fn5qcza/checkpoints/epoch=0.ckpt" # end-to-end baseline 4class
# model_loc = "/media/liam/data2/control_simp_ckps/d68v4v5z/checkpoints/epoch=3-step=259528.ckpt" # end-to-end baseline 3class
# model_loc = "/media/liam/data2/control_simp_ckps/12s3nazz/checkpoints/epoch=6-step=166580.ckpt" # end-to-end control_tok 4class
model_loc = "/media/liam/data2/control_simp_ckps/25ldpr2h/checkpoints/epoch=8-step=170317.ckpt" # end-to-end control_tok 3class

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"

# strict=False is forwarded to the checkpoint loader; presumably it tolerates
# state-dict keys that do not match the current model definition -- TODO confirm.
# .eval() puts the module in inference mode (e.g. disables dropout).
model = BartFinetuner.load_from_checkpoint(model_loc, strict=False).to(device).eval()
# model = BartFinetuner()

In [10]:
# Sanity check of how the tokenizer treats the control tokens. In the recorded
# run, "<ssplit>" and "<dsplit>" are each encoded as a single id (50267 and
# 50268) rather than being broken into sub-word pieces.
for sample in ("Hey there fella.", "<ssplit>"):
    print(model.tokenizer(sample))

# Keep the "<ssplit>"-prefixed encoding so its ids can be mapped back to tokens below.
text = model.tokenizer("<ssplit> Hey there fella.")
print(text)

print(model.tokenizer("<dsplit> Hey there fella."))

# Round-trip: turn the ids back into readable token strings.
ssplit_ids = text["input_ids"]
print(model.ids_to_clean_text(ssplit_ids))

{'input_ids': [0, 11468, 89, 1064, 102, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [0, 50267, 2], 'attention_mask': [1, 1, 1]}
{'input_ids': [0, 50267, 13368, 89, 1064, 102, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [0, 50268, 13368, 89, 1064, 102, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
['<s>', '<ssplit>', 'Hey', 'there', 'fell', 'a', '.', '</s>']


In [3]:
# Batch size setting; no cell visible here reads it, so it is presumably used
# by code elsewhere -- verify before removing.
batch_size = 16

# Evaluation data. The file name suggests this is the validation split of
# experiment 1; later cells read its `complex`, `simple` and `label` columns.
test_file = "/media/liam/data2/discourse_data/simp_clf_data/gen/control_simp_valid_exp1.csv"
test_set = pd.read_csv(filepath_or_buffer=test_file)

In [4]:
from control_simp.models.end_to_end import run_generator

# Generation settings.
SAVE_PREDS = False   # when True, write the frame (with predictions) to preds.csv
MAX_SAMPLES = 1024   # only the first MAX_SAMPLES rows are generated for and evaluated
USE_CTRL = True      # forwarded to run_generator as ctrl_toks

# Take an explicit copy of the slice. Adding a column to a bare slice of the
# loaded frame is chained assignment and raises pandas' SettingWithCopyWarning;
# the copy makes the truncated frame an independent object.
test_set = test_set[:MAX_SAMPLES].copy()

# Store the generator output next to its source row in a new "pred" column.
test_set["pred"] = run_generator(model, test_set, ctrl_toks=USE_CTRL, max_samples=MAX_SAMPLES)

if SAVE_PREDS:
    # Everything in model_loc before "checkpoints" is the run directory
    # (including its trailing slash), so preds.csv lands beside the checkpoints folder.
    # index=False: do not write the row index as an extra CSV column.
    test_set.to_csv(f"{model_loc.split('checkpoints')[0]}preds.csv", index=False)
    

In [5]:
from control_simp.models.eval import run_evaluation
# Score the frame produced by the generation cell (it carries the "pred"
# column). The result is indexed by metric name; later cells read
# results["bleu"] and results["sari"], one score per example.
results = run_evaluation(
    test_set,
    tokenizer=model.tokenizer,
)

In [6]:
# Corpus-level summary: average the per-example scores for each metric.
mean_bleu = np.mean(results["bleu"])
mean_sari = np.mean(results["sari"])
print(f"BLEU: {mean_bleu}\nSARI: {mean_sari}")

BLEU: 73.37815211659724
SARI: 56.42390767826076


In [7]:
# Print a qualitative report for rows 100-199: label, source sentence, both
# scores, the model prediction and the reference simplification.
for i, row in test_set[100:200].iterrows():
    # Round-trip the reference through the tokenizer so its spacing and
    # punctuation are normalised the same way as the decoded prediction.
    reference_ids = model.tokenizer(row.simple)["input_ids"]
    reference = model.tokenizer.decode(reference_ids, skip_special_tokens=True)

    # `i` is the frame's index label; it is used directly to look up this
    # row's entry in the per-example score collections.
    report_lines = [
        f"{row.label}",
        f"{row.complex}",
        f"--> BLEU: {results['bleu'][i]}",
        f"--> SARI: {results['sari'][i]}",
        f"--> {row.pred}",
        f"-->{reference}",
        "-----",
    ]
    print("\n".join(report_lines))

3
Michigan Governor Rick Snyder signed a lease on October 1 , 2013 to lease the park from the city for 30 years ; while the City Council rejected that offer in mid-October , the Michigan Emergency Loan Board opted for the State 's proposal on November 12 , 2013 .
--> BLEU: 58.46928673110214
--> SARI: 67.89949172990109
--> Michigan Governor Rick Snyder signed a lease on October 1, 2013 to lease the park from the city for 30 years. While the City Council rejected that offer in mid-October, the Michigan Emergency Loan Board opted for the State's proposal on November 12, 2013.
--> Michigan Governor Rick Snyder signed a lease on October 1, 2013 to lease the park from the city for 30 years, while the City Council rejected that offer in mid-October, proposing for a 10 - year lease instead. Having to choose one of the proposals, the Michigan Emergency Loan Board opted for the State's 30 - year proposal on November 12, 2013.
-----
2
During his work with the NFL , he developed the instant replay

1
The name of the package must be unique in a project.
--> BLEU: 27.483906537871352
--> SARI: 30.09439634439634
--> The package name must be unique in a project.
--> The name of the group must be unique in the project.
-----
2
The Alfa Romeo 33 Stradale is an extremely rare road car built by Alfa Romeo of Italy as only 18 are reported to have been made .
--> BLEU: 68.94063546511065
--> SARI: 51.64253878515556
--> The Alfa Romeo 33 Stradale is an extremely rare road car. Only 18 are reported to have been made.
--> The Alfa Romeo 33 Stradale is an extremely rare road car built by Alfa Romeo of Italy. Only 18 are reported to have been made.
-----
1
date of birth is required to be submitted.
--> BLEU: 0.0
--> SARI: 38.93518518518518
--> The birth date must be submitted.
--> A date of birth must be provided.
-----
1
" My water bill is $ 80 a month , my gas bill is $ 200 a month , my electricity bill is $ 45 a month , " said Edwards , adding that she 's " in the middle of a really bad divorc