In [1]:
import os
import json

import torch
import tqdm

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
split = "train.json"
data = json.load(open(f"Complete_dataset/{split}"))
files = os.listdir("Complete_dataset/CT json/")
files.remove(".DS_Store")

In [3]:
files_data = {file[:-5]:json.load(open(f"Complete_dataset/CT json/{file}")) for file in files}

In [4]:
data_expanded = []
for _id, value in data.items():
    temp = {}
    temp["id"] = _id
    p_nctid = value["Primary_id"]
    s_nctid = value.get("Secondary_id")
    section_id = value["Section_id"]
    statement = value["Statement"]
    primary_evidence = files_data[p_nctid][section_id]
    temp["statement"] = statement
    temp["primary_evidence"] = primary_evidence
    temp["label"] = value["Label"]
    
    if s_nctid is not None:
        secondary_evidence = files_data[s_nctid][section_id]
        temp["secondary_evidence"] = secondary_evidence
    
    data_expanded.append(temp)

In [5]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")

In [6]:
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", cache_dir="/mnt/data/huggingface_cache", device_map="auto",)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def get_input_text(premise, hypothesis):
    options_prefix = "OPTIONS:\n- "
    separator = "\n- "
    options_ = options_prefix + f"{separator}".join(["Entailment","Contradiction"])
    return f"{premise} \n Question: Does this imply that {hypothesis}? {options_}"


In [8]:
samples = []
for sample in data_expanded:
    primary_evidence = "".join(sample['primary_evidence'])
    sentence = f"Primary trial evidence are {primary_evidence}"
    secondary_evidence = sample.get("secondary_evidence")
    if secondary_evidence:
        secondary_evidence = "".join(sample['secondary_evidence'])
        sentence = f"{sentence} Secondary trial evidence are {secondary_evidence}"
    input_text = get_input_text(sentence, sample['statement'])
    temp = {"text":input_text, "label":sample['label']}
    samples.append(temp)

In [9]:
labels = []
pred = []
with torch.inference_mode():
    for sample in tqdm.tqdm(samples):
        labels.append(sample["label"])
        input_ids = tokenizer(sample["text"], return_tensors="pt",).input_ids.to("cuda")
        outputs = model.generate(input_ids)
        pred.append(tokenizer.decode(outputs[0]))

  0%|          | 1/1700 [00:01<50:26,  1.78s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1258 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1700/1700 [11:05<00:00,  2.55it/s]


In [10]:
pred

['<pad> Contradiction</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Entailment</s>',
 '<pad> Contradiction</s>',
 '<pad> Entailment</s>',
 '<pad> Contra

In [11]:
pred = [p[5:][:-4].strip() for p in pred]

In [16]:
set(pred)

{'Contradiction', 'Entailment', 'No', 'Yes'}

In [13]:
labels[0:10]

['Contradiction',
 'Contradiction',
 'Entailment',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Entailment',
 'Contradiction']

In [14]:
from sklearn.metrics import f1_score

In [15]:
f1_score(pred, labels,pos_label="Entailment")

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [18]:
prediction_dict = {}
for _id,pred_x in zip(data, pred):
    prediction_dict[str(_id)] = {"Prediction":pred_x}

In [None]:
json.dump(prediction_dict, open("results.json",'w'),indent=4)
!zip results_xl_zero_dev.zip results.json

updating: results.json (deflated 73%)


In [19]:
from sklearn.metrics import f1_score
uuid_list = list(prediction_dict.keys())
results_pred = []
gold_labels = []
for i in range(len(uuid_list)):
    if prediction_dict[uuid_list[i]]["Prediction"] in ["Entailment", "Yes"]:
        results_pred.append(1)
    else:
        results_pred.append(0)
    if data[uuid_list[i]]["Label"] in ["Entailment", "No"]:
        gold_labels.append(1)
    else:
        gold_labels.append(0)
f1_score(gold_labels,results_pred)

0.6754896770778189