In [1]:
import os
import json

import torch
import tqdm

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
data = json.load(open("Complete_dataset/test.json"))
files = os.listdir("Complete_dataset/CT json/")
files.remove(".DS_Store")

In [3]:
files_data = {file[:-5]:json.load(open(f"Complete_dataset/CT json/{file}")) for file in files}

In [4]:
data_expanded = []
for _id, value in data.items():
    temp = {}
    temp["id"] = _id
    p_nctid = value["Primary_id"]
    s_nctid = value.get("Secondary_id")
    section_id = value["Section_id"]
    statement = value["Statement"]
    primary_evidence = files_data[p_nctid][section_id]
    temp["statement"] = statement
    temp["primary_evidence"] = primary_evidence
    # temp["label"] = value["Label"]
    
    if s_nctid is not None:
        secondary_evidence = files_data[s_nctid][section_id]
        temp["secondary_evidence"] = secondary_evidence
    
    data_expanded.append(temp)

In [5]:
tokenizer = T5Tokenizer.from_pretrained("bigscience/T0pp",cache_dir="/mnt/data/huggingface_cache")

In [6]:
model = T5ForConditionalGeneration.from_pretrained("bigscience/T0pp", 
                                                   cache_dir="/mnt/data/huggingface_cache", 
                                                   device_map="auto",
                                                   )

In [32]:
def get_input_text(premise, hypothesis):
    options_prefix = "OPTIONS:\n- "
    separator = "\n- "
    options_ = options_prefix + f"{separator}".join(["Entailment","Contradiction"])
    return f"{premise} \n Question: Does this imply that {hypothesis}? Entailment or Contradiction?"
    # return f"{premise} \n Question: Does this imply that {hypothesis}? {options_} Answer:"

In [33]:
samples = []
for sample in data_expanded:
    primary_evidence = "".join([x.strip() for x in sample['primary_evidence']])
    sentence = f"Primary trial evidence are {primary_evidence}"
    secondary_evidence = sample.get("secondary_evidence")
    if secondary_evidence:
        secondary_evidence = "".join([x.strip() for x in sample['secondary_evidence']])
        sentence = f"{sentence} Secondary trial evidence are {secondary_evidence}"
    input_text = get_input_text(sentence, sample['statement'])
    temp = {"text":input_text, "label":0}
    samples.append(temp)

In [36]:
labels = []
pred = []
with torch.inference_mode():
    for sample in tqdm.tqdm(samples):
        labels.append(sample["label"])
        input_ids = tokenizer(sample["text"], return_tensors="pt",).input_ids.to("cuda")
        outputs = model.generate(input_ids)
        pred.append(tokenizer.decode(outputs[0]))
        # break

100%|██████████| 500/500 [07:16<00:00,  1.14it/s]


In [37]:
pred

['<pad> No</s>',
 '<pad> No</s>',
 '<pad> Contradiction</s>',
 '<pad> Contradiction</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Yes</s>',
 '<pad> Yes</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Yes</s>',
 '<pad> Contradiction</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Contradiction</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Entailment</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Contradiction</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Entailment</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> Yes</s>',
 '<pad> Yes</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad> No</s>',
 '<pad

In [38]:
pred = [p[5:][:-4].strip() for p in pred]

In [39]:
from collections import Counter

In [40]:
Counter(pred)

Counter({'No': 323, 'Contradiction': 48, 'Yes': 91, 'Entailment': 38})

In [41]:
prediction_dict = {}
for _id,pred_x in zip(data, pred):
    if pred_x == "Yes":
        pred_x = "Entailment"
    elif pred_x == "No":
        pred_x = "Contradiction"
    prediction_dict[str(_id)] = {"Prediction":pred_x}

In [42]:
json.dump(prediction_dict, open("results.json",'w'),indent=4)
!zip results_zeroshot_TOpp_prompt.zip results.json

  adding: results.json (deflated 74%)
