In [None]:
# Install the OpenAI client that the GPT-3.5 cells import.
!pip install openai
# Mount Google Drive so the files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; later cells open their data files with relative paths.
%cd /content/drive/MyDrive/CSE635proj
# List the folder contents as a quick sanity check.
!ls

In [None]:
!pip install transformers datasets
import collections
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_scheduler
from tqdm.notebook import tqdm_notebook as tqdm
import transformers
from sklearn.metrics import accuracy_score
import json
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score,precision_score,recall_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
!huggingface-cli login

# simple TF-IDF approach
Test the performance of the simple TF-IDF baseline provided by SemEval.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy

In [None]:
# Load the dev split and collect every statement in file order.
with open('dev.json', 'r', encoding='utf-8') as json_file:
    dev = json.load(json_file)
uuid_list = list(dev.keys())
statements = [dev[uid]["Statement"] for uid in uuid_list]

# TF-IDF entailment prediction baseline.
# For each example a TF-IDF vocabulary is fitted on the referenced section of the trial and
# the cosine distance between the statement and every entry of that section is computed.
# An average distance above 0.9 is predicted as "Contradiction", anything else as "Entailment".
Results = {}

for i, uid in enumerate(uuid_list):
    example = dev[uid]
    section_id = example["Section_id"]

    with open('CT json/' + example["Primary_id"] + ".json", 'r', encoding='utf-8') as json_file:
        primary_ctr = json.load(json_file)

    primary_section = primary_ctr[section_id]
    vectorizer = TfidfVectorizer().fit(primary_section)

    X_s = vectorizer.transform([statements[i]])
    X_p = vectorizer.transform(primary_section)
    primary_scores = cosine_distances(X_s, X_p)

    if example["Type"] == "Comparison":
        with open('CT json/' + example["Secondary_id"] + ".json", 'r', encoding='utf-8') as json_file:
            secondary_ctr = json.load(json_file)

        # The secondary trial gets its own vocabulary, fitted on its own section.
        secondary_section = secondary_ctr[section_id]
        vectorizer = TfidfVectorizer().fit(secondary_section)

        X_s = vectorizer.transform([statements[i]])
        X_p = vectorizer.transform(secondary_section)
        secondary_scores = cosine_distances(X_s, X_p)

        # Average the distances of all entries from both trials (secondary entries first).
        combined_scores = list(secondary_scores[0]) + list(primary_scores[0])
        score = numpy.average(combined_scores)
    else:
        # Single-trial statement: average over the primary section only.
        score = numpy.average(primary_scores)

    # Distance above 0.9 -> Contradiction, otherwise Entailment.
    Prediction = "Contradiction" if score > 0.9 else "Entailment"
    Results[str(uid)] = {"Prediction": Prediction}

# Evaluation: encode Entailment as 1 and everything else as 0 for both predictions and gold.
gold = dev
results = Results
uuid_list = list(results.keys())

results_pred = [1 if results[uid]["Prediction"] == "Entailment" else 0 for uid in uuid_list]
gold_labels = [1 if gold[uid]["Label"] == "Entailment" else 0 for uid in uuid_list]

f_score = f1_score(gold_labels, results_pred)
p_score = precision_score(gold_labels, results_pred)
r_score = recall_score(gold_labels, results_pred)

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

In [None]:
# Zero-shot evaluation of the BioLinkBERT MNLI/SNLI checkpoint on the dev split.
with open('dev.json', 'r', encoding='utf-8') as json_file:
    dev = json.load(json_file)

uuid_list = list(dev.keys())
# Statements of the dev split, in the same order as uuid_list.
statements = [dev[uid]["Statement"] for uid in uuid_list]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("cnut1648/biolinkbert-large-mnli-snli")
model = AutoModelForSequenceClassification.from_pretrained("cnut1648/biolinkbert-large-mnli-snli").to(device)

results = {}
for i, uid in enumerate(uuid_list):
    example = dev[uid]
    with open('CT json/' + example["Primary_id"] + ".json", 'r', encoding='utf-8') as json_file:
        primary_ctr = json.load(json_file)
    # NOTE(review): only the primary trial is read, also for "Comparison" examples.
    primary_section = primary_ctr[example["Section_id"]]

    # Pair every entry of the section (premise) with the same statement (hypothesis).
    statements_repeat = [statements[i]] * len(primary_section)
    inputs = tokenizer(primary_section, statements_repeat, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # logits holds one row per section entry. torch.argmax without `dim` would index the
    # flattened matrix, so the rows are first averaged into a single per-class score vector
    # and the argmax is taken over the classes only.
    logits = outputs.logits
    predicted_class = torch.argmax(logits.mean(dim=0)).item()
    # NOTE(review): class index 0 is treated as "Contradiction" and every other index as
    # "Entailment"; confirm this against model.config.id2label for this checkpoint.
    prediction = "Contradiction" if predicted_class == 0 else "Entailment"

    results[str(uid)] = {"Prediction": prediction}

# Score the predictions against the gold labels (1 = Entailment, 0 = anything else).
gold = dev
uuid_list = list(results.keys())
results_pred = [1 if results[uid]["Prediction"] == "Entailment" else 0 for uid in uuid_list]
gold_labels = [1 if gold[uid]["Label"] == "Entailment" else 0 for uid in uuid_list]
misclassified_examples = []  # not populated in this cell

f_score = f1_score(gold_labels, results_pred, zero_division='warn')
p_score = precision_score(gold_labels, results_pred, zero_division='warn')
r_score = recall_score(gold_labels, results_pred, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs,
# so the cell indexing below cannot go out of bounds.
conf_matrix = confusion_matrix(gold_labels, results_pred, labels=[0, 1])
print('Confusion Matrix:')
print(conf_matrix)
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
TP = conf_matrix[1, 1]

print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')
print(f'True Positives (TP): {TP}')

# Row/column 0 is Contradiction and 1 is Entailment, matching the encoding above.
class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)

disp.plot()
plt.show()

Fine-tuned BioLinkBERT - Simple Test on Dev dataset

In [None]:
# Fine-tune the classifier loaded in the previous cell on the statements of the training split.
with open('train.json', 'r', encoding='utf-8') as json_file:
    train_data = json.load(json_file)

# Statements and their gold labels, in file order.
statements = [entry['Statement'] for entry in train_data.values()]
labels = [entry['Label'] for entry in train_data.values()]

# Integer class ids used as training targets.
label_map = {'Entailment': 0, 'Contradiction': 1}
labels_mapped = [label_map[name] for name in labels]

# First hold out 20% as test data, then 20% of the remainder as validation data.
train_stmt_tmp, test_stmt, train_label_tmp, test_label = train_test_split(
    statements, labels_mapped, test_size=0.2, random_state=42)
train_stmt, valid_stmt, train_label, valid_label = train_test_split(
    train_stmt_tmp, train_label_tmp, test_size=0.2, random_state=42)

# Only the statement text is tokenized here (no trial section is attached).
tokenized_train_stmt = tokenizer(train_stmt, padding=True, truncation=True, return_tensors="pt")
tokenized_valid_stmt = tokenizer(valid_stmt, padding=True, truncation=True, return_tensors="pt")
tokenized_test_stmt = tokenizer(test_stmt, padding=True, truncation=True, return_tensors="pt")

train_dataset = TensorDataset(
    tokenized_train_stmt['input_ids'], tokenized_train_stmt['attention_mask'], torch.tensor(train_label))
valid_dataset = TensorDataset(
    tokenized_valid_stmt['input_ids'], tokenized_valid_stmt['attention_mask'], torch.tensor(valid_label))
test_dataset = TensorDataset(
    tokenized_test_stmt['input_ids'], tokenized_test_stmt['attention_mask'], torch.tensor(test_label))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Training with num of epochs = 10; the linear schedule spans all optimizer steps.
num_epochs = 10
optimizer = optim.Adam(model.parameters(), lr=3e-05)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)

model.train()
for epoch in range(num_epochs):
    print(f'------------------------------[EPOCH {epoch}]------------------------------')

    # ---- training pass ----
    model.train()
    train_loss_list = []
    train_correct = 0
    train_samples = 0
    for batch in train_dataloader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        train_loss = outputs.loss
        train_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss_list.append(train_loss.item())
        pred = outputs.logits.argmax(dim=1)
        train_correct += (pred == batch_labels).sum().item()
        train_samples += batch_labels.size(0)

        # Progress line whenever the running sample count is a multiple of 10.
        if train_samples % 10 == 0:
            average_loss = sum(train_loss_list) / len(train_loss_list)
            accuracy = train_correct / train_samples
            print(f"Batch [{train_samples}/{len(train_dataset)}] Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")

    epoch_train_loss = sum(train_loss_list) / len(train_loss_list)
    epoch_train_acc = train_correct / train_samples
    print(f'train loss: {epoch_train_loss:.4f} | train accuracy: {epoch_train_acc:.4f}')

    # ---- validation pass (no gradients) ----
    model.eval()
    valid_loss_list = []
    valid_correct = 0
    valid_samples = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            valid_loss = outputs.loss
            pred = outputs.logits.argmax(dim=1)
            valid_correct += (pred == batch_labels).sum().item()
            valid_samples += batch_labels.size(0)
            valid_loss_list.append(valid_loss.item())
    epoch_valid_loss = sum(valid_loss_list) / len(valid_loss_list)
    epoch_valid_acc = valid_correct / valid_samples
    print(f'validation loss: {epoch_valid_loss:.4f} | validation accuracy: {epoch_valid_acc:.4f}')

# Testing on the held-out 20% of the training file.
model.eval()
test_loss_list = []
test_correct = 0
test_samples = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        test_loss = outputs.loss
        pred = outputs.logits.argmax(dim=1)
        test_correct += (pred == batch_labels).sum().item()
        test_samples += batch_labels.size(0)
        test_loss_list.append(test_loss.item())

total_test_loss = sum(test_loss_list) / len(test_loss_list)
total_test_acc = test_correct / test_samples
print(f'test loss: {total_test_loss:.4f} | test accuracy: {total_test_acc:.4f}')

# Testing on the dev dataset after fine-tuning with the learning-rate scheduler.
# Each entry of the referenced primary-trial section is paired with the dev statement, the
# per-entry logits are averaged, and the highest-scoring class becomes the prediction.
# NOTE(review): fine-tuning above tokenizes the statement alone, while this loop feeds
# (section entry, statement) pairs; confirm that this mismatch is intended.
results1 = {}
for uid in uuid_list:
    example = dev[uid]
    with open('CT json/' + example["Primary_id"] + ".json", 'r', encoding='utf-8') as json_file:
        primary_ctr = json.load(json_file)
    primary_section = primary_ctr[example["Section_id"]]

    # Read the statement from the dev record itself: `statements` was reassigned to the
    # training statements earlier in this cell, so indexing it with a dev position would
    # pair the section with an unrelated sentence.
    statements_repeat = [example["Statement"]] * len(primary_section)
    inputs = tokenizer(primary_section, statements_repeat, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # One row of logits per section entry: average the rows, then take the argmax over the
    # classes. A bare torch.argmax(logits) would index the flattened matrix instead.
    logits = outputs.logits
    predicted_class = torch.argmax(logits.mean(dim=0)).item()
    # Decode with the same mapping used for the training targets (label_map: Entailment -> 0).
    prediction = "Entailment" if predicted_class == label_map['Entailment'] else "Contradiction"

    results1[str(uid)] = {"Prediction": prediction}

# Score the predictions against the gold labels (1 = Entailment, 0 = anything else).
gold = dev
uuid_list = list(results1.keys())

results_pred = [1 if results1[uid]["Prediction"] == "Entailment" else 0 for uid in uuid_list]
gold_labels = [1 if gold[uid]["Label"] == "Entailment" else 0 for uid in uuid_list]
misclassified_examples = []  # not populated in this cell

f_score = f1_score(gold_labels, results_pred, zero_division='warn')
p_score = precision_score(gold_labels, results_pred, zero_division='warn')
r_score = recall_score(gold_labels, results_pred, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs.
conf_matrix1 = confusion_matrix(gold_labels, results_pred, labels=[0, 1])
print('Confusion Matrix:')
# Print the matrix computed here, not the one left over from the zero-shot cell.
print(conf_matrix1)
TN = conf_matrix1[0, 0]
FP = conf_matrix1[0, 1]
FN = conf_matrix1[1, 0]
TP = conf_matrix1[1, 1]

print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')
print(f'True Positives (TP): {TP}')

# Row/column 0 is Contradiction and 1 is Entailment, matching the encoding above.
class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix1, display_labels=class_labels)

disp.plot()
plt.show()

# BERT baseline

BERT - Simple Test on Dev dataset

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Evaluation of plain BERT (before any fine-tuning) on the dev split.
with open('dev.json', 'r', encoding='utf-8') as json_file:
    dev = json.load(json_file)

uuid_list = list(dev.keys())
# Statements of the dev split, in the same order as uuid_list.
statements = [dev[uid]["Statement"] for uid in uuid_list]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
# NOTE(review): bert-base-uncased is presumably a pre-training checkpoint without a trained
# classification head, so the scores of this cell may depend on random initialisation - confirm.
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased").to(device)

results = {}
for i, uid in enumerate(uuid_list):
    example = dev[uid]
    with open('CT json/' + example["Primary_id"] + ".json", 'r', encoding='utf-8') as json_file:
        primary_ctr = json.load(json_file)
    # NOTE(review): only the primary trial is read, also for "Comparison" examples.
    primary_section = primary_ctr[example["Section_id"]]

    # Pair every entry of the section with the same statement.
    statements_repeat = [statements[i]] * len(primary_section)
    inputs = tokenizer(primary_section, statements_repeat, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # logits holds one row per section entry. torch.argmax without `dim` would index the
    # flattened matrix, so the rows are first averaged into a single per-class score vector
    # and the argmax is taken over the classes only.
    logits = outputs.logits
    predicted_class = torch.argmax(logits.mean(dim=0)).item()
    # Class index 0 is reported as "Contradiction", every other index as "Entailment".
    prediction = "Contradiction" if predicted_class == 0 else "Entailment"

    results[str(uid)] = {"Prediction": prediction}

# Score the predictions against the gold labels (1 = Entailment, 0 = anything else).
gold = dev
uuid_list = list(results.keys())
results_pred = [1 if results[uid]["Prediction"] == "Entailment" else 0 for uid in uuid_list]
gold_labels = [1 if gold[uid]["Label"] == "Entailment" else 0 for uid in uuid_list]
misclassified_examples = []  # not populated in this cell

f_score = f1_score(gold_labels, results_pred, zero_division='warn')
p_score = precision_score(gold_labels, results_pred, zero_division='warn')
r_score = recall_score(gold_labels, results_pred, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs,
# so the cell indexing below cannot go out of bounds.
conf_matrix = confusion_matrix(gold_labels, results_pred, labels=[0, 1])
print('Confusion Matrix:')
print(conf_matrix)
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
TP = conf_matrix[1, 1]

print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')
print(f'True Positives (TP): {TP}')

# Row/column 0 is Contradiction and 1 is Entailment, matching the encoding above.
class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)

disp.plot()
plt.show()

Fine-tuned BERT - Simple Test on Dev dataset

In [None]:
# Fine-tune the BERT classifier loaded in the previous cell on the training statements.
with open('train.json', 'r', encoding='utf-8') as json_file:
    train_data = json.load(json_file)

# Statements and their gold labels, in file order.
statements = [entry['Statement'] for entry in train_data.values()]
labels = [entry['Label'] for entry in train_data.values()]

# Integer class ids used as training targets.
label_map = {'Entailment': 0, 'Contradiction': 1}
labels_mapped = [label_map[name] for name in labels]

# First hold out 20% as test data, then 20% of the remainder as validation data.
train_stmt_tmp, test_stmt, train_label_tmp, test_label = train_test_split(
    statements, labels_mapped, test_size=0.2, random_state=42)
train_stmt, valid_stmt, train_label, valid_label = train_test_split(
    train_stmt_tmp, train_label_tmp, test_size=0.2, random_state=42)

# Only the statement text is tokenized here (no trial section is attached).
tokenized_train_stmt = tokenizer(train_stmt, padding=True, truncation=True, return_tensors="pt")
tokenized_valid_stmt = tokenizer(valid_stmt, padding=True, truncation=True, return_tensors="pt")
tokenized_test_stmt = tokenizer(test_stmt, padding=True, truncation=True, return_tensors="pt")

train_dataset = TensorDataset(
    tokenized_train_stmt['input_ids'], tokenized_train_stmt['attention_mask'], torch.tensor(train_label))
valid_dataset = TensorDataset(
    tokenized_valid_stmt['input_ids'], tokenized_valid_stmt['attention_mask'], torch.tensor(valid_label))
test_dataset = TensorDataset(
    tokenized_test_stmt['input_ids'], tokenized_test_stmt['attention_mask'], torch.tensor(test_label))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Training with num of epochs = 10; the linear schedule spans all optimizer steps.
num_epochs = 10
optimizer = optim.Adam(model.parameters(), lr=1e-4)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)

model.train()
for epoch in range(num_epochs):
    print(f'------------------------------[EPOCH {epoch}]------------------------------')

    # ---- training pass ----
    model.train()
    train_loss_list = []
    train_correct = 0
    train_samples = 0
    for batch in train_dataloader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        train_loss = outputs.loss
        train_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss_list.append(train_loss.item())
        pred = outputs.logits.argmax(dim=1)
        train_correct += (pred == batch_labels).sum().item()
        train_samples += batch_labels.size(0)

        # Progress line whenever the running sample count is a multiple of 10.
        if train_samples % 10 == 0:
            average_loss = sum(train_loss_list) / len(train_loss_list)
            accuracy = train_correct / train_samples
            print(f"Batch [{train_samples}/{len(train_dataset)}] Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")

    epoch_train_loss = sum(train_loss_list) / len(train_loss_list)
    epoch_train_acc = train_correct / train_samples
    print(f'train loss: {epoch_train_loss:.4f} | train accuracy: {epoch_train_acc:.4f}')

    # ---- validation pass (no gradients) ----
    model.eval()
    valid_loss_list = []
    valid_correct = 0
    valid_samples = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            valid_loss = outputs.loss
            pred = outputs.logits.argmax(dim=1)
            valid_correct += (pred == batch_labels).sum().item()
            valid_samples += batch_labels.size(0)
            valid_loss_list.append(valid_loss.item())
    epoch_valid_loss = sum(valid_loss_list) / len(valid_loss_list)
    epoch_valid_acc = valid_correct / valid_samples
    print(f'validation loss: {epoch_valid_loss:.4f} | validation accuracy: {epoch_valid_acc:.4f}')

# Testing on the held-out 20% of the training file.
model.eval()
test_loss_list = []
test_correct = 0
test_samples = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        test_loss = outputs.loss
        pred = outputs.logits.argmax(dim=1)
        test_correct += (pred == batch_labels).sum().item()
        test_samples += batch_labels.size(0)
        test_loss_list.append(test_loss.item())

total_test_loss = sum(test_loss_list) / len(test_loss_list)
total_test_acc = test_correct / test_samples
print(f'test loss: {total_test_loss:.4f} | test accuracy: {total_test_acc:.4f}')

# Testing on the dev dataset after fine-tuning with the learning-rate scheduler.
# Each entry of the referenced primary-trial section is paired with the dev statement, the
# per-entry logits are averaged, and the highest-scoring class becomes the prediction.
# NOTE(review): fine-tuning above tokenizes the statement alone, while this loop feeds
# (section entry, statement) pairs; confirm that this mismatch is intended.
results1 = {}
for uid in uuid_list:
    example = dev[uid]
    with open('CT json/' + example["Primary_id"] + ".json", 'r', encoding='utf-8') as json_file:
        primary_ctr = json.load(json_file)
    primary_section = primary_ctr[example["Section_id"]]

    # Read the statement from the dev record itself: `statements` was reassigned to the
    # training statements earlier in this cell, so indexing it with a dev position would
    # pair the section with an unrelated sentence.
    statements_repeat = [example["Statement"]] * len(primary_section)
    inputs = tokenizer(primary_section, statements_repeat, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # One row of logits per section entry: average the rows, then take the argmax over the
    # classes. A bare torch.argmax(logits) would index the flattened matrix instead.
    logits = outputs.logits
    predicted_class = torch.argmax(logits.mean(dim=0)).item()
    # Decode with the same mapping used for the training targets (label_map: Entailment -> 0).
    prediction = "Entailment" if predicted_class == label_map['Entailment'] else "Contradiction"

    results1[str(uid)] = {"Prediction": prediction}

# Score the predictions against the gold labels (1 = Entailment, 0 = anything else).
gold = dev
uuid_list = list(results1.keys())

results_pred = [1 if results1[uid]["Prediction"] == "Entailment" else 0 for uid in uuid_list]
gold_labels = [1 if gold[uid]["Label"] == "Entailment" else 0 for uid in uuid_list]
misclassified_examples = []  # not populated in this cell

f_score = f1_score(gold_labels, results_pred, zero_division='warn')
p_score = precision_score(gold_labels, results_pred, zero_division='warn')
r_score = recall_score(gold_labels, results_pred, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs.
conf_matrix1 = confusion_matrix(gold_labels, results_pred, labels=[0, 1])
print('Confusion Matrix:')
# Print the matrix computed here, not the one left over from the zero-shot cell.
print(conf_matrix1)
TN = conf_matrix1[0, 0]
FP = conf_matrix1[0, 1]
FN = conf_matrix1[1, 0]
TP = conf_matrix1[1, 1]

print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')
print(f'True Positives (TP): {TP}')

# Row/column 0 is Contradiction and 1 is Entailment, matching the encoding above.
class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix1, display_labels=class_labels)

disp.plot()
plt.show()

# GPT-3.5-turbo  baseline

A default gpt-3.5-turbo model that receives a prompt and returns its response.

In [5]:
import os

import openai

# Read the key from the environment so that no secret is stored in the notebook.
# A missing OPENAI_API_KEY raises KeyError here instead of failing later at request time.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_completion(prompt, model="gpt-3.5-turbo-0125"):
    """Send `prompt` as a single user message and return the text of the first choice.

    Args:
        prompt: Text placed in the user message.
        model: Name of the chat model to query.

    Returns:
        The `content` of the first choice's message in the response.
    """
    chat = [{"role": "user", "content": prompt}]
    # temperature=0 is passed on every request.
    reply = client.chat.completions.create(model=model, messages=chat, temperature=0)
    return reply.choices[0].message.content

Simple test on Dev dataset

In [None]:
# Quick look at the raw GPT-3.5 answers for the first dev examples.
# NOTE(review): this cell reads from 'data/...' while earlier cells read 'dev.json' and
# 'CT json/...' directly; confirm which layout matches the working directory.
with open('data/dev.json', encoding='utf-8') as dev_file:
    data = json.load(dev_file)

for i, (key, value) in enumerate(data.items()):
  section = value['Section_id']
  primary = value['Primary_id']
  statement = value['Statement']
  label = value['Label']
  if value['Type']=='Single':
    # Single-trial statement: only the primary trial's section goes into the prompt.
    file_CT = f'data/CT json/{primary}.json'
    with open(file_CT, encoding='utf-8') as ct_file:
      CT = json.load(ct_file)
    prompt = f'''
      Based on the file and statement, determine whether the statement is a Contradiction or Entailment.

      File:
      ```{CT[section]}```
      Statement:
      ```{statement}```
      '''
  else:
    # Comparison statement: the same section of both trials goes into the prompt.
    secondary = value['Secondary_id']
    file_CT1 = f'data/CT json/{primary}.json'
    with open(file_CT1, encoding='utf-8') as ct_file:
      CT1 = json.load(ct_file)
    file_CT2 = f'data/CT json/{secondary}.json'
    with open(file_CT2, encoding='utf-8') as ct_file:
      CT2 = json.load(ct_file)
    prompt = f'''
      Based on the file and statement, determine whether the statement is a Contradiction or Entailment.

      File:
      ```{CT1[section]}```
      ```{CT2[section]}```
      Statement:
      ```{statement}```
      '''
  # Query the model for both example types, so the printed response always belongs to the
  # current example rather than to whatever was requested last.
  response = get_completion(prompt)
  print('True label: ',label)
  print('Response: ',response)
  # Stop after the example with index 21 (22 examples in total).
  if i>20:
    break


# Prompt engineering
This section includes many alterations to the API settings and the prompt content; most of the past alterations could not be preserved in the code.

Single case development area

In [None]:
# System message sent ahead of every user prompt by get_completion1.
role = 'You are an assistant that is good at analyzing clinical trial records. When you receive a statement that makes a claim about a trial or the relation between two trials, you will read the provided information for the trials and determine whether the information support or contradict the statement.'


def get_completion1(prompt, model="gpt-3.5-turbo-0125"):
    """Query the chat model with the predefined system role followed by `prompt`.

    Args:
        prompt: Text placed in the user message.
        model: Name of the chat model to query.

    Returns:
        The `content` of the first choice's message in the response.
    """
    chat = [
        {'role': 'system', 'content': role},
        {"role": "user", "content": prompt},
    ]
    # temperature=0 is passed on every request.
    reply = client.chat.completions.create(model=model, messages=chat, temperature=0)
    return reply.choices[0].message.content

# Scratch area for trying prompt variants on one hand-picked example.
# section_description maps each section name to a short description that is
# interpolated into prompt2 below.
section_description = {'Eligibility':'a set of conditions for patients to be allowed to take part in the clinical trial.',
                          'Intervention': 'information concerning the treatment type, dosage, frequency, and duration being studied.',
                          'Results': 'number of participants in the trial, outcome measures, units, and the results.',
                          'Adverse Events': 'signs and symptoms observed in patients during the clinical trial.'}
# Section name used to look up the description and to fill the {section} placeholder.
section = 'Results'
# Hard-coded section text used as the test input for every prompt variant in this cell.
file = f'''
    [
        "Outcome Measurement: ",
        "  Event-free Survival",
        "  Event free survival, the primary endpoint of this study, is defined as the time from randomization to the time of documented locoregional or distant recurrence, new primary breast cancer, or death from any cause.",
        "  Time frame: 5 years",
        "Results 1: ",
        "  Arm/Group Title: Exemestane",
        "  Arm/Group Description: Patients receive oral exemestane (25 mg) once daily for 5 years.",
        "  exemestane: Given orally",
        "  Overall Number of Participants Analyzed: 3789",
        "  Measure Type: Number",
        "  Unit of Measure: percentage of participants  88        (87 to 89)",
        "Results 2: ",
        "  Arm/Group Title: Anastrozole",
        "  Arm/Group Description: Patients receive oral anastrozole (1 mg) once daily for 5 years.",
        "  anastrozole: Given orally",
        "  Overall Number of Participants Analyzed: 3787",
        "  Measure Type: Number",
        "  Unit of Measure: percentage of participants  89        (88 to 90)"
    ]
'''
# Variant 1: step-by-step instructions with a structured Result / Confidence / Reason format.
# Its statement is written inline (13.2%) and differs from the `statement` variable below (13%).
# This variant is built but not sent in this cell.
prompt = f"""
Your task is to determine whether a given file supports or contradicts a statement.

Specifically, you should perform the following actions:
1 - Read the following json file delimited by triple backticks. This file is the {section} of a primary trial, which may contains two cohorts.
2 - Read the statement: "there is a 13.2% difference between the results from the two the primary trial cohorts"
3 - Determine whether the file is an entailment or a contradiction to the statement. Please note that the difference between numerical numbers count as a contradiction. Provide the confidence in percentage of your answer, and the reason you make this decision.

Use the following format:
Result:
```
Entailment or Contradiction
```
Confidence:
```
confidence here
```
Reason:
```
Your reason here
```

File:
```{file}```
"""

# Statement shared by prompt1 and prompt2.
statement = 'there is a 13% difference between the results from the two the primary trial cohorts'
# Variant 2: minimal question form. Built but not sent in this cell.
prompt1 = f"""
statement: {statement}
File: {file}
Is the statement a contradiction or an entailment of the file?
"""
# Variant 3: numbered actions plus the section description; this is the one sent below.
prompt2 = f"""
Perform the following actions:
  1 - Read the following file delimited by triple backticks.
  2 - The file contains {section_description[section]}
  3 - Read the statement delimited by triple backticks.
  4 - Determine whether the statement supports the file or contradicts the file.

What is your reason?
File:
```{file}```
Statement:
```{statement}```

"""
# Alternative instruction wordings kept for reference:
# Provide your answer with a single word:
  # Entailment or Contradiction
# Is the statement an entailment or a contradiction of the file?
# Determine whether the statement entails the file or contradicts the file.
# Determine the inference relation between the statement and the file.
# Send prompt2 together with the system role and show the raw answer.
response = get_completion1(prompt2)
print(response)

Evaluating on the dev dataset

In [None]:
# Run the engineered prompt over the whole dev split and keep (gold label, raw response)
# pairs, separately for single-trial and comparison statements.
# The files are opened with `with` so every handle is closed after loading, and the
# name `dev` is no longer rebound to a file object.
with open('data/dev.json', encoding='utf-8') as dev_file:
  data = json.load(dev_file)
results_single = []
results_comparison = []
# Short description of each section, interpolated into the prompts.
section_description = {'Eligibility':'a set of conditions for patients to be allowed to take part in the clinical trial.',
                          'Intervention': 'information concerning the treatment type, dosage, frequency, and duration being studied.',
                          'Results': 'number of participants in the trial, outcome measures, units, and the results.',
                          'Adverse Events': 'signs and symptoms observed in patients during the clinical trial.'}
for i,(key,value) in enumerate(data.items()):
  section = value['Section_id']
  primary = value['Primary_id']
  statement = value['Statement']
  label = value['Label']
  if value['Type']=='Single':
    file_CT = f'data/CT json/{primary}.json'
    with open(file_CT, encoding='utf-8') as ct_file:
      CT = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the following file delimited by triple backticks.
      2 - The file contains {section_description[section]}
      3 - Read the statement delimited by triple backticks.
      4 - Determine whether the file supports or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

      Use the following format:
        <Contradiction or Entailment>

      File:
      ```{CT[section]}```
      Statement:
      ```{statement}```
      '''
    # NOTE(review): get_completion sends no system role; confirm whether get_completion1
    # was intended here.
    response = get_completion(prompt)
    # The response is stored exactly as returned (no stripping or normalisation).
    results_single.append((label,response))
  else:
    secondary = value['Secondary_id']
    file_CT1 = f'data/CT json/{primary}.json'
    with open(file_CT1, encoding='utf-8') as ct_file:
      CT1 = json.load(ct_file)
    file_CT2 = f'data/CT json/{secondary}.json'
    with open(file_CT2, encoding='utf-8') as ct_file:
      CT2 = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the two files delimited by triple backticks. They contain {section_description[section]}
      2 - Read the following statement: {statement}.
      3 - The statement is about the comparison between the primary trial and the secondary trial provided.
      4 - Determine whether the files support or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

    Use the following format:
        <Contradiction or Entailment>

    Primary trial:
    ```{CT1[section]}```
    Secondary trial:
    ```{CT2[section]}```
    '''
    response = get_completion(prompt)
    results_comparison.append((label,response))


In [None]:

# Tally the confusion counts over all stored (gold label, response) pairs, treating
# 'Entailment' as the positive class. Comparison results are processed before single ones.
TP = FP = FN = TN = 0
targets = []
results = []
for gold_label, answer in results_comparison + results_single:
    targets.append(gold_label)
    results.append(answer)
    # The response is compared verbatim with the gold label (no stripping).
    if gold_label == answer:
        if gold_label == 'Entailment':
            TP += 1
        else:
            TN += 1
    elif gold_label == 'Entailment':
        # Gold is Entailment but the response is anything else.
        FN += 1
    else:
        # Gold is not Entailment and the response differs from it.
        FP += 1
print(TP,FP,FN,TN)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of gold labels (`targets`) against raw model responses (`results`).
cm = confusion_matrix(targets, results)
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Contradiction','Entailment'],
)
cm_display.plot()

Evaluating with CoT on the dev dataset

In [None]:
# Evaluate chain-of-thought (CoT) prompting on the dev set.
# For every statement a prompt is built from the relevant trial section(s), the
# CoT cues are appended, and the text before "Reason" in the model response is
# kept as the prediction.

# Context managers close every file handle as soon as its JSON has been parsed.
with open('data/dev.json') as dev_file:
  data = json.load(dev_file)
labels = []
preds = []
response = ''

for value in data.values():
  section = value['Section_id']
  primary = value['Primary_id']
  statement = value['Statement']
  label = value['Label']
  if value['Type']=='Single':
    file_CT = f'CT json/{primary}.json'
    with open(file_CT) as ct_file:
      CT = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the following file delimited by triple backticks.
      2 - The file contains {section_description[section]}
      3 - Read the statement delimited by triple backticks.
      4 - Determine whether the file supports or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

      Use the following format:
        <Contradiction or Entailment>
      Reason:
        Your reason here

      File:
      ```{CT[section]}```
      Statement:
      ```{statement}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the file.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]
  else:
    secondary = value['Secondary_id']
    file_CT1 = f'CT json/{primary}.json'
    with open(file_CT1) as ct_file:
      CT1 = json.load(ct_file)
    file_CT2 = f'CT json/{secondary}.json'
    with open(file_CT2) as ct_file:
      CT2 = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the two files delimited by triple backticks. They contain {section_description[section]}
      2 - Read the following statement: {statement}.
      3 - The statement is about the comparison between the primary trial and the secondary trial provided.
      4 - Determine whether the files support or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

    Use the following format:
        <Contradiction or Entailment>
    Reason：
      Your reason here

    Primary trial:
    ```{CT1[section]}```
    Secondary trial:
    ```{CT2[section]}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the files.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]

  # Only the response to the prompt carrying all CoT cues is ever used, so the
  # full prompt is built once and sent in a single request instead of issuing
  # one request per cue and discarding the intermediate responses.
  # NOTE(review): get_completion is defined outside this cell; this assumes each
  # call is independent of earlier calls (no conversation state) - confirm.
  prompt_tmp = '\n'.join([prompt] + CoT_prompts)
  response = get_completion(prompt_tmp)

  labels.append(label)
  # Keep only the verdict part that precedes the model's "Reason" section.
  preds.append(response.split("Reason")[0])

label_map = {'Contradiction': 0, 'Entailment': 1}
labels_new = [label_map[label] for label in labels]
# Any response that does not mention 'Contradiction' is counted as Entailment.
preds_new = [0 if 'Contradiction' in pr else 1 for pr in preds]

# labels=[0, 1] keeps the matrix 2x2 (matching the two display labels) even if
# one class never occurs in the gold labels or the predictions.
conf_matrix_cot = confusion_matrix(labels_new, preds_new, labels=[0, 1])
f_score = f1_score(labels_new, preds_new, zero_division='warn')
p_score = precision_score(labels_new, preds_new, zero_division='warn')
r_score = recall_score(labels_new, preds_new, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_cot, display_labels=class_labels)

disp.plot()
plt.show()

# Finetuning
Training data file creation

In [None]:
import json
# Build the chat-format JSONL files used for fine-tuning: one JSON object per
# line with a system message (`role`), a user message (statement plus trial
# section text) and the gold label as the assistant message.

# Context managers close each input handle as soon as its JSON has been parsed.
with open('data/train.json') as train:
  train_data = json.load(train)

role = 'You are an assistant that is good at analyzing clinical trial records. When you receive a statement that makes a claim about a trial or the relation between two trials, you will read the provided information for the trials and determine whether the information support or contradict the statement.'

# The output files are written inside a `with` block so they are flushed and
# closed before any later cell reads them back for upload.
# NOTE(review): records with index i <= 1500 go to val.jsonl and only the
# remainder goes to train.jsonl - confirm this split direction is intended.
with open('val.jsonl', 'w', encoding='utf8') as val_file, open('train.jsonl', 'w', encoding='utf8') as train_file:
  for i,(key,value) in enumerate(train_data.items()):
    section = value['Section_id']
    primary = value['Primary_id']
    statement = value['Statement']
    label = value['Label']
    file_CT1 = f'data/CT json/{primary}.json'
    with open(file_CT1) as ct_file:
      CT1 = json.load(ct_file)
    if value['Type']=='Single':
      prompt = f'''
    Statement:
    {statement}
    Primary trial:
    {CT1[section]}
    '''
    else:
      secondary = value['Secondary_id']
      file_CT2 = f'data/CT json/{secondary}.json'
      with open(file_CT2) as ct_file:
        CT2 = json.load(ct_file)
      prompt = f'''
    Statement:
    {statement}
    Primary trial:
    {CT1[section]}
    Secondary trial:
    {CT2[section]}
    '''
    msgs = [{'role': 'system', 'content': role},{"role": "user", "content": prompt},{"role": "assistant", "content": label}]
    msg = {'messages':msgs}
    # Route the record to the validation or the training file by its index.
    out_file = val_file if i<=1500 else train_file
    out_file.write(json.dumps(msg))
    out_file.write('\n')


Upload training files and fine-tune the model on OpenAI playground

In [None]:
# Upload the validation and training JSONL files for fine-tuning.
# Each file is opened in a context manager so its handle is closed once the
# upload call returns.
with open("val.jsonl", "rb") as val_jsonl:
  val_upload = client.files.create(
    file=val_jsonl,
    purpose="fine-tune"
  )
with open("train.jsonl", "rb") as train_jsonl:
  train_upload = client.files.create(
    file=train_jsonl,
    purpose="fine-tune"
  )
# Display both upload results so neither returned file record is lost.
val_upload, train_upload

Finetuned model

In [None]:
def get_completion2(prompt, model="ft:gpt-3.5-turbo-0125:personal:full:9JB5gM4K"):
    """Send one prompt to the chat-completions endpoint and return the reply text.

    The request carries two messages: a system message whose content is the
    module-level `role` string, followed by `prompt` as the user message.
    Sampling temperature is fixed at 0.

    Args:
        prompt: Text sent as the user message.
        model: Model identifier passed to the API; defaults to the fine-tuned model.

    Returns:
        The message content of the first choice in the API response.
    """
    system_message = {'role': 'system', 'content': role}
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Evaluate Finetuned Model with CoT on dev dataset

In [None]:
# Evaluate the fine-tuned model with chain-of-thought (CoT) prompting on the dev set.
# For every statement a prompt is built from the relevant trial section(s), the
# CoT cues are appended, and the text before "Reason" in the model response is
# kept as the prediction.

# Context managers close every file handle as soon as its JSON has been parsed.
with open('data/dev.json') as dev_file:
  data = json.load(dev_file)
labels = []
preds = []
response = ''

for value in data.values():
  section = value['Section_id']
  primary = value['Primary_id']
  statement = value['Statement']
  label = value['Label']
  if value['Type']=='Single':
    file_CT = f'CT json/{primary}.json'
    with open(file_CT) as ct_file:
      CT = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the following file delimited by triple backticks.
      2 - The file contains {section_description[section]}
      3 - Read the statement delimited by triple backticks.
      4 - Determine whether the file supports or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

      Use the following format:
        <Contradiction or Entailment>
      Reason:
        Your reason here

      File:
      ```{CT[section]}```
      Statement:
      ```{statement}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the file.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]
  else:
    secondary = value['Secondary_id']
    file_CT1 = f'CT json/{primary}.json'
    with open(file_CT1) as ct_file:
      CT1 = json.load(ct_file)
    file_CT2 = f'CT json/{secondary}.json'
    with open(file_CT2) as ct_file:
      CT2 = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the two files delimited by triple backticks. They contain {section_description[section]}
      2 - Read the following statement: {statement}.
      3 - The statement is about the comparison between the primary trial and the secondary trial provided.
      4 - Determine whether the files support or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

    Use the following format:
        <Contradiction or Entailment>
    Reason：
      Your reason here

    Primary trial:
    ```{CT1[section]}```
    Secondary trial:
    ```{CT2[section]}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the files.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]

  # get_completion2 builds a fresh message list on every call, so responses to
  # partially extended prompts cannot influence later calls. Only the response
  # to the prompt carrying all CoT cues is used, so that prompt is built once
  # and sent in a single request instead of one request per cue.
  prompt_tmp = '\n'.join([prompt] + CoT_prompts)
  response = get_completion2(prompt_tmp)

  labels.append(label)
  # Keep only the verdict part that precedes the model's "Reason" section.
  preds.append(response.split("Reason")[0])

label_map = {'Contradiction': 0, 'Entailment': 1}
labels_new = [label_map[label] for label in labels]
# Any response that does not mention 'Contradiction' is counted as Entailment.
preds_new = [0 if 'Contradiction' in pr else 1 for pr in preds]

# labels=[0, 1] keeps the matrix 2x2 (matching the two display labels) even if
# one class never occurs in the gold labels or the predictions.
conf_matrix_cot = confusion_matrix(labels_new, preds_new, labels=[0, 1])
f_score = f1_score(labels_new, preds_new, zero_division='warn')
p_score = precision_score(labels_new, preds_new, zero_division='warn')
r_score = recall_score(labels_new, preds_new, zero_division='warn')

print('F1:{:f}'.format(f_score))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_cot, display_labels=class_labels)

disp.plot()
plt.show()

# Evaluate Final Model on gold_practice_test dataset
Evaluate faithfulness and consistency using the code provided by SemEval.

In [None]:
# Evaluate the fine-tuned model with chain-of-thought (CoT) prompting on the
# gold practice-test set. Besides labels and predictions, the statement keys
# are recorded in `preds_keys` so predictions can later be matched back to
# `gold_data` entries.

# Context managers close every file handle as soon as its JSON has been parsed.
with open('/content/drive/MyDrive/CSE635proj/gold_practice_test.json') as gold_practice_test:
  gold_data = json.load(gold_practice_test)

labels = []
preds = []
preds_keys = []
response = ''

for key, value in gold_data.items():
  section = value['Section_id']
  primary = value['Primary_id']
  statement = value['Statement']
  label = value['Label']
  if value['Type']=='Single':
    file_CT = f'CT json/{primary}.json'
    with open(file_CT) as ct_file:
      CT = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the following file delimited by triple backticks.
      2 - The file contains {section_description[section]}
      3 - Read the statement delimited by triple backticks.
      4 - Determine whether the file supports or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

      Use the following format:
        <Contradiction or Entailment>
      Reason:
        Your reason here

      File:
      ```{CT[section]}```
      Statement:
      ```{statement}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the file.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]
  else:
    secondary = value['Secondary_id']
    file_CT1 = f'CT json/{primary}.json'
    with open(file_CT1) as ct_file:
      CT1 = json.load(ct_file)
    file_CT2 = f'CT json/{secondary}.json'
    with open(file_CT2) as ct_file:
      CT2 = json.load(ct_file)
    prompt = f'''
    Perform the following actions:
      1 - Read the two files delimited by triple backticks. They contain {section_description[section]}
      2 - Read the following statement: {statement}.
      3 - The statement is about the comparison between the primary trial and the secondary trial provided.
      4 - Determine whether the files support or contradicts the statement. If support, your answer will be Entailment; if contradict, your answer will be Contradiction.

    Use the following format:
        <Contradiction or Entailment>
    Reason：
      Your reason here

    Primary trial:
    ```{CT1[section]}```
    Secondary trial:
    ```{CT2[section]}```
    '''
    CoT_prompts = [
        "Let's think step by step",
        "Think about if the statement contradicts or supports information from the files.",
        "Based on your analysis, determine whether the statement as 'Contradiction' or 'Entailment'"
    ]

  # get_completion2 builds a fresh message list on every call, so responses to
  # partially extended prompts cannot influence later calls. Only the response
  # to the prompt carrying all CoT cues is used, so that prompt is built once
  # and sent in a single request instead of one request per cue.
  prompt_tmp = '\n'.join([prompt] + CoT_prompts)
  response = get_completion2(prompt_tmp)

  labels.append(label)
  # Keep only the verdict part that precedes the model's "Reason" section.
  preds.append(response.split("Reason")[0])
  preds_keys.append(key)

label_map = {'Contradiction': 0, 'Entailment': 1}
labels_new = [label_map[label] for label in labels]
# Any response that does not mention 'Contradiction' is counted as Entailment.
preds_new = [0 if 'Contradiction' in pr else 1 for pr in preds]
preds_cleaned = ['Contradiction' if 'Contradiction' in pr else 'Entailment' for pr in preds]

# labels=[0, 1] keeps the matrix 2x2 (matching the two display labels) even if
# one class never occurs in the gold labels or the predictions.
conf_matrix_cot = confusion_matrix(labels_new, preds_new, labels=[0, 1])
f_score_macro = f1_score(labels_new, preds_new, average='macro', zero_division='warn')
p_score = precision_score(labels_new, preds_new, zero_division='warn')
r_score = recall_score(labels_new, preds_new, zero_division='warn')

print('F1 socre macro:{:f}'.format(f_score_macro))
print('precision_score:{:f}'.format(p_score))
print('recall_score:{:f}'.format(r_score))

class_labels = ["Contradiction", "Entailment"]
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_cot, display_labels=class_labels)
disp.plot()
plt.show()

In [None]:
def faithfulness(predictions, gold):
    """Return the fraction of predictions that differ from their referenced gold label.

    For each key in `predictions`, the second element of that entry's
    "Causal_type" list in `gold` is used as the key of another gold entry, and
    the prediction is compared with that entry's "Label".

    Args:
        predictions: Mapping of key -> {"Prediction": label}.
        gold: Mapping of key -> gold record with "Label" and "Causal_type".

    Returns:
        Number of differing predictions divided by the number of predictions.

    Raises:
        ZeroDivisionError: If `predictions` is empty.
    """
    differing = 0
    for uuid, entry in predictions.items():
        referenced_uuid = gold[uuid]["Causal_type"][1]
        if entry["Prediction"] != gold[referenced_uuid]["Label"]:
            differing += 1
    return differing / len(predictions)


def consistency(predictions, gold):
    """Return the fraction of predictions that equal the gold label of the same key.

    Args:
        predictions: Mapping of key -> {"Prediction": label}.
        gold: Mapping of key -> gold record containing "Label".

    Returns:
        Number of matching predictions divided by the number of predictions.

    Raises:
        ZeroDivisionError: If `predictions` is empty.
    """
    matching = 0
    for uuid, entry in predictions.items():
        if entry["Prediction"] == gold[uuid]["Label"]:
            matching += 1
    return matching / len(predictions)

def extract_by_causal_type(predictions, gold):
    """Split predictions by the first element of their gold "Causal_type".

    Keys whose gold record has no "Causal_type", or whose causal type is
    neither "Preserving" nor "Altering", are left out of both results.

    Args:
        predictions: Mapping of key -> prediction record.
        gold: Mapping of key -> gold record.

    Returns:
        A (preserving, altering) pair of dicts holding the matching predictions.
    """
    preserving, altering = {}, {}
    for uuid, entry in predictions.items():
        record = gold[uuid]
        if "Causal_type" in record:
            if record["Causal_type"][0] == "Preserving":
                preserving[uuid] = entry
            elif record["Causal_type"][0] == "Altering":
                altering[uuid] = entry
    return preserving, altering

def extract_contrast_set(predictions, gold):
    """Return the predictions whose gold record contains a "Causal_type" key.

    Args:
        predictions: Mapping of key -> prediction record.
        gold: Mapping of key -> gold record.

    Returns:
        A new dict with the selected predictions, in `predictions` order.
    """
    return {
        uuid: entry
        for uuid, entry in predictions.items()
        if "Causal_type" in gold[uuid]
    }

def extract_by_intervention(predictions, gold):
    """Group predictions by the "Intervention" value of their gold record.

    Keys whose gold record has no "Intervention", or whose intervention is not
    one of the five names handled below, are dropped.

    Args:
        predictions: Mapping of key -> prediction record.
        gold: Mapping of key -> gold record.

    Returns:
        A 5-tuple of dicts, in this order: "Paraphrase", "Contradiction",
        "Numerical_paraphrase", "Numerical_contradiction", "Text_appended".
    """
    # Insertion order fixes the order of the returned tuple.
    groups = {
        "Paraphrase": {},
        "Contradiction": {},
        "Numerical_paraphrase": {},
        "Numerical_contradiction": {},
        "Text_appended": {},
    }
    for uuid, entry in predictions.items():
        record = gold[uuid]
        if "Intervention" not in record:
            continue
        intervention = record["Intervention"]
        for name, group in groups.items():
            if intervention == name:
                group[uuid] = entry
                break
    return tuple(groups.values())

def F1_Recall_Precision(predictions, gold):
    """Compute F1, recall and precision with "Entailment" as the positive class.

    Predictions and gold labels are binarised: "Entailment" becomes 1 and
    anything else becomes 0.

    Args:
        predictions: Mapping of key -> {"Prediction": label}.
        gold: Mapping of key -> gold record containing "Label".

    Returns:
        A (F1, Recall, Precision) tuple, in that order.
    """
    pred_labels = [
        1 if predictions[key]["Prediction"] == "Entailment" else 0
        for key in predictions
    ]
    gold_labels = [
        1 if gold[key]["Label"] == "Entailment" else 0
        for key in predictions
    ]
    F1 = f1_score(gold_labels, pred_labels)
    # Each name is bound to the metric it denotes: recall from recall_score,
    # precision from precision_score.
    Recall = recall_score(gold_labels, pred_labels)
    Precision = precision_score(gold_labels, pred_labels)
    return F1, Recall, Precision

def extract_control_set(predictions, gold):
    """Return the predictions for every gold key that has no "Causal_type".

    Iteration is over `gold`, so each selected key must also be present in
    `predictions`.

    Args:
        predictions: Mapping of key -> prediction record.
        gold: Mapping of key -> gold record.

    Returns:
        A new dict with the selected predictions, in `gold` order.

    Raises:
        KeyError: If a selected gold key is missing from `predictions`.
    """
    return {
        uuid: predictions[uuid]
        for uuid, record in gold.items()
        if "Causal_type" not in record
    }

In [None]:
# Wrap each cleaned prediction in the {"Prediction": ...} record the scoring helpers read.
predictions = {
  uuid: {"Prediction": verdict}
  for uuid, verdict in zip(preds_keys, preds_cleaned)
}

# Control Test Set F1, Recall, Precision PUBLIC
control_predictions = extract_control_set(predictions, gold_data)
Control_F1, Control_Rec, Control_Prec = F1_Recall_Precision(control_predictions, gold_data)

# Contrast Consistency & Faithfullness PUBLIC
contrast_predictions = extract_contrast_set(predictions, gold_data)
predictions_preserving, predictions_altering = extract_by_causal_type(contrast_predictions, gold_data)
Faithfulness = faithfulness(predictions_altering, gold_data)
Consistency = consistency(predictions_preserving, gold_data)

# Intervention-wise Consistency & Faithfullness HIDDEN
(
  para_predictions,
  cont_predictions,
  numerical_para_predictions,
  numerical_cont_predictions,
  definitions_predictions,
) = extract_by_intervention(predictions, gold_data)

# Paraphrase, numerical-paraphrase and definition sets only use their preserving part.
para_preserving = extract_by_causal_type(para_predictions, gold_data)[0]
cont_preserving, cont_altering = extract_by_causal_type(cont_predictions, gold_data)
numerical_para_preserving = extract_by_causal_type(numerical_para_predictions, gold_data)[0]
numerical_cont_preserving, numerical_cont_altering = extract_by_causal_type(numerical_cont_predictions, gold_data)
definitions_preserving = extract_by_causal_type(definitions_predictions, gold_data)[0]

para_Consistency = consistency(para_preserving, gold_data)
cont_Faithfulness = faithfulness(cont_altering, gold_data)
cont_Consistency = consistency(cont_preserving, gold_data)
numerical_para_Consistency = consistency(numerical_para_preserving, gold_data)
numerical_cont_Faithfulness = faithfulness(numerical_cont_altering, gold_data)
numerical_cont_Consistency = consistency(numerical_cont_preserving, gold_data)
definitions_Consistency = consistency(definitions_preserving, gold_data)

# Intervention-wise F1, Recall, Precision HIDDEN
Contrast_F1, Contrast_Rec, Contrast_Prec = F1_Recall_Precision(contrast_predictions, gold_data)
para_F1, para_Rec, para_Prec = F1_Recall_Precision(para_predictions, gold_data)
cont_F1, cont_Rec, cont_Prec = F1_Recall_Precision(cont_predictions, gold_data)
numerical_para_F1, numerical_para_Rec, numerical_para_Prec = F1_Recall_Precision(numerical_para_predictions, gold_data)
numerical_cont_F1, numerical_cont_Rec, numerical_cont_Prec = F1_Recall_Precision(numerical_cont_predictions, gold_data)
definitions_F1, definitions_Rec, definitions_Prec = F1_Recall_Precision(definitions_predictions, gold_data)

# Output results: one "<label> <value>" line per score, in the order listed here.
score_report = [
  ('Control_F1: ', Control_F1),
  ('Control_Recall: ', Control_Rec),
  ('Control_Precision: ', Control_Prec),
  ('Contrast_F1: ', Contrast_F1),
  ('Contrast_Recall: ', Contrast_Rec),
  ('Contrast_Precision: ', Contrast_Prec),
  ('Faithfulness: ', Faithfulness),
  ('Consistency: ', Consistency),
  ('Para_Consistency: ', para_Consistency),
  ('Cont_Faithfulness: ', cont_Faithfulness),
  ('Cont_Consistency: ', cont_Consistency),
  ('Numerical_Para_Consistency: ', numerical_para_Consistency),
  ('Numerical_Cont_Faithfulness: ', numerical_cont_Faithfulness),
  ('Numerical_Cont_Consistency: ', numerical_cont_Consistency),
  ('Definitions_Consistency: ', definitions_Consistency),
  ('Para_F1: ', para_F1),
  ('Para_Recall: ', para_Rec),
  ('Para_Precision: ', para_Prec),
  ('Cont_F1: ', cont_F1),
  ('Cont_Recall: ', cont_Rec),
  ('Cont_Precision: ', cont_Prec),
  ('Numerical_Para_F1: ', numerical_para_F1),
  ('Numerical_Para_Recall: ', numerical_para_Rec),
  ('Numerical_Para_Precision: ', numerical_para_Prec),
  ('Numerical_Cont_F1: ', numerical_cont_F1),
  ('Numerical_Cont_Recall: ', numerical_cont_Rec),
  ('Numerical_Cont_Precision: ', numerical_cont_Prec),
  ('Definitions_F1: ', definitions_F1),
  ('Definitions_Recall: ', definitions_Rec),
  ('Definitions_Precision: ', definitions_Prec),
]
with open('scores.txt', 'w') as f:
  for score_name, score_value in score_report:
    print(score_name, score_value, file=f)