# Imports

In [None]:
import random
import time
import matplotlib.pyplot as plt
import numpy as np
import json
from statistics import mean
from dotenv import load_dotenv
import re
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import openai
import os

In [None]:
# Read the local .env file into the process environment, then pass the
# OpenAI key to the client library (None when the variable is not set).
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Functions

In [None]:
"""
Ouputs metrics for a series of predictions.

predictions : array of tuples [(actual_value, predicted_value)...]
"""
def table_metrics(predictions):
    # Create a list of classes
    classes = sorted(set([x[0] for x in predictions] + [x[1] for x in predictions]), reverse=True)
    class_mapping = {label: idx for idx, label in enumerate(classes)}

    actual_values = [class_mapping[item[0]] for item in predictions]
    predicted_values = [class_mapping[item[1]] for item in predictions]

    return (f1_score(actual_values, predicted_values, average='micro'),
            f1_score(actual_values, predicted_values, average='macro'),
            precision_score(actual_values, predicted_values, average='macro'),
            recall_score(actual_values, predicted_values, average='macro'))

In [None]:
# Perplexity calculations
! pip install tqdm
from tqdm import tqdm
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

def perplexity(arr):
    """
    Compute the gpt2-large perplexity of a list of texts.

    The texts are joined with blank lines, tokenized as a single sequence and
    scored window by window: each window advances by `stride` tokens and only
    the tokens new to that window contribute to the loss (earlier tokens are
    masked with -100 and serve as context).

    arr : list of strings

    Returns a 0-dim torch tensor: exp(summed negative log-likelihood divided
    by the number of tokens in the sequence).

    Raises ValueError when `arr` produces no tokens.
    """
    # Fall back to CPU so the function also runs on machines without a GPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_id = 'gpt2-large'
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
    encodings = tokenizer('\n\n'.join(arr), return_tensors='pt')

    max_length = model.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)
    if seq_len == 0:
        raise ValueError("perplexity() needs at least one token of text")

    nlls = []
    end_loc = 0
    for i in tqdm(range(0, seq_len, stride)):
        begin_loc = max(i + stride - max_length, 0)
        # The last window may hold fewer than `stride` new tokens
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the context tokens so only the new tokens are scored
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # outputs[0] is the loss for this window; weight it by the number
            # of tokens actually scored so windows can be summed
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    # Normalise by the total token count (previously divided by the start
    # index of the last window, which is 0 for inputs shorter than one stride)
    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    return ppl

# Load data

In [None]:
# Load the scored negations. Each key maps to a sequence whose first two
# entries are used here as (negation, score).
# NOTE(review): keys look like the original claims — confirm against the file.
with open('../data/negations/negations_with_scores_gpt4.json') as f:
    items = json.load(f)

results = [(claim, entry[0], entry[1]) for claim, entry in items.items()]
negation_to_claim = {negation: claim for claim, negation, _ in results}

# Negation set for processing: keep a negation when its score string is
# shorter than 4 characters and its integer value is at least 90
negations_set = {
    negation
    for _, negation, score in results
    if len(score) < 4 and int(score) >= 90
}

In [None]:
supports = []
negations = []
counter = 0
corpus = {}
nei = 'NOT_ENOUGH_INFO' # Use 'UNRELATED' for ablation experiment


def _load_split(path, nei_label, kept_negations, keep_id):
    """
    Parse one JSON-lines split file into labelled (claim, document) pairs.

    path           : path to the .jsonl split
    nei_label      : label for cited documents without an evidence entry
    kept_negations : claims allowed to keep a CONTRADICTS label; every other
                     CONTRADICTS pair is dropped
    keep_id        : when True each record also stores the document id as 'id'

    Returns (pairs, support_claims, contradict_claims, nei_count, dropped):
    pairs is a list of (doc_id, record) in file order, the two claim lists
    hold the claims of the SUPPORTS / CONTRADICTS records, nei_count is the
    number of pairs labelled nei_label and dropped the number of filtered
    CONTRADICTS pairs.
    """
    pairs = []
    support_claims = []
    contradict_claims = []
    nei_count = 0
    dropped = 0
    with open(path) as f_pdf:
        for line in f_pdf:
            parse = json.loads(line)
            claim = parse['claim']
            for doc_id in parse['doc_ids']:
                if str(doc_id) in parse['evidence']:
                    # The label of the first evidence entry, with 'S' appended,
                    # is compared against SUPPORTS / CONTRADICTS below
                    label = parse['evidence'][str(doc_id)][0]['label'] + 'S'
                    if label == "CONTRADICTS" and claim not in kept_negations: ## negation checker
                        dropped += 1
                        continue
                else:
                    label = nei_label
                    nei_count += 1
                record = {'claim': claim, 'label': label}
                if keep_id:
                    record['id'] = doc_id
                pairs.append((doc_id, record))
                if label == 'SUPPORTS':
                    support_claims.append(claim)
                if label == 'CONTRADICTS':
                    contradict_claims.append(claim)
    return pairs, support_claims, contradict_claims, nei_count, dropped


# Prepare the dataset to iterate through
with open('../data/scitance/corpus.jsonl') as f_pdf:
    for line in f_pdf:
        pdf_parse_dict = json.loads(line)
        corpus[pdf_parse_dict['doc_id']] = pdf_parse_dict
print("Corpus parsed.")

# Load train data
train_pairs, train_sup, train_con, nei_train, dropped = _load_split(
    '../data/scitance/train.jsonl', nei, negations_set, keep_id=False)
# Keyed by document id: a later pair citing the same document replaces the
# earlier record, while the counters below still count every pair
train = dict(train_pairs)
s_train = len(train_sup)
c_train = len(train_con)
supports.extend(train_sup)
negations.extend(train_con)
counter += dropped
print("\nTrain parsed.")
print('Supports: ', s_train)
print('Negations:', c_train)
print('NEI:\t  ', nei_train)

# Load dev data
dev_pairs, dev_sup, dev_con, nei_dev, dropped = _load_split(
    '../data/scitance/dev.jsonl', nei, negations_set, keep_id=True)
dev = [record for _, record in dev_pairs]
s_dev = len(dev_sup)
c_dev = len(dev_con)
supports.extend(dev_sup)
negations.extend(dev_con)
counter += dropped
print("\nDev parsed.")
print('Supports: ', s_dev)
print('Negations:', c_dev)
print('NEI:\t  ', nei_dev)

# Load test data
test_pairs, test_sup, test_con, nei_test, dropped = _load_split(
    '../data/scitance/test.jsonl', nei, negations_set, keep_id=True)
test = [record for _, record in test_pairs]
s_test = len(test_sup)
c_test = len(test_con)
supports.extend(test_sup)
negations.extend(test_con)
counter += dropped
print("\nTest parsed.")
print('Supports: ', s_test)
print('Negations:', c_test)
print('NEI:\t  ', nei_test)

print("\nDataset Total:")
print('Supports: ', len(supports))
print('Negations:', len(negations))
print("NEI:\t  ", str(nei_train + nei_dev + nei_test))
print("\n" + str(counter), "negations removed from filtering.")

# Create a dict for ease of random sampling in multi-shot ICL
train_supports = {key: rec for key, rec in train.items() if rec['label'] == 'SUPPORTS'}
train_contradicts = {key: rec for key, rec in train.items() if rec['label'] == 'CONTRADICTS'}
train_nei = {key: rec for key, rec in train.items() if rec['label'] == nei}

# Perplexity testing

In [None]:
# Load GPT-2 large once for the per-claim scoring cells below. Use the GPU
# when PyTorch can see one, otherwise fall back to CPU instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = 'gpt2-large'
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
from scipy import stats

In [None]:
# Score every supporting claim on its own with the loaded language model.
# The stored value is the loss the model returns when labels are supplied.
# NOTE(review): that loss is presumably the mean token negative
# log-likelihood (log-perplexity), not exp(loss) — confirm before reporting.
citance_perplexity = []
for claim_text in supports:
    token_ids = tokenizer(claim_text, return_tensors="pt").input_ids.to(device)

    # Labels are an independent copy of the inputs; no gradients are needed
    with torch.no_grad():
        claim_loss = model(token_ids, labels=token_ids.clone()).loss

    # Convert the 0-dim tensor to a plain Python float right away
    citance_perplexity.append(claim_loss.item())

In [None]:
# Score every negated claim on its own with the loaded language model.
# The stored value is the loss the model returns when labels are supplied.
# NOTE(review): that loss is presumably the mean token negative
# log-likelihood (log-perplexity), not exp(loss) — confirm before reporting.
negation_perplexity = []
for claim_text in negations:
    token_ids = tokenizer(claim_text, return_tensors="pt").input_ids.to(device)

    # Labels are an independent copy of the inputs; no gradients are needed
    with torch.no_grad():
        claim_loss = model(token_ids, labels=token_ids.clone()).loss

    # Convert the 0-dim tensor to a plain Python float right away
    negation_perplexity.append(claim_loss.item())

In [None]:
# Overlay both score distributions as semi-transparent 20-bin histograms
shared_style = {'bins': 20, 'alpha': 0.5, 'edgecolor': 'black'}
plt.hist(citance_perplexity, color='skyblue', label='Distribution 1', **shared_style)
plt.hist(negation_perplexity, color='salmon', label='Distribution 2', **shared_style)
plt.title('Citance and Negation Perplexity')
plt.xlabel('Perplexity Values')
plt.ylabel('Frequency')
# The names given here are the ones shown in the legend
plt.legend(('Citances', 'Negations'), loc='upper right')
plt.show()

In [None]:
# Five-number summary, one row per statistic: citances first, negations second
summary_rows = (
    ("Min:", np.min),
    ("Q1: ", lambda values: np.percentile(values, 25)),
    ("Med:", np.median),
    ("Q3: ", lambda values: np.percentile(values, 75)),
    ("Max:", np.max),
)
for row_label, statistic in summary_rows:
    print(row_label,
          round(statistic(citance_perplexity), 3),
          "\t",
          round(statistic(negation_perplexity), 3))

In [None]:
# Independent two-sample t-test between the citance and negation scores
# (scipy defaults). The test is run once and the result object reused.
ttest_result = stats.ttest_ind(citance_perplexity, negation_perplexity)
t_statistic, p_value = ttest_result
print("T-statistic:", t_statistic)
print("P-value:", p_value)
# Last expression of the cell: the notebook displays the confidence interval
ttest_result.confidence_interval()

# Run experiments

## 1. Zero-shot / claim only / no NEI

In [None]:
# Zero-shot, claim only: the prompt contains just the claim, so no abstract is
# looked up. NEI items are skipped. Five repetitions, one result file each.
query = "Given a claim, please determine whether the existing academic literature SUPPORTS or CONTRADICTS the claim (even if you cannot reference specific abstracts). Please return your answer as only the capitalized token, as well as an explanation or rationale for the answer. \n\tClaim: {}"

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        label = item['label']
        if label == nei:
            continue
        message = [{"role": "user", "content": query.format(item['claim'])}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        predicted = response.choices[0].message.content
        # Stored as (gold label, raw model response)
        results.append((label, predicted))
    obj = json.dumps(results, indent=4)
    with open(f'../data/results/zero_claim_only_no_nei/zero_claim_only_no_nei_{i}.json', 'w') as f:
        f.write(obj)

## 2. Zero-shot / claim and abstract / no NEI

In [None]:
# Zero-shot, claim plus cited abstract, NEI items skipped.
# Five repetitions, each written to its own result file.
query = "Please obey the following: With a specific abstract, please make an estimation whether the abstract SUPPORTS or CONTRADICTS the claim. You must choose SUPPORTS or CONTRADICTS. Please return your answer as only the capitalized token, as well as an explanation or rationale for the answer. \nAbstract: {}\n\tClaim: {}"

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        abstract = corpus[item['id']]['abstract']
        label = item['label']
        if label == nei:
            continue
        message = [{"role": "user", "content": query.format(abstract, item['claim'])}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        # Stored as (gold label, raw model response)
        results.append((label, response.choices[0].message.content))
    with open(f'../data/results/zero_with_abstract_no_nei/zero_with_abstract_no_nei_{i}.json', 'w') as f:
        f.write(json.dumps(results, indent=4))

## 3. Zero-shot / claim and abstract / with NEI

In [None]:
# Zero-shot, claim plus cited abstract, all three labels (no items skipped).
# Five repetitions, each written to its own result file.
query = "Please obey the following: With a specific abstract, please make an estimation whether the abstract SUPPORTS or CONTRADICTS the claim, or if there is NOT_ENOUGH_INFO to determine. You must choose SUPPORTS or CONTRADICTS or NOT_ENOUGH_INFO. Please return your answer as only the capitalized token(s), as well as an explanation or rationale for the answer. \nAbstract: {}\n\tClaim: {}"
# query = "Please obey the following: With a specific abstract, please make an estimation whether the abstract SUPPORTS, CONTRADICTS, or is UNRELATED to the claim. You must choose SUPPORTS or CONTRADICTS or UNRELATED. Please return your answer as only the capitalized token, as well as an explanation or rationale for the answer. \nAbstract: {}\n\tClaim: {}"

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        # Pause before every request
        time.sleep(2)
        abstract = corpus[item['id']]['abstract']
        label = item['label']
        message = [{"role": "user", "content": query.format(abstract, item['claim'])}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        # Stored as (gold label, raw model response)
        results.append((label, response.choices[0].message.content))
    with open(f'../data/results/zero_with_abstract_with_nei/zero_with_abstract_with_nei_{i}.json', 'w') as f:
        f.write(json.dumps(results, indent=4))

## 4. Few-shot / claim only / no NEI

In [None]:
# Few-shot, claim only, NEI items skipped. Each prompt shows one random
# SUPPORTS and one random CONTRADICTS training example (in shuffled order)
# followed by the claim to judge. Five repetitions, one result file each.
intro = "The following are examples of claims from a research paper and the corresponding abstract from the paper they are citing."
# Templates use their own names so the global `supports` claim list survives
supports_template =  "\nThis is an example of an abstract that SUPPORTS the claim: \n\tSupporting abstract: {} \n\tClaim: {}"
contradicts_template = "\nThis is an example of an abstract that CONTRADICTS the claim: \n\tContradicting abstract: {} \n\tClaim: {}"
# NOTE(review): unlike the other few-shot queries this one has no leading
# newline, so it follows the last example directly — left as-is to keep prompts unchanged.
query = "Please obey the following: With no specific abstracts, please make an estimation whether the existing academic literature (and not the abstracts above) SUPPORTS or CONTRADICTS the claim. You must choose SUPPORTS or CONTRADICTS. Please return your answer as only the capitalized token, as well as an explanation or rationale for the answer. \n\tClaim: {}"

# Candidate demonstrations, built once: the training dicts do not change here
support_pool = list(train_supports.items())
contradict_pool = list(train_contradicts.items())

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        label = item['label']
        if label == nei:
            continue
        # Throttle only the items that actually reach the API
        time.sleep(5.5)
        query_string = query.format(item['claim'])

        # Draw order (supports, contradicts, shuffle) fixes the seeded sequence
        k, v = random.choice(support_pool)
        supports_string = supports_template.format(corpus[k]['abstract'], v['claim'])

        k, v = random.choice(contradict_pool)
        contradicts_string = contradicts_template.format(corpus[k]['abstract'], v['claim'])

        temp = [supports_string, contradicts_string]
        random.shuffle(temp)
        prompt = intro + temp[0] + temp[1] + query_string
        message = [{"role": "user", "content": prompt}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        predicted = response.choices[0].message.content
        # Stored as (gold label, raw model response)
        results.append((label, predicted))
    obj = json.dumps(results, indent=4)
    with open(f'../data/results/few_claim_only_no_nei/few_claim_only_no_nei_{i}.json', 'w') as f:
        f.write(obj)

## 5. Few-shot / claim and abstract / no NEI

In [None]:
# Few-shot, claim plus cited abstract, NEI items skipped. Each prompt shows one
# random SUPPORTS and one random CONTRADICTS training example (in shuffled
# order) followed by the pair to judge. Five repetitions, one result file each.
intro = "The following are examples of claims from a research paper and the corresponding abstract from the paper they are citing."
# Templates use their own names so the global `supports` claim list survives
supports_template =  "\nThis is an example of an abstract that SUPPORTS the claim: \n\tSupporting abstract: {} \n\tClaim: {}"
contradicts_template = "\nThis is an example of an abstract that CONTRADICTS the claim: \n\tContradicting abstract: {} \n\tClaim: {}"
query = "\nPlease obey the following: given a new abstract and claim pair, please make an estimation whether the abstract SUPPORTS or CONTRADICTS the claim. You must choose SUPPORTS or CONTRADICTS. Please return your answer as the capitalized token, as well as an explanation or rationale for the answer. \n\tNew abstract: {} \n\tClaim: {}"

# Candidate demonstrations, built once: the training dicts do not change here
support_pool = list(train_supports.items())
contradict_pool = list(train_contradicts.items())

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        label = item['label']
        if label == nei:
            continue
        # Throttle only the items that actually reach the API
        time.sleep(5.5)
        abstract = corpus[item['id']]['abstract']
        query_string = query.format(abstract, item['claim'])

        # Draw order (supports, contradicts, shuffle) fixes the seeded sequence
        k, v = random.choice(support_pool)
        supports_string = supports_template.format(corpus[k]['abstract'], v['claim'])

        k, v = random.choice(contradict_pool)
        contradicts_string = contradicts_template.format(corpus[k]['abstract'], v['claim'])

        temp = [supports_string, contradicts_string]
        random.shuffle(temp)
        prompt = intro + temp[0] + temp[1] + query_string
        message = [{"role": "user", "content": prompt}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        predicted = response.choices[0].message.content
        # Stored as (gold label, raw model response)
        results.append((label, predicted))
    obj = json.dumps(results, indent=4)
    with open(f'../data/results/few_with_abstract_no_nei/few_with_abstract_no_nei_{i}.json', 'w') as f:
        f.write(obj)

## 6. Few-shot / claim and abstract / with NEI

In [None]:
# Few-shot, claim plus cited abstract, all three labels. Each prompt shows one
# random example per label (in shuffled order) followed by the pair to judge.
# Five repetitions, one result file each.
intro = "The following are examples of claims from a research paper and the corresponding abstract from the paper they are citing."
# Templates use their own names: assigning them to `supports` / `nei` would
# overwrite the global claim list and the global NEI label, which the metrics
# cell and the NEI skips in the other experiments compare against.
supports_template =  "\nThis is an example of an abstract that SUPPORTS the claim: \n\tSupporting abstract: {} \n\tClaim: {}"
contradicts_template = "\nThis is an example of an abstract that CONTRADICTS the claim: \n\tContradicting abstract: {} \n\tClaim: {}"
# unrelated = "\nThis is an example of an abstract that is UNRELATED to the claim: \n\tUnrelated abstract: {} \n\tClaim: {}"
# query = "\nPlease obey the following: given a new abstract and claim pair, please make an estimation whether the abstract SUPPORTS, CONTRADICTS, or is UNRELATED to the claim. You must choose SUPPORTS or CONTRADICTS or UNRELATED. Please return your answer as the capitalized token, as well as an explanation or rationale for the answer. \n\tNew abstract: {} \n\tClaim: {}"
nei_template = "\nThis is an example of an abstract with NOT_ENOUGH_INFO about the claim: \n\tMissing info abstract: {} \n\tClaim: {}"
query = "\nPlease obey the following: given a new abstract and claim pair, please make an estimation whether the abstract SUPPORTS or CONTRADICTS the claim, or if there is NOT_ENOUGH_INFO to determine. You must choose SUPPORTS or CONTRADICTS or NOT_ENOUGH_INFO. Please return your answer as the capitalized token(s), as well as an explanation or rationale for the answer. \n\tNew abstract: {} \n\tClaim: {}"

# Candidate demonstrations, built once: the training dicts do not change here
support_pool = list(train_supports.items())
contradict_pool = list(train_contradicts.items())
nei_pool = list(train_nei.items())

for i in range(5):
    random.seed(22)
    results = []
    for item in test:
        time.sleep(5.5)
        abstract = corpus[item['id']]['abstract']
        label = item['label']
        query_string = query.format(abstract, item['claim'])

        # Draw order (supports, contradicts, nei, shuffle) fixes the seeded sequence
        k, v = random.choice(support_pool)
        supports_string = supports_template.format(corpus[k]['abstract'], v['claim'])

        k, v = random.choice(contradict_pool)
        contradicts_string = contradicts_template.format(corpus[k]['abstract'], v['claim'])

        k, v = random.choice(nei_pool)
        unrelated_string = nei_template.format(corpus[k]['abstract'], v['claim'])

        temp = [supports_string, contradicts_string, unrelated_string]
        random.shuffle(temp)
        prompt = intro + temp[0] + temp[1] + temp[2] + query_string
        message = [{"role": "user", "content": prompt}]
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
        except Exception as e:
            # Best effort: report the failure and leave this item out of the results
            print(e)
            continue
        predicted = response.choices[0].message.content
        # Stored as (gold label, raw model response)
        results.append((label, predicted))
    obj = json.dumps(results, indent=4)
    with open(f'../data/results/nei_ablation/few_with_abstract_with_nei/few_with_abstract_with_nei_{i}.json', 'w') as f:
        f.write(obj)


# Calculate metrics

In [None]:
# Experiments to score. Each name is both the folder under ../data/results/
# and the file-name prefix of that experiment's per-run JSON files.
# NOTE(review): the experiment cells in this notebook write to '*_nei' and
# 'few_*' folders; confirm these '*_unrelated' / 'multi_*' names match the
# folders on disk.
exps = [
    "zero_claim_only_no_unrelated",
    "zero_with_abstract_no_unrelated",
    "zero_with_abstract_with_unrelated",
    "multi_claim_only_no_unrelated",
    "multi_with_abstract_no_unrelated",
    "multi_with_abstract_with_unrelated",
    "scifact_baseline",
]

In [None]:
# Calculate metrics for each experiment
def _match_label(response, labels):
    """
    Return the first entry of `labels` that appears among the first ten
    whitespace-separated words of `response` (punctuation removed from each
    word), or None when none of them appears.
    """
    words = [re.sub(r'[^\w\s]', '', word) for word in response.split()[:10]]
    for candidate in labels:
        if candidate in words:
            return candidate
    return None


results = {}
for exp in exps:
    # Two-class experiments are named either '*_no_nei' or '*_no_unrelated';
    # previously only 'no_nei' was checked, which none of the names in `exps` contain
    if 'no_nei' in exp or 'no_unrelated' in exp:
        labels = ['CONTRADICTS', 'SUPPORTS']
    else:
        labels = ['CONTRADICTS', 'SUPPORTS', nei]
    final_avgs = []
    abstentions = 0
    total = 0
    for i in range(5):
        with open('../data/results/' + exp + "/" + exp + f'_{i}.json') as f:
            items = json.load(f)
        data = []
        # Iterate over each response from the experiment; x is (gold label, response)
        for x in items:
            total += 1
            label = _match_label(x[1], labels)
            if label is None:
                # No allowed label near the start of the response
                abstentions += 1
                label = 'ABSTENTION'
            data.append((x[0], label))
        final_avgs.append(table_metrics(data))
    # Average each metric over the runs and add the experiment to the results dict
    temp = [mean([x[n] for x in final_avgs]) for n in range(len(final_avgs[0]))]
    temp.append((abstentions, total))
    results[exp] = temp

In [None]:
"""
Display results per experiment in the format:

Micro F1
Macro F1
Macro Precision
Macro Recall
(abstentions, total)
"""
results

# Baseline on SciFact test set

## Load data

In [None]:
path_to_scifact_corpus = '' # Download SciFact and add file path (corpus.jsonl)

# Add the SciFact documents to the existing corpus dict, keyed by doc_id
with open(path_to_scifact_corpus) as f_pdf:
    for line in f_pdf:
        pdf_parse_dict = json.loads(line)
        corpus[pdf_parse_dict['doc_id']] = pdf_parse_dict


path_to_scifact_predictions_topk = '' # Use BEIR to retrieve top-k predictions

# One test item per (claim, cited document) pair
test = []
with open(path_to_scifact_predictions_topk) as f_pdf:
    for line in f_pdf:
        parse = json.loads(line)
        for doc_id in parse['cited_doc_ids']:
            test.append({
                'claim': parse['claim'],
                'id': doc_id,
                'claim_id': parse['id']
            })

In [None]:
#### SCitance ICL - comment out if using SciFact ICL
# Build the pool of in-context examples: one record per (claim, cited document)
# pair, with the document id stored as a string.
train = []
with open('../data/dataset/train.jsonl') as f_pdf:
    for line in f_pdf:
        parse = json.loads(line)
        for doc_id in parse['doc_ids']:
            doc_key = str(doc_id)
            if doc_key in parse['evidence']:
                label = parse['evidence'][doc_key][0]['label'] + 'S'
                # negation checker: drop CONTRADICTS pairs whose claim is not in negations_set
                if label == "CONTRADICTS" and parse['claim'] not in negations_set:
                    continue
            else:
                # Cited document without an evidence entry
                label = "NOT_ENOUGH_INFO"
            train.append({
                'claim': parse['claim'],
                'label': label,
                'id': doc_key
            })

In [None]:
# #### SciFact ICL - comment out if using SCitance ICL
# train = []
# path_to_scifact_training_claims = '' # Download and add path to claims in SciFact's train set
# with open(path_to_scifact_training_claims) as f_pdf:
#     for line in f_pdf:
#         parse = json.loads(line)
#         # {'id': 0, 'claim': '0-dimensional biomaterials lack inductive properties.', 'evidence': {}, 'cited_doc_ids': [31715818]}
#         for id in parse['cited_doc_ids']:
#           if str(id) not in parse['evidence']:
#             train.append({
#                 'claim': parse['claim'],
#                 'label': "NOT_ENOUGH_INFO",
#                 'id': id
#             })
#           else:
#             train.append({
#                 'claim': parse['claim'],
#                 'label': parse['evidence'][str(id)][0]['label'] + "S",
#                 'id': id
#             })

In [None]:
# Split the in-context example pool by label so one example per class can be
# drawn at random when building prompts.
train_supports = [example for example in train if example['label'] == 'SUPPORTS']
train_contradicts = [example for example in train if example['label'] == 'CONTRADICTS']
train_nei = [example for example in train if example['label'] == "NOT_ENOUGH_INFO"]

## Run SciFact experiment

In [None]:
random.seed(22)

# Prompt pieces: an introduction, one demonstration template per label, and the
# final query that receives the abstract/claim pair to judge.
intro = "The following are examples of claims from a research paper and the corresponding abstract from the paper they are citing."
supports =  "\nThis is an example of an abstract that SUPPORTS the claim: \n\tSuppporting abstract: {} \n\tClaim: {}"
contradicts = "\nThis is an example of an abstract that CONTRADICTS the claim: \n\tContradicting abstract: {} \n\tClaim: {}"
unrelated = "\nThis is an example of an abstract with NOT_ENOUGH_INFO about the claim: \n\tMissing info abstract: {} \n\tClaim: {}"

# Zero shot
# query = "Please obey the following: given a new abstract and claim pair, please make an estimation whether the abstract SUPPORTS or CONTRADICTS the claim, or if there is NOT_ENOUGH_INFO to determine. You must choose SUPPORTS or CONTRADICTS or NOT_ENOUGH_INFO. Please return your answer with the capitalized token(s) at the beginning of the response. Also provide an explanation or rationale for the answer. \n\tAbstract: {} \n\tClaim: {}"

# Few shot
query = "\nPlease obey the following: With a specific abstract, please make an estimation whether the abstract SUPPORTS, CONTRADICTS, or if there is NOT_ENOUGH_INFO to determine. You must choose SUPPORTS or CONTRADICTS or NOT_ENOUGH_INFO. Please return your answer as only the capitalized token, as well as an explanation or rationale for the answer. \n\tAbstract: {} \n\tClaim: {}"

results = []
errors = 0

# (example pool, template) per label; this order is the order of the random draws
demo_sources = (
    (train_supports, supports),
    (train_contradicts, contradicts),
    (train_nei, unrelated),
)

for item in test:
    # Pause before every request
    time.sleep(5)
    doc_id = item['id']
    claim_id = item['claim_id']
    query_string = query.format(corpus[doc_id]['abstract'], item['claim'])

    # Few shot: one random demonstration per label, then shuffle their order
    temp = []
    for pool, template in demo_sources:
        example = random.choice(pool)
        example_abstract = corpus[int(example['id'])]['abstract']
        temp.append(template.format(example_abstract, example['claim']))
    random.shuffle(temp)
    prompt = intro + "".join(temp) + query_string

    # Zero shot
    # prompt = query_string
    message = [{"role": "user", "content": prompt}]
    try:
        response = openai.ChatCompletion.create(model="gpt-3.5-turbo-0125", messages=message, temperature=0.2)
    except Exception as e:
        # Best effort: report and count the failure, then move to the next item
        print(e)
        errors += 1
        continue

    # Stored as (claim id, document id, raw model response)
    results.append((claim_id, doc_id, response.choices[0].message.content))

with open('./data/results/scifact_test/our_icl_0_gpt3-5.json', "w") as f:
    f.write(json.dumps(results, indent=4))
print("Finished!")
print("OpenAI API Errors:", errors)