# Imports

In [None]:
import json
import re
import random
import time
import openai
import os

# Load data

In [None]:
# Build a lookup from document ID to its full corpus record.
# The corpus file holds one JSON object per line; each record is keyed by its
# 'doc_id' field.
corpus = {}
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('../data/scifact/corpus.jsonl', encoding='utf-8') as f_corpus:
    for line in f_corpus:
        doc = json.loads(line)
        corpus[doc['doc_id']] = doc
print("Corpus parsed.")

In [None]:
# Collect every claim from the train and dev sets that has evidence attached.
# NOTE: despite its name, claim_id_to_claim is a list of claim records, not a
# dict; the name is kept because later cells refer to it.
claim_id_to_claim = []
for claims_path in ('../data/scifact/claims_train.jsonl',
                    '../data/scifact/claims_dev.jsonl'):
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(claims_path, encoding='utf-8') as f_claims:
        for line in f_claims:
            claim_dict = json.loads(line)
            # Claims whose 'evidence' value is empty are skipped.
            if claim_dict['evidence']:
                claim_id_to_claim.append(claim_dict)

print("Claims parsed.")

In [None]:
# Read the citance file once and build two structures from it:
#   citances           - list of (citance text, claims) tuples
#   citance_to_id_dict - citance text -> the record's 's2orc_id'
# Bracketed numeric citation markers such as " [12]" are removed from the
# citance text before it is stored or used as a key.
citation_marker = re.compile(r' \[\d+\]')
citances = []
citance_to_id_dict = {}
with open('../data/scifact/claims_with_citances.jsonl', encoding='utf-8') as f_citances:
    for line in f_citances:
        citance_dict = json.loads(line)
        citance_text = citation_marker.sub('', citance_dict['citance'])
        citances.append((citance_text, citance_dict['claims']))
        citance_to_id_dict[citance_text] = citance_dict['s2orc_id']
print("Citances parsed.")
print('Number of Citances:', len(citances))

# Map each normalized claim to its evidence. Normalization lowercases the claim
# and removes '.', '!', '?', ',' and all whitespace; the citance claims are
# normalized the same way later so the two can be matched by exact key.
claims_and_evidence = {}
for claim_record in claim_id_to_claim:
    claim_record['normalized_claim'] = re.sub(r'[.!?,\s]', '', claim_record['claim'].lower())
    claims_and_evidence[claim_record['normalized_claim']] = claim_record['evidence']

# Negation Generation and Validation


In [None]:
# Load the .env file with your API key (load_dotenv comes from the
# python-dotenv package; it was previously called without being imported).
from dotenv import load_dotenv

load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Generate one negation per citance with the completion model and keep it only
# if its word count is within 20% of the original citance's word count.
negations = {}
failures = 0
successes = 0
prompt = "Please negate this sentence by changing as few words as possible in the original sentence: "
# Each entry of citances is a (citance text, claims) tuple; only the text is needed here.
for citance_text, _claims in citances:
    # Pause before every request (presumably to stay under the API rate limit).
    time.sleep(5.5)
    # The citance is appended to the fixed instruction to form the prompt.
    # NOTE: text-davinci-003 has been deprecated.
    response = openai.Completion.create(model="text-davinci-003", prompt=prompt + citance_text, temperature=0, max_tokens=256)
    # Strip surrounding whitespace instead of blindly dropping the first two
    # characters, which would cut real text whenever the completion does not
    # start with exactly two newline characters.
    negation = response.choices[0].text.strip()

    # Reject negations whose length differs from the citance by more than 20%.
    negation_words = len(negation.split())
    citance_words = len(citance_text.split())
    if negation_words > citance_words * 1.2 or negation_words < citance_words * 0.8:
        failures += 1
        continue
    negations[citance_text] = negation
    successes += 1
print("Successes:", successes)
print("Failures: ", failures)

In [None]:
# Persist the citance -> negation mapping as JSON indented by four spaces.
with open('../data/negations/negations.json', 'w') as out_file:
    json.dump(negations, out_file, indent=4)

In [None]:
# Ask the chat model to score how well each generated negation negates its
# citance. Collected as (citance, negation, raw model reply) tuples; the reply
# is stored as returned, without being parsed into a number.
query = "Given two sentences, please evaluate the extent to which the second sentence is a negation of the first. Please provide a confidence score in the domain [0, 100], where a score of 0 means identical meaning and a score of 100 means a perfect negation. \nFirst sentence: {} \nSecond sentence {}"
results = []
failures = 0
for citance_text, negation in negations.items():
    time.sleep(1)
    message = [{"role": "user", "content": query.format(citance_text, negation)}]
    try:
        response = openai.ChatCompletion.create(model="gpt-4", messages=message, temperature=0.2)
    except Exception as exc:
        # Best effort: a failed request is counted and skipped. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        failures += 1
        print("Scoring request failed:", exc)
        continue
    score = response.choices[0].message.content
    results.append((citance_text, negation, score))

print("Failures: ", failures)

In [None]:
# Re-key the scored results by citance: citance -> [negation, score].
negation_score_dict = {
    citance_text: [negation, score]
    for citance_text, negation, score in results
}

# Write to file
with open('../data/negations/negations_with_scores.json', 'w') as out_file:
    out_file.write(json.dumps(negation_score_dict, indent=4))

# Citance to Claim to Evidence Matching

In [None]:
# Reload the saved citance -> negation mapping from disk, replacing whatever
# `negations` currently holds in memory.
with open('../data/negations/negations.json', encoding='utf-8') as f:
    negations = json.load(f)

In [None]:
# Setup of SUPPORTS and CONTRADICTS
#
# For every citance whose claim matches a known claim (by normalized text),
# emit two dataset entries per SUPPORT-labelled evidence item:
#   * the citance itself, labelled SUPPORT against the evidence document
#   * the generated negation of the citance, labelled CONTRADICT
# Entries are keyed by their claim text in `data`, so a later evidence item for
# the same citance overwrites the earlier one.
# NOTE(review): only the last matching document survives per citance; confirm
# that evidence is not meant to accumulate across documents.
data = {}
dataset_corpus = {}
lines = []          # not used in this cell; kept in case other cells rely on it
failed_keys = []
errors = 0
int_id = 0
counter = 0         # not used in this cell
citance_set = set()  # not used in this cell
# Each entry of citances is a (citance text, claims) tuple.
for citance_text, citance_claims in citances:
    for claim in citance_claims:
        # Same normalization as used when building claims_and_evidence.
        claim['normalized'] = re.sub(r'[.!?,\s]', '', claim['text'].lower())
        # Skip if there is no matching claim
        if claim['normalized'] not in claims_and_evidence:
            failed_keys.append(claim['normalized'])
            continue

        # For each associated claim, loop through all associated evidence
        for doc_id, evidence_items in claims_and_evidence[claim['normalized']].items():
            for evidence_item in evidence_items:
                # Only take SUPPORT labels
                if evidence_item['label'] != 'SUPPORT':
                    continue
                doc_key = int(doc_id)

                # Actual citance. Its id is allocated before the negation check
                # below, so a citance without a negation still consumes an id
                # (kept as-is so generated ids do not change).
                support = {
                    'claim': citance_text,
                    'id': int_id,
                    'evidence': {doc_key: [{'label': 'SUPPORT'}]},
                    'citance_id': citance_to_id_dict[citance_text],
                    'doc_ids': [doc_key],
                }
                int_id += 1

                # Negated citance: without a stored negation neither entry is
                # recorded. Explicit membership test instead of a broad
                # `except Exception`, which could hide unrelated errors.
                if citance_text not in negations:
                    errors += 1
                    continue
                negated_text = negations[citance_text]
                contradict = {
                    'claim': negated_text,
                    'id': int_id,
                    'evidence': {doc_key: [{'label': 'CONTRADICT'}]},
                    'citance_id': citance_to_id_dict[citance_text],
                    'doc_ids': [doc_key],
                }
                int_id += 1

                # Create new data entry for SUPPORT and CONTRADICT
                data[citance_text] = support
                data[negated_text] = contradict

                # Update corpus with necessary docs.
                # NOTE(review): 'title' is filled with the first element of the
                # abstract rather than a dedicated title field - confirm intended.
                dataset_corpus[doc_key] = {
                    'title': corpus[doc_key]['abstract'][0],
                    'abstract': corpus[doc_key]['abstract'],
                    'doc_id': doc_key
                }
print(errors)
print(len(failed_keys))
print(len(data.keys()))

# NEI Generation

In [None]:
# Rebuild claim_id_to_claim for NEI generation: keep only claims that cite at
# least one document for which they carry no evidence, and reduce each kept
# claim to those evidence-free documents.
claim_id_to_claim = []
# Same relative data directory as the earlier cells (the paths here previously
# pointed at the filesystem root, '/data/...').
for claims_path in ('../data/scifact/claims_train.jsonl',
                    '../data/scifact/claims_dev.jsonl'):
    with open(claims_path, encoding='utf-8') as f_claims:
        for line in f_claims:
            claim_dict = json.loads(line)
            # Evidence is keyed by the document id as a string, hence str().
            nei_ids = [
                cited_id for cited_id in claim_dict['cited_doc_ids']
                if str(cited_id) not in claim_dict['evidence']
            ]
            if not nei_ids:
                continue
            claim_dict['cited_doc_ids'] = nei_ids
            claim_dict['evidence'] = {}
            claim_id_to_claim.append(claim_dict)
print("Claims parsed.")

# Map each normalized claim (lowercased, with '.', '!', '?', ',' and whitespace
# removed) to its evidence-free cited document ids.
claims_and_doc_ids = {}
for claim_record in claim_id_to_claim:
    claim_record['normalized_claim'] = re.sub(r'[.!?,\s]', '', claim_record['claim'].lower())
    claims_and_doc_ids[claim_record['normalized_claim']] = claim_record['cited_doc_ids']

In [None]:
# Setup of NEI
#
# For every citance whose claim cites evidence-free documents, either create a
# new entry with empty evidence or, if the citance already has an entry in
# `data`, extend that entry's doc_ids with the additional documents.
# Relies on data, dataset_corpus and int_id built by the SUPPORT/CONTRADICT cell.
citances_to_id = {}  # not used in this cell
results = []         # not used in this cell
failed_keys = []     # never appended to here, so the count printed below is 0
errors = 0           # never incremented here, so 0 is printed below
num = 0              # running total of doc ids attached to new NEI entries
# Each entry of citances is a (citance text, claims) tuple.
for citance_text, citance_claims in citances:
    for claim in citance_claims:
        claim['normalized'] = re.sub(r'[.!?,\s]', '', claim['text'].lower())
        if claim['normalized'] not in claims_and_doc_ids:
            continue

        # Collect the distinct document ids and update corpus with necessary docs.
        ids = []
        for doc_id in claims_and_doc_ids[claim['normalized']]:
            doc_key = int(doc_id)
            if doc_key not in ids:
                ids.append(doc_key)
                dataset_corpus[doc_key] = {
                    'title': corpus[doc_key]['abstract'][0],
                    'abstract': corpus[doc_key]['abstract'],
                    'doc_id': doc_key
                }

        if citance_text not in data:
            # Add new entry for NEI
            data[citance_text] = {
                'claim': citance_text,
                'id': int_id,
                'evidence': {},
                'citance_id': citance_to_id_dict[citance_text],
                'doc_ids': ids,
            }
            num += len(ids)
            int_id += 1
        else:
            # Update existing entry: add only the doc ids it does not list yet.
            existing_ids = data[citance_text]['doc_ids']
            for doc_key in ids:
                if doc_key not in existing_ids:
                    existing_ids.append(doc_key)


print(errors)
print(len(failed_keys))

# Save dataset

In [None]:
# Shuffle all entries with a fixed seed, then cut them 70% / 15% / 15% into
# train, dev and test.
random.seed(5)
m = list(data.values())
random.shuffle(m)
split_1 = int(0.7 * len(m))
split_2 = int(0.85 * len(m))
train, dev, test = m[:split_1], m[split_1:split_2], m[split_2:]

# Every output is written as JSON Lines: one json.dumps() record per line.
outputs = (
    ('../data/scitance/train.jsonl', train),
    ('../data/scitance/dev.jsonl', dev),
    ('../data/scitance/test.jsonl', test),
    ('../data/scitance/corpus.jsonl', dataset_corpus.values()),
)
for out_path, records in outputs:
    with open(out_path, "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")