In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import pickle
import json
import numpy as np
import jsonlines

In [None]:
# Input locations, given relative to the notebook's working directory.
# Both files are read line by line as JSON records by the cells below.
corpus_file = "SciFact/corpus.jsonl"
claim_file = "SciFact/claims_train.jsonl"

In [None]:
# Load the abstract corpus into a dict keyed by the stringified `doc_id`.
corpus = {}
# Decode explicitly as UTF-8: the default encoding of open() depends on the
# platform locale, so non-ASCII text could otherwise fail to decode.
with open(corpus_file, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank / trailing empty lines in the JSONL file.
            continue
        abstract = json.loads(line)
        corpus[str(abstract["doc_id"])] = abstract

In [None]:
# Read the claims file (one JSON object per line) and index the claims by id.
claims = []
# Decode explicitly as UTF-8: the default encoding of open() depends on the
# platform locale.
with open(claim_file, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank / trailing empty lines in the JSONL file.
            continue
        claim = json.loads(line)
        claims.append(claim)
# The values are the same dict objects held in `claims`, not copies.
claims_by_id = {claim['id']: claim for claim in claims}

In [None]:
# Flatten every document into a single string (title followed by its abstract
# sentences). `corpus_ids` and `corpus_texts` stay aligned by position because
# both follow the dict's iteration order.
corpus_ids = np.array(list(corpus.keys()))
corpus_texts = [
    " ".join([doc['title']] + doc['abstract'])
    for doc in corpus.values()
]

# Unigram + bigram TF-IDF with English stop words removed, fitted on the corpus.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
corpus_vectors = vectorizer.fit_transform(corpus_texts)

In [None]:
# Project the claim texts into the TF-IDF space that was fitted on the corpus.
claim_texts = [entry['claim'] for entry in claims]
claim_vectors = vectorizer.transform(claim_texts)

In [None]:
# Score every (document, claim) pair: rows follow `corpus_vectors`, columns
# follow `claim_vectors`.
# Use the sparse `.dot` method rather than np.dot: NumPy's dot is not aware of
# SciPy sparse containers and only happens to work when `*` means matrix
# product (true for spmatrix, not for the newer sparse arrays).
# `.todense()` is kept so the result type is the same as before.
similarity_matrix = corpus_vectors.dot(claim_vectors.T).todense()

In [None]:
# Sanity check: one row per corpus document, one column per claim.
similarity_matrix.shape

In [None]:
# Number of documents to keep per claim.
k = 100
# Work on a plain ndarray: `similarity_matrix` may be an np.matrix, whose
# slices stay 2-D and previously forced a `.squeeze()` that collapsed a
# single retrieved id into a 0-d array (so `.tolist()` gave a bare string).
orders = np.argsort(np.asarray(similarity_matrix), axis=0)
# argsort is ascending, so reverse the rows to rank best-first and keep k rows.
top_k_rows = orders[::-1][:k]
# Map each claim id to a 1-D array of its retrieved corpus ids, best first.
retrieved_corpus = {
    claim["id"]: corpus_ids[top_k_rows[:, i]]
    for i, claim in enumerate(claims)
}

In [None]:
# Write every claim, in ascending id order, together with its retrieved doc ids.
with jsonlines.open("SciFact/claims_train_retrieved_tfidf.jsonl", 'w') as output:
    claim_ids = sorted(claims_by_id)
    # `claim_id` rather than `id`: a loop variable named `id` would shadow the
    # builtin for the rest of the notebook session.
    for claim_id in claim_ids:
        record = claims_by_id[claim_id]
        # atleast_1d guarantees a list even if a single retrieved id was
        # collapsed to a 0-d array upstream.
        # NOTE: this adds the key to the claim dict in place.
        record["retrieved_doc_ids"] = np.atleast_1d(retrieved_corpus[claim_id]).tolist()
        output.write(record)

In [None]:
# Switch to the dev split; the next cell re-reads `claims` from this path.
claim_file = "SciFact/claims_dev.jsonl"

In [None]:
# Read the claims file (one JSON object per line) and index the claims by id.
claims = []
# Decode explicitly as UTF-8: the default encoding of open() depends on the
# platform locale.
with open(claim_file, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank / trailing empty lines in the JSONL file.
            continue
        claim = json.loads(line)
        claims.append(claim)
# The values are the same dict objects held in `claims`, not copies.
claims_by_id = {claim['id']: claim for claim in claims}

In [None]:
# Project the claim texts into the TF-IDF space that was fitted on the corpus.
claim_texts = [entry['claim'] for entry in claims]
claim_vectors = vectorizer.transform(claim_texts)

In [None]:
# Score every (document, claim) pair: rows follow `corpus_vectors`, columns
# follow `claim_vectors`.
# Use the sparse `.dot` method rather than np.dot: NumPy's dot is not aware of
# SciPy sparse containers and only happens to work when `*` means matrix
# product (true for spmatrix, not for the newer sparse arrays).
# `.todense()` is kept so the result type is the same as before.
similarity_matrix = corpus_vectors.dot(claim_vectors.T).todense()

In [None]:
# Sanity check: one row per corpus document, one column per claim.
similarity_matrix.shape

In [None]:
# Number of documents to keep per claim.
k = 100
# Work on a plain ndarray: `similarity_matrix` may be an np.matrix, whose
# slices stay 2-D and previously forced a `.squeeze()` that collapsed a
# single retrieved id into a 0-d array (so `.tolist()` gave a bare string).
orders = np.argsort(np.asarray(similarity_matrix), axis=0)
# argsort is ascending, so reverse the rows to rank best-first and keep k rows.
top_k_rows = orders[::-1][:k]
# Map each claim id to a 1-D array of its retrieved corpus ids, best first.
retrieved_corpus = {
    claim["id"]: corpus_ids[top_k_rows[:, i]]
    for i, claim in enumerate(claims)
}

In [None]:
# Write every claim, in ascending id order, together with its retrieved doc ids.
with jsonlines.open("SciFact/claims_dev_retrieved_tfidf.jsonl", 'w') as output:
    claim_ids = sorted(claims_by_id)
    # `claim_id` rather than `id`: a loop variable named `id` would shadow the
    # builtin for the rest of the notebook session.
    for claim_id in claim_ids:
        record = claims_by_id[claim_id]
        # atleast_1d guarantees a list even if a single retrieved id was
        # collapsed to a 0-d array upstream.
        # NOTE: this adds the key to the claim dict in place.
        record["retrieved_doc_ids"] = np.atleast_1d(retrieved_corpus[claim_id]).tolist()
        output.write(record)

In [None]:
# Precision of the retrieval: the fraction of retrieved abstracts that appear
# among the claim's gold evidence documents, pooled over all claims.
TP = 0
FP = 0

for claim in claims:
    gold_ids = set(claim["evidence"].keys())
    # `retrieved_corpus` is a dict keyed by claim id. Iterating it directly
    # (as zip(retrieved_corpus, claims) did) yields the claim ids, not the
    # retrieved documents, so look the documents up explicitly.
    # atleast_1d + tolist gives a plain list even for a 0-d result.
    abstract_ids = np.atleast_1d(retrieved_corpus[claim["id"]]).tolist()
    for abstract_id in abstract_ids:
        if abstract_id in gold_ids:
            TP += 1
        else:
            FP += 1
# Guard against an empty retrieval (nothing retrieved at all).
precision = TP / (TP + FP) if (TP + FP) else 0.0
print("Precision:", precision)

In [None]:
# Recall of the retrieval: the fraction of gold evidence documents that appear
# among the abstracts retrieved for their claim, pooled over all claims.
TP = 0
FN = 0

for claim in claims:
    # `retrieved_corpus` is a dict keyed by claim id. Iterating it directly
    # (as zip(retrieved_corpus, claims) did) yields the claim ids, not the
    # retrieved documents, so look the documents up explicitly.
    # A set makes each membership test O(1).
    predicted_ids = set(np.atleast_1d(retrieved_corpus[claim["id"]]).tolist())
    for abstract_id in claim["evidence"].keys():
        if abstract_id in predicted_ids:
            TP += 1
        else:
            FN += 1
# Guard against a split in which no claim has any gold evidence.
recall = TP / (TP + FN) if (TP + FN) else 0.0
print("Recall:", recall)

In [None]:
# Harmonic mean of precision and recall; defined as 0.0 when both are zero
# to avoid a ZeroDivisionError.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

In [None]:
# Display the F1 score as the cell's output.
f1