In [1]:
import pandas as pd

In [2]:
pub = pd.read_csv("D:/Master Thesis/PUBHEALTH/PUBHEALTH/train.tsv", sep='\t', header=0)

In [3]:
pub.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,15661,"""The money the Clinton Foundation took from fr...","April 26, 2015","""Gingrich said the Clinton Foundation """"took m...",Katie Sanders,"""Hillary Clinton is in the political crosshair...",https://www.wsj.com/articles/clinton-foundatio...,false,"Foreign Policy, PunditFact, Newt Gingrich,"
1,9893,Annual Mammograms May Have More False-Positives,"October 18, 2011",This article reports on the results of a study...,,While the financial costs of screening mammogr...,,mixture,"Screening,WebMD,women's health"
2,11358,SBRT Offers Prostate Cancer Patients High Canc...,"September 28, 2016",This news release describes five-year outcomes...,"Mary Chris Jaklevic,Steven J. Atlas, MD, MPH,K...",The news release quotes lead researcher Robert...,https://www.healthnewsreview.org/wp-content/up...,mixture,"Association/Society news release,Cancer"
3,10166,"Study: Vaccine for Breast, Ovarian Cancer Has ...","November 8, 2011","While the story does many things well, the ove...",,"The story does discuss costs, but the framing ...",http://clinicaltrials.gov/ct2/results?term=can...,true,"Cancer,WebMD,women's health"
4,11276,Some appendicitis cases may not require ’emerg...,"September 20, 2010",We really don’t understand why only a handful ...,,"""Although the story didn’t cite the cost of ap...",,true,


In [None]:
pub.describe()

In [None]:
claims = pub['claim']

In [62]:
claims

['money clinton foundation took foreign governments hillary clinton secretary state clearly illegal … constitution says can’t take stuff',
 'annual mammograms may falsepositives',
 'sbrt offers prostate cancer patients high cancer control low toxicity fewer treatments',
 'study vaccine breast ovarian cancer potential',
 'appendicitis cases may require ’emergency’ surgery',
 'britain reveal trial criteria coronavirus antibody tests',
 'angioplasty wrist backed new study',
 'us says results encouraging healthcare delivery reforms',
 'latest trial jj talc litigations gets way california',
 'poor test results heart drugs',
 'opossums kill thousands ticks week inhibiting spread lyme disease humans',
 'democrats hoping flip house trashtalking trump',
 'hoodies riddled faux bullet holes bearing names schools involved massacres including columbine sandy hook marjory stoneman douglas available purchase',
 'cancer activist sounds alarm early testing genetic marker',
 'end 2016 23 percent fewer f

## Preprocessing Claims

In [None]:
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        text = str(text)  # Convert non-strings to strings
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the text column
pub['cleaned_text'] = pub['claim'].apply(remove_punctuation)

In [None]:
# Convert text to lowercase
pub['cleaned_text'] = pub['cleaned_text'].str.lower()

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stop words
pub['cleaned_text'] = pub['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

## Generate Embeddings for Claims

In [None]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

In [None]:
claims = pub['cleaned_text'].to_list()

In [None]:
batch_encoded_input = tokenizer.batch_encode_plus(claims, padding=True, return_tensors='pt')

In [None]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")  
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cpu()


bs = 128 # batch size during inference
def get_embeddings(text):
    embeddings = []
    for i in tqdm(np.arange(0, len(text), bs)):
        toks = tokenizer.batch_encode_plus(text[i:i+bs], 
                                           padding="max_length", 
                                           max_length=25, 
                                           truncation=True,
                                           return_tensors="pt")
        toks_cuda = {}
        for k,v in toks.items():
            toks_cuda[k] = v.cpu()
        cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
        embeddings.append(cls_rep.cpu().detach().numpy())
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

In [None]:
claim_embeddings = get_embeddings(claims)

## Preprocessing Documents and Generate Embeddings

##### Load evidence documents

In [None]:
# Import Module 
import os 

# Folder Path 
path = "D:/Master Thesis/PUBHEALTH_EVIDENCE_DOCS/"

# Change the directory 
os.chdir(path) 

def read_text_file(file_path): 
    with open(file_path, 'r', encoding="utf8") as f:
        doc = f.read()
    return doc
        

In [None]:
# iterate through all file 
evidence_documents = []
for num,file in enumerate(os.listdir()):
    if file.endswith(".txt"):
        file_path = f"{path}/{file}"
        evidence_documents.append(read_text_file(file_path))
        

In [None]:
evidence_documents

In [None]:
preprocessed_documents = [remove_punctuation(doc) for doc in evidence_documents]

In [None]:
preprocessed_documents = [doc.lower() for doc in preprocessed_documents]

In [None]:
preprocessed_documents = [doc.replace('\n', ' ') for doc in preprocessed_documents]

In [None]:
preprocessed_documents = [doc.replace('\t', ' ') for doc in preprocessed_documents]

In [None]:
# Remove stopwords from each document in the list
preprocessed_documents = [' '.join([word for word in doc.split() if word not in stop_words]) 
                          for doc in preprocessed_documents]


In [41]:
document_embeddings = get_embeddings(preprocessed_documents)

  0%|          | 0/33 [00:00<?, ?it/s]

## Indexing , Saving and Querying Documents

In [42]:
import faiss
import numpy as np

embedding_matrix = np.vstack(document_embeddings)  # Assuming embeddings is a list of numpy arrays

d = embedding_matrix.shape[1]  # Dimension of embeddings

# Create a FAISS index
index = faiss.IndexFlatL2(d)

In [43]:
index.add(embedding_matrix)

In [44]:
faiss.write_index(index, "D:/Master Thesis/src/documents.index")

In [45]:
index = faiss.read_index("D:/Master Thesis/src/documents.index")

In [46]:
def search(query, k=10):
    query_embedding = get_embeddings([query])
    scores, indices = index.search(query_embedding, k)
    return scores, indices

In [57]:
scores, indices = search(claims[0])

  0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
print(scores)

[[131.43166 161.05968 165.43791 165.79529 166.12045 166.12045 169.04494
  170.94376 173.53055 173.59906]]


In [59]:
print(indices)

[[ 711 1655 1426 2754 2963 2964 2905 2181 3244  695]]


In [60]:
nearest_documents = [evidence_documents[i] for i in indices[0]]

In [61]:
for score, doc in zip(scores[0], nearest_documents):
    print(f"Score: {score}, Document: {doc}")

Score: 131.43165588378906, Document: Congress had both good practical reason and Constitutional authority to enact PLCAA.
 Its purpose and effect was to call a halt to the campaign (backed by the administration of Bill Clinton, Hillary’s husband) to launch financially ruinous litigation against firearms makers and dealers — most of them thinly capitalized firms unable to withstand massive legal bills — and apply the resulting leverage to extract promises of gun control without the bother of seeking approval for those measures from a then‐​skeptical U.S. Congress.
 It was a campaign rightly decried as undemocratic even by such figures of the Left as former cabinet secretary Robert Reich.
 It was also a travesty of legal ethics, employing litigation as a pure weapon; thus then‐​HUD secretary Andrew Cuomo warned gunmakers that unless they cooperated they’d suffer “death by a thousand cuts”, while then‐​New York Attorney General Eliot Spitzer reportedly warned Glock: “If you do not sign, y