In [1]:
!pip install -U sentence-transformers rank_bm25



In [2]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


#We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     #Truncate long passages to 256 tokens
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        text = str(text)  # Convert non-strings to strings
    return text.translate(str.maketrans('', '', string.punctuation))

In [4]:
# Import Module 
import os 

# Folder Path 
path_to_zip_file = "/home/elson/corpus.zip"
directory_to_extract_to = "/home/elson/"
import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

def read_text_file(file_path): 
    with open(file_path, 'r', encoding="utf8") as f:
        doc = f.read()
    return doc
        

In [5]:
# iterate through all file 
evidence_documents = []
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
path = "/home/elson/corpus/"
os.chdir(path)
for num,efile in enumerate(os.listdir()):
    if efile.endswith(".txt"):
        print(efile)
        file_path = f"{path}/{efile}"
        evidence = read_text_file(file_path)
        sentences = sent_tokenize(evidence)
        for sentence in sentences:
            if (sentence not in evidence_documents) and len(sentence)>=40:
                evidence_documents.append(sentence)

[nltk_data] Downloading package punkt to /home/elson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


doc394.txt
doc227.txt
doc462.txt
doc586.txt
doc810.txt
doc803.txt
doc653.txt
doc743.txt
doc108.txt
doc292.txt
doc36.txt
doc483.txt
doc710.txt
doc333.txt
doc786.txt
doc795.txt
doc471.txt
doc851.txt
doc645.txt
doc428.txt
doc47.txt
doc607.txt
doc775.txt
doc111.txt
doc530.txt
doc286.txt
doc250.txt
doc417.txt
doc633.txt
doc593.txt
doc764.txt
doc615.txt
doc748.txt
doc81.txt
doc666.txt
doc700.txt
doc797.txt
doc489.txt
doc425.txt
doc295.txt
doc598.txt
doc162.txt
doc113.txt
doc466.txt
doc97.txt
doc552.txt
doc207.txt
doc468.txt
doc19.txt
doc16.txt
doc711.txt
doc304.txt
doc750.txt
doc852.txt
doc33.txt
doc848.txt
doc737.txt
doc727.txt
doc551.txt
doc395.txt
doc385.txt
doc323.txt
doc229.txt
doc777.txt
doc778.txt
doc267.txt
doc729.txt
doc414.txt
doc218.txt
doc836.txt
doc649.txt
doc224.txt
doc453.txt
doc704.txt
doc345.txt
doc278.txt
doc828.txt
doc571.txt
doc628.txt
doc656.txt
doc354.txt
doc211.txt
doc435.txt
doc568.txt
doc173.txt
doc415.txt
doc685.txt
doc487.txt
doc759.txt
doc506.txt
doc112.txt
doc51.

In [6]:
evidence_documents[0]

'While many studies have shown a connection between stress and autoimmune disease,  most of the evidence for stress contributing to the onset and course of  autoimmune disease is circumstantial and the mechanisms by which stress affects  autoimmune disease are not fully understood.'

In [7]:
preprocessed_documents = [remove_punctuation(doc) for doc in evidence_documents]
preprocessed_documents = [doc.lower() for doc in preprocessed_documents]
preprocessed_documents = [doc.replace('\n', ' ') for doc in preprocessed_documents]
preprocessed_documents = [doc.replace('\t', ' ') for doc in preprocessed_documents]


In [8]:
device = 'cuda:3'

In [9]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document in the list
preprocessed_documents = [' '.join([word for word in doc.split() if word not in stop_words]) 
                          for doc in preprocessed_documents]

[nltk_data] Downloading package stopwords to /home/elson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
corpus_embeddings = bi_encoder.encode(evidence_documents, convert_to_tensor=True, show_progress_bar=True).to(device)

Batches: 100%|██████████| 3760/3760 [00:54<00:00, 69.06it/s]


In [11]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc


tokenized_corpus = []
for doc in tqdm(evidence_documents):
    tokenized_corpus.append(bm25_tokenizer(doc))

bm25 = BM25Okapi(tokenized_corpus)

100%|██████████| 120302/120302 [00:02<00:00, 53162.43it/s]


In [12]:
# This function will search all wikipedia articles for passages that
# answer the query
def search(query):
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    
    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, evidence_documents[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-5 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

In [14]:
search("You can't get thyroid disease if you're young.")

Input question: You can't get thyroid disease if you're young.
Top-3 lexical search (BM25) hits
	18.635	But a TSH test can't show what is causing a thyroid problem.
	13.751	This is also true even if you're already feeling better.
	12.152	If you're healthy, the infection probably won't cause serious problems.
	12.152	But the test can't explain why your TSH levels may be too high or too low.
	11.986	Acute cerebrovascular disease in the young: the Stroke in Young Fabry Patients study :340–9.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.740	And TSH levels may be higher in people over age 80, even though they don't have any thyroid problems.
	0.727	If you have a history of thyroid disease, be sure to talk with your provider if you are pregnant or are thinking of becoming pregnant.
	0.723	Thyroid-related medical problems are exceedingly common.
	0.722	The unique challenge to the provider of  adolescent health care is that thyroid problems can adversely affect growth and  de

In [31]:
def search_ce(query):
    top_ks = ""
    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, evidence_documents[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]


    # Output of top-5 hits from re-ranker
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:3]:
        top_ks=top_ks+evidence_documents[hit['corpus_id']]+"||"
    print(top_ks)    
    return top_ks

In [18]:
pip install pandas

Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
     |████████████████████████████████| 9.5 MB 3.5 MB/s            
Collecting pytz>=2017.2
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
     |████████████████████████████████| 505 kB 43.0 MB/s            
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.5 pytz-2024.1
Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     |████████████████████████████████| 249 kB 6.0 MB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
df = pd.read_excel("/home/elson/Claims_withGeminiAnnotation.xlsx", engine='openpyxl')

In [32]:
df['claim'] = df['claim'].astype(str)  # Convert all claims to string
df['top_k_minilm_ce'] = df['claim'].apply(lambda claim: search_ce(claim))

The link of chocolate to acne vulgaris was replaced by the theory that a high glycemic index may contribute to acne vulgaris.||in Article
Effect of chocolate on acne vulgaris.||A 2021 systematic review of 53 studies (11 interventional clinical trials and 42 observational studies) showed that a high glycaemic-load diet, foods with a high glycaemic index, dairy products, chocolate and fatty food have a positive effect on the development of acne.||
The data available suggest that exposure to cold, either  through exposure to low environmental temperatures or during induced hypothermia,  increases the risk of developing upper and lower respiratory tract infections and  dying from them; in addition, the longer the duration of exposure the higher the  risk of infection.||This mechanism can explain why a person who expose to hypothermia with wet hair or a person who don’t use a beret or a hat during cold weather gets
and posterior eye pain.||There is a widely held belief that acute viral resp

In [34]:
df.head()

Unnamed: 0,folder,filename,claim,label,url,GOLD EXPLANATION,CATEGORY,gemini_label,gemini_explanation,top_k_minilm_ce
0,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,Eating chocolate will cause acne.,SUPPORTED,https://www.jaad.org/article/S0190-9622(16)013...,The chocolate consumption group had a statisti...,Skin,REFUTED,There is no scientific evidence to support the...,The link of chocolate to acne vulgaris was rep...
1,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,You can get a cold from being in the rain.,SUPPORTED,https://pubmed.ncbi.nlm.nih.gov/17705968/,Exposure to cold has often been associated wi...,General Health,REFUTED,"The common cold is caused by viruses, not by b...",The data available suggest that exposure to co...
2,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,Stress can cause acne.,SUPPORTED,https://medicaljournalssweden.se/actadv/articl...,"Based on this study, increased stress does not...",Skin,SUPPORTED,There is evidence to suggest that stress can t...,This finding provides physiological support to...
3,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,You can prevent acne by washing your face more...,NOT ENOUGH INFORMATION,https://www.tandfonline.com/doi/full/10.1080/0...,Washing and over-the-counter cleansers are com...,Skin,REFUTED,Washing your face more often does not prevent ...,Abstract\nPurpose: Washing and over-the-counte...
4,/content/drive/MyDrive/images/myths on vascula...,mythsonvascularsurgery3.jpeg,Varicose veins are caused by standing too much.,SUPPORTED,https://www.sjweh.fi/article/562,"For men working mostly in a standing position,...",Vascular,NOT ENOUGH INFORMATION,While standing for long periods of time can co...,Varicose veins are superficial veins in the su...


In [35]:
df.to_excel("/home/elson/topk_minilm.xlsx")