In [5]:
!pip install -U sentence-transformers rank_bm25



In [6]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


#We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     #Truncate long passages to 256 tokens
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [7]:
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        text = str(text)  # Convert non-strings to strings
    return text.translate(str.maketrans('', '', string.punctuation))

In [8]:
# Import Module 
import os 

# Folder Path 
path_to_zip_file = "/home/elson/corpus.zip"
directory_to_extract_to = "/home/elson/"
import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

def read_text_file(file_path): 
    with open(file_path, 'r', encoding="utf8") as f:
        doc = f.read()
    return doc
        

In [9]:
import os
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK models
nltk.download('punkt')

# Function to read text from a file
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Prepare to handle newline characters in sentences
def handle_newlines_in_sentences(sentences):
    processed_sentences = []
    for sentence in sentences:
        if '\n' in sentence:
            # Replace newline characters with a period and space, then re-tokenize
            sub_sentences = sent_tokenize(sentence.replace('\n', '. '))
            processed_sentences.extend(sub_sentences)
        else:
            processed_sentences.append(sentence)
    return processed_sentences

# Path to the corpus directory
path = "/home/elson/corpus/"
evidence_documents = []

# Iterate through each text file in the directory
for efile in os.listdir(path):
    if efile.endswith(".txt"):
        print(efile)  # Print the file name
        file_path = os.path.join(path, efile)  # Construct full file path
        evidence = read_text_file(file_path)  # Read text from the file
        
        # Tokenize the document into sentences
        sentences = sent_tokenize(evidence)
        
        # Handle newline characters within sentences
        sentences = handle_newlines_in_sentences(sentences)
        
        # Filter and add unique sentences of adequate length
        for sentence in sentences:
            if (sentence not in evidence_documents) and len(sentence) >= 40:
                evidence_documents.append(sentence)


[nltk_data] Downloading package punkt to /home/elson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


doc394.txt
doc227.txt
doc462.txt
doc586.txt
doc810.txt
doc803.txt
doc653.txt
doc743.txt
doc108.txt
doc292.txt
doc36.txt
doc483.txt
doc710.txt
doc333.txt
doc786.txt
doc795.txt
doc471.txt
doc851.txt
doc645.txt
doc428.txt
doc47.txt
doc607.txt
doc775.txt
doc111.txt
doc530.txt
doc286.txt
doc250.txt
doc417.txt
doc633.txt
doc593.txt
doc764.txt
doc615.txt
doc748.txt
doc81.txt
doc666.txt
doc700.txt
doc797.txt
doc489.txt
doc425.txt
doc295.txt
doc598.txt
doc162.txt
doc113.txt
doc466.txt
doc97.txt
doc552.txt
doc207.txt
doc468.txt
doc19.txt
doc16.txt
doc711.txt
doc304.txt
doc750.txt
doc852.txt
doc33.txt
doc848.txt
doc737.txt
doc727.txt
doc551.txt
doc395.txt
doc385.txt
doc323.txt
doc229.txt
doc777.txt
doc778.txt
doc267.txt
doc729.txt
doc414.txt
doc218.txt
doc836.txt
doc649.txt
doc224.txt
doc453.txt
doc704.txt
doc345.txt
doc278.txt
doc828.txt
doc571.txt
doc628.txt
doc656.txt
doc354.txt
doc211.txt
doc435.txt
doc568.txt
doc173.txt
doc415.txt
doc685.txt
doc487.txt
doc759.txt
doc506.txt
doc112.txt
doc51.

doc565.txt
doc539.txt
doc508.txt
doc816.txt
doc91.txt
doc719.txt
doc522.txt
doc802.txt
doc584.txt
doc34.txt
doc603.txt
doc343.txt
doc153.txt
doc56.txt
doc316.txt
doc773.txt
doc253.txt
doc604.txt
doc200.txt
doc419.txt
doc629.txt
doc762.txt
doc321.txt
doc58.txt
doc191.txt
doc344.txt
doc516.txt
doc721.txt
doc444.txt
doc59.txt
doc314.txt
doc369.txt
doc537.txt
doc458.txt
doc309.txt
doc519.txt
doc856.txt
doc811.txt
doc491.txt
doc631.txt
doc54.txt
doc228.txt
doc124.txt
doc746.txt
doc541.txt
doc575.txt
doc289.txt
doc842.txt
doc68.txt
doc744.txt
doc246.txt
doc620.txt
doc561.txt
doc734.txt
doc695.txt
doc597.txt
doc485.txt
doc749.txt
doc433.txt
doc733.txt
doc728.txt
doc155.txt
doc515.txt
doc201.txt
doc326.txt
doc3.txt
doc454.txt
doc90.txt
doc573.txt
doc77.txt
doc661.txt
doc820.txt
doc767.txt
doc455.txt
doc644.txt
doc146.txt
doc83.txt
doc446.txt
doc139.txt
doc313.txt
doc133.txt
doc421.txt
doc496.txt
doc409.txt
doc558.txt
doc342.txt
doc31.txt
doc268.txt
doc186.txt
doc8.txt
doc562.txt
doc279.txt
doc

In [10]:
evidence_documents[0]

'While many studies have shown a connection between stress and autoimmune disease,  most of the evidence for stress contributing to the onset and course of  autoimmune disease is circumstantial and the mechanisms by which stress affects  autoimmune disease are not fully understood.'

In [20]:
with open('/home/elson/corpus.txt', 'w') as f:
    for line in evidence_documents:
        f.write(f"{line}\n")

In [15]:
preprocessed_documents = [remove_punctuation(doc) for doc in evidence_documents]
preprocessed_documents = [doc.lower() for doc in preprocessed_documents]
preprocessed_documents = [doc.replace('\n', ' ') for doc in preprocessed_documents]
preprocessed_documents = [doc.replace('\t', ' ') for doc in preprocessed_documents]


In [12]:
device = 'cuda:3'

In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document in the list
preprocessed_documents = [' '.join([word for word in doc.split() if word not in stop_words]) 
                          for doc in preprocessed_documents]

[nltk_data] Downloading package stopwords to /home/elson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'preprocessed_documents' is not defined

In [14]:
corpus_embeddings = bi_encoder.encode(evidence_documents, convert_to_tensor=True, show_progress_bar=True).to(device)

Batches: 100%|██████████| 4810/4810 [01:09<00:00, 69.38it/s]


In [15]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np


# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc


tokenized_corpus = []
for doc in tqdm(evidence_documents):
    tokenized_corpus.append(bm25_tokenizer(doc))

bm25 = BM25Okapi(tokenized_corpus)

100%|██████████| 153910/153910 [00:02<00:00, 66316.18it/s]


In [16]:
# This function will search all wikipedia articles for passages that
# answer the query
def search(query):
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    
    print("Top-3 lexical search (BM25) hits")
    for hit in bm25_hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, evidence_documents[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-5 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:10]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], evidence_documents[hit['corpus_id']].replace("\n", " ")))

In [17]:
search("You can't get thyroid disease if you're young.")

Input question: You can't get thyroid disease if you're young.
Top-3 lexical search (BM25) hits
	17.651	But a TSH test can't show what is causing a thyroid problem.
	14.024	Side effects are more likely if you're:.
	13.225	If you're using both topical corticosteroids and.
	13.225	This is also true even if you're already feeling better.
	13.225	If you're looking to donate for the first time, find out more about.

-------------------------

Top-3 Bi-Encoder Retrieval hits
	0.797	hence proving there is no relation of age to thyroid diseases.
	0.747	Thyroid Disorders in Childhood and Adolescence.
	0.740	And TSH levels may be higher in people over age 80, even though they don't have any thyroid problems.
	0.727	If you have a history of thyroid disease, be sure to talk with your provider if you are pregnant or are thinking of becoming pregnant.
	0.723	Thyroid-related medical problems are exceedingly common.
	0.722	The unique challenge to the provider of  adolescent health care is that thyroid

In [50]:
def search_ce(query):
    # Initializations
    top_hits = [None, None, None]  # To store the top 3 hits
    
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]
    
    cross_inp = [[query, evidence_documents[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    for idx, hit in enumerate(hits):
        hit['cross-score'] = cross_scores[idx]
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    
    for i, hit in enumerate(hits[:3]):
        if i < len(top_hits):
            top_hits[i] = evidence_documents[hit['corpus_id']]

    return top_hits

In [51]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [33]:
import pandas as pd
df = pd.read_excel("/home/elson/Claims_withGeminiAnnotation.xlsx", engine='openpyxl')

In [53]:
# Assuming df is your DataFrame
# Convert all claims to strings
df['claim'] = df['claim'].astype(str)

# Apply the function and expand the returned list into separate columns
df[['top_1_minilm_ce', 'top_2_minilm_ce', 'top_3_minilm_ce']] = df['claim'].apply(lambda claim: pd.Series(search_ce(claim)))


In [54]:
df.head()

Unnamed: 0,folder,filename,claim,label,url,GOLD EXPLANATION,CATEGORY,gemini_label,gemini_explanation,top_1_minilm_ce,top_2_minilm_ce,top_3_minilm_ce
0,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,Eating chocolate will cause acne.,SUPPORTED,https://www.jaad.org/article/S0190-9622(16)013...,The chocolate consumption group had a statisti...,Skin,REFUTED,There is no scientific evidence to support the...,The link of chocolate to acne vulgaris was rep...,A 2021 systematic review of 53 studies (11 int...,Fig 2 demonstrates that the chocolate consumpt...
1,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,You can get a cold from being in the rain.,SUPPORTED,https://pubmed.ncbi.nlm.nih.gov/17705968/,Exposure to cold has often been associated wi...,General Health,REFUTED,"The common cold is caused by viruses, not by b...",The data available suggest that exposure to co...,This mechanism can explain why a person who ex...,"As a general observation, wet hair in cold wea..."
2,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,Stress can cause acne.,SUPPORTED,https://medicaljournalssweden.se/actadv/articl...,"Based on this study, increased stress does not...",Skin,SUPPORTED,There is evidence to suggest that stress can t...,This finding provides physiological support to...,The impact of pyschological stress on acne.,Both active acne and post-inflammatory hyperpi...
3,/content/drive/MyDrive/images/myths on urticaria,mythsonurticaria1.jpeg,You can prevent acne by washing your face more...,NOT ENOUGH INFORMATION,https://www.tandfonline.com/doi/full/10.1080/0...,Washing and over-the-counter cleansers are com...,Skin,REFUTED,Washing your face more often does not prevent ...,Purpose: Washing and over-the-counter cleanser...,Patients can also be advised to pat dry their ...,Treatment of acne should be started early to p...
4,/content/drive/MyDrive/images/myths on vascula...,mythsonvascularsurgery3.jpeg,Varicose veins are caused by standing too much.,SUPPORTED,https://www.sjweh.fi/article/562,"For men working mostly in a standing position,...",Vascular,NOT ENOUGH INFORMATION,While standing for long periods of time can co...,Varicose veins are superficial veins in the su...,Varicose veins are caused by poorly functionin...,Varicose veins are caused by poorly functionin...


In [55]:
df.to_excel("/home/elson/topk3_minilm.xlsx")