In [11]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Papers_Dengue_Fever'

from google.colab import auth
# auth.authenticate_user()








Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install pymupdf




In [9]:
# Functions for preprocessing of the text:
import fitz  # PyMuPDF

import re
def remove_headers_footers(text):
    # Example: Remove "Page X" patterns
    text = re.sub(r'Page\s\d+', '', text)
    # Add other rules as needed
    return text
def remove_references(text):
    # Identify "References" or "Bibliography" section and remove
    text = re.split(r"(References|Bibliography)", text, flags=re.IGNORECASE)[0]
    return text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text.strip()
def preprocess_pdf_text(text):
    text = remove_headers_footers(text)
    text = remove_references(text)
    text = clean_text(text)
    return text

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page_num in range(pdf.page_count):
            page = pdf[page_num]
            text += page.get_text()
    return preprocess_pdf_text(text)
def chunk_text(text, chunk_size=200):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Usage with extracted text


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model for generating embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks

# Example for multiple PDFs
pdf_files=[]
for i in [1,2,3,4,5]:
  pdf_files.append(file_path+'/Paper'+str(i)+'.pdf')
texts = [extract_text_from_pdf(pdf) for pdf in pdf_files]


all_chunks = []
for text in texts:
    all_chunks.extend(chunk_text(text))


def deduplicate_chunks(chunks, similarity_threshold=0.95):
    embeddings = [embedder.encode(chunk) for chunk in chunks]
    similarity_matrix = cosine_similarity(embeddings)

    deduplicated_chunks = []
    seen_indices = set()

    for i in range(len(chunks)):
        if i not in seen_indices:
            deduplicated_chunks.append(chunks[i])
            seen_indices.add(i)
            for j in range(i + 1, len(chunks)):
                if similarity_matrix[i, j] > similarity_threshold:
                    seen_indices.add(j)

    return deduplicated_chunks
all_chunks = deduplicate_chunks(all_chunks)
chunk_embeddings = [embedder.encode(chunk) for chunk in all_chunks]


In [13]:
!pip install faiss-cpu




In [14]:
import numpy as np
import faiss

# Create FAISS index
dimension = len(chunk_embeddings[0])  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)

# Convert embeddings to numpy array and add to index
chunk_embeddings_np = np.array(chunk_embeddings)
index.add(chunk_embeddings_np)


In [15]:
!pip install transformers accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM

import torch

from huggingface_hub import login

login() # You'll be prompted to enter your Hugging Face token




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM
!pip install -U bitsandbytes

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


def retrieve_relevant_chunks(query, num_chunks=2):
    # Generate embedding for query
    query_embedding = embedder.encode(query).reshape(1, -1)

    # Search the index for similar chunks
    _, indices = index.search(query_embedding, num_chunks)

    # Retrieve and return the relevant chunks
    return [all_chunks[i] for i in indices[0]]

def generate_response_with_rag(query):
    # Retrieve relevant chunks based on the query
    context_chunks = retrieve_relevant_chunks(query)
    context_text = " ".join(context_chunks)

    # Combine context and query for input to GPT-2
    input_text = f"{context_text}\n\nQuestion: {query}\nAnswer:"

    # Tokenize the input text and move to the same device as the model
    input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True, max_length=model.config.max_position_embeddings - 50)  # Truncate during encoding
    input_ids = input_ids.to(model.device)  # Move input_ids to the same device as the model

    # Generate response with adjusted parameters
    output = model.generate(
        input_ids,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,  # Prevent repetition of 2-grams
        temperature=0.7,         # Adjust temperature for more diversity
        top_k=50,                # Consider top 50 most likely tokens
        top_p=0.95               # Nucleus sampling for more focused generation
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    return response

def generate_response_pretrained(query):
    """Generates a response using only the pretrained model (no RAG)."""
    input_ids = tokenizer.encode(query, return_tensors='pt').to(model.device)
    output = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response






In [30]:
query = "What are some proteins that could be targeted for dengue fever drug?"

rag_response = generate_response_with_rag(query)
pretrained_response = generate_response_pretrained(query)

print("RAG Response:\n", rag_response)
print("\nPretrained Model Response:\n", pretrained_response)


RAG Response:
 ensure the clinical effects of quinine in dengue. Vaccine development was another method of preventing dengue fever. Sanofi Pasteur introduced to science a vaccine called Dengvaxia (CYD- TDV) [21]. The establishment of a dengue vaccine has been hindered by the distinct and intricate immunopathology of the dengue virus. The development of dengue vaccines has also been hampered by crucial concerns, such as a shortage of animal models for the disease, a lack of appropriate pro- tective immunity markers, and the presence of four distinct dengue serogroups [22]. As there is no particular targeted therapy for the dengue virus, our research is critical for find- ing a new drug candidate. Natural compounds with antiviral properties might be one alternative for drug development to combat the dengue virus. Plants produce natural compounds that are small mol- ecules and can be extracted in trace amounts. They include the plants primary and secondary metabolites. These iso- lated se

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Cloning into 'RAG_Dengue_Fever_LLM'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


cp: cannot stat '/content/RAG_Dengue_Drug_Discovery.ipynb': No such file or directory
