In [2]:
from google.colab import drive
# Mount Google Drive at a mount point given relative to the current working directory.
# Later cells read the PDF through the same relative prefix ('./content/MyDrive/...').
# NOTE(review): because the mount point is relative, those reads assume the working
# directory does not change in a way that invalidates the prefix - confirm.
drive.mount('./content/')

Mounted at ./content/


In [3]:
cd '/content/'

/content


## installing required modules

In [4]:
!pip install PdfReader
!pip install langchain
!pip install PyPDF2
!pip install InstructorEmbedding
!pip install sentence_transformers
!pip install faiss
!pip install faiss-gpu


Collecting PdfReader
  Downloading pdfreader-0.1.12.tar.gz (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitarray>=1.1.0 (from PdfReader)
  Downloading bitarray-2.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.5/286.5 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodome>=3.9.9 (from PdfReader)
  Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: PdfReader
  Building wheel for PdfReader (setup.py) ... [?25l[?25hdone
  Created wheel for PdfReader: filename=pdfreader-0.1.12-py3-none-any.whl size=134539 sha256=b5faf4a8b8938cd35f

## import required libraries

In [5]:
import os
from getpass import getpass

from PyPDF2 import PdfReader
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [6]:
## extracting text from pdf files
def get_pdf_text(pdf_docs):
    """
    Extracts and concatenates the text of every page of one or more PDFs.

    Parameters:
    pdf_docs: An iterable of PDF sources, each passed unchanged to PdfReader,
        or a single path given as str / os.PathLike.

    Returns:
    str: The text of all pages in document order, then page order, joined
        without a separator. Empty string when there are no documents or pages.
    """
    # A lone path would otherwise be iterated character by character.
    if isinstance(pdf_docs, (str, os.PathLike)):
        pdf_docs = [pdf_docs]

    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # A page that yields no text (None or "") contributes nothing
            # instead of raising TypeError on concatenation.
            page_texts.append(page.extract_text() or "")
    # Single join instead of repeated string concatenation.
    return "".join(page_texts)

## creating overlapping text chunks
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """
    Splits text into overlapping chunks with CharacterTextSplitter.

    Parameters:
    text (str): The text to split.
    chunk_size (int): Value passed to the splitter as chunk_size (default 1000).
    chunk_overlap (int): Value passed to the splitter as chunk_overlap (default 200).
    separator (str): String the splitter splits on (default newline).

    Returns:
    The value returned by the splitter's split_text(text).
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Sizes are measured in characters.
        length_function=len,
    )
    return text_splitter.split_text(text)

## creating embeddings for chunks of text
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-xl"):
    """
    Embeds text chunks and indexes them in a FAISS vector store.

    Parameters:
    text_chunks: Iterable of strings to embed and index.
    model_name (str): Model identifier passed to HuggingFaceInstructEmbeddings
        (default "hkunlp/instructor-xl").

    Returns:
    The vector store returned by FAISS.from_texts.

    Raises:
    ValueError: If text_chunks is empty.
    """
    text_chunks = list(text_chunks)
    # Fail with a clear message before the embedding model is instantiated:
    # there is nothing to index when no chunks were produced.
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: no text was extracted, so there is nothing to index"
        )
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

In [7]:
## creating a retrieval llm chain
def retrieval_qa_chain(db, return_source_documents,
                       repo_id="tiiuae/falcon-7b-instruct", model_kwargs=None):
    """
    Builds a 'stuff' RetrievalQA chain backed by a Hugging Face Hub model.

    Parameters:
    db: Object passed to the chain as its `retriever`.
    return_source_documents (bool): Forwarded to RetrievalQA.from_chain_type.
    repo_id (str): Hub repository of the model (default "tiiuae/falcon-7b-instruct").
    model_kwargs (dict | None): Keyword arguments for the model. When None, uses
        {"temperature": 0.6, "max_length": 500, "max_new_tokens": 700}.

    Returns:
    The chain returned by RetrievalQA.from_chain_type.
    """
    if model_kwargs is None:
        model_kwargs = {"temperature": 0.6, "max_length": 500, "max_new_tokens": 700}
    # Copy so the caller's dict is never shared with the LLM wrapper.
    llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=dict(model_kwargs))
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=db,
        return_source_documents=return_source_documents,
    )

## DATA VECTORIZATION AND INDEX CREATION

In [8]:
# Never store an API token in the notebook: anyone who can read the file can use it.
# The token that used to be hard-coded on this line was published with the notebook
# and should be revoked.
# Use a token already present in the environment, otherwise prompt for one without echoing it.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [11]:
# get_pdf_text iterates over its argument, so the single source PDF is wrapped in a list.
path_to_pdf = [
    "./content/MyDrive/new_diseases_data.pdf",
]

In [48]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    Parameters:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Text of every page, in page order, joined without a separator.
        Empty string when the PDF has no pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Read every page while the file is still open. A page that yields no
        # text (None or "") contributes nothing instead of raising TypeError.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Example usage: extract the text of the same PDF file that `path_to_pdf` lists.
# NOTE(review): `pdf_text` is not referenced by any other cell visible in this notebook;
# the indexing pipeline builds its input with get_pdf_text instead - confirm it is still needed.
pdf_path = './content/MyDrive/new_diseases_data.pdf'
pdf_text = extract_text_from_pdf(pdf_path)

In [12]:
# Read the text of every PDF listed in path_to_pdf into one string.
raw_text = get_pdf_text(path_to_pdf)

# Split the text into overlapping chunks.
text_chunks = get_text_chunks(raw_text)

# Embed the chunks and index them in a FAISS vector store.
vectorstore = get_vectorstore(text_chunks)

# Expose the vector store as a retriever configured with k=3, i.e. three results per query.
db = vectorstore.as_retriever(search_kwargs={'k': 3})

  from tqdm.autonotebook import trange


Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


## passing database to bot as input and initializing the bot

In [13]:
# Build the QA chain on top of the retriever and ask it to return the retrieved chunks as well.
bot = retrieval_qa_chain(db, return_source_documents=True)

## passing query to llm

In [14]:
# Ask the retrieval-backed chain; the response is kept in `sol` for the cells below.
query = "what is  Nampdicta?"
sol=bot(query)

## answer given by llm

In [15]:
# Show only the 'result' entry of the chain's response.
print(sol['result'])


Nampdicta is a mutated virus that is causing a highly virulent and lethal disease.


These are the text chunks that the retriever matched to the query and that were returned with the answer as source documents.

In [17]:
# Show the 'source_documents' entry of the response (requested via return_source_documents=True).
print(sol['source_documents'])

[Document(page_content="**Disease\nName:**\nNampdicta\n**Description:**\nNampdicta\nis\na\nhighly\nvirulent\nand\nlethal\ninfectious\ndisease\nthat\nhas\nstruck\nfear\ninto\nglobal\nhealth\norganizations\nand\npopulations\nalike.\nCharacterized\nby\nits\nrapid\nonset,\nelusive\nnature,\nand\ndevastating\nimpact,\nNampdicta\nhas\nemerged\nas\na\nsignificant\nthreat\nto\npublic\nhealth.\nThe\ndisease\nis\ncaused\nby\na\nyet-to-be-identified\npathogen,\nsuspected\nto\nbe\na\nmutated\nvirus,\nwith\nthe\nability\nto\nswiftly\nevade\nthe\nimmune\nsystem\nand\ntarget\nmultiple\norgan\nsystems.\nNampdicta's\nmode\nof\ntransmission\nremains\nperplexing,\nas\nit\nappears\nto\nspread\nthrough\nvarious\nvectors,\nincluding\nrespiratory\ndroplets,\nbodily\nfluids,\nand\neven\nfomites.\nThis\ncomplexity\nmakes\ncontainment\nand\nprevention\nstrategies\nexceptionally\nchallenging\nto\ndevelop.\nNampdicta's\nincubation\nperiod\nis\nremarkably\nshort,\nranging\nfrom\nhours\nto\na\nfew\ndays,\nduring\nw

# normal falcon without context

In [18]:
# Same Falcon model, but called directly with no retriever attached.
llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs={
        "temperature": 0.7,
        "max_length": 500,
        "max_new_tokens": 700,
    },
)

In [19]:
# Send the same `query` straight to the model, without the retrieval chain.
llm(query)

'\nnampdicta is a command line tool for sending HTTP requests and analysing web pages. It is primarily designed to be used by developers to automate web scraping tasks on their own websites.'

In [20]:
# Questions put to the retrieval chain in the cells below, addressed by index.
# The strings are kept exactly as they were sent, since the recorded answers correspond to them.
ques = [
    "what are the origins od Numpalofich Legatrosis",
    "what are the stages of diseases progression in Ramtronephiach Oculosis",
    "what is mortality rate in Wallmic Pulmora",
    "is Numpalactics incubation period short?",
    " what is Numpalactic",
    " What are the symptoms of a disease that causes blindness?",
    "what are the origins of Ramtronephiach Oculosis",
]



In [21]:
# Ask question 0 through the retrieval chain, then show the question and its answer.
sol = bot(ques[0])
print(ques[0], sol['result'], sep="\n")

what are the origins od Numpalofich Legatrosis
 The origins of Numpalofich Legatrosis are unknown.


In [22]:
# Ask question 1 through the retrieval chain, then show the question and its answer.
sol = bot(ques[1])
print(ques[1], sol['result'], sep="\n")

what are the stages of diseases progression in Ramtronephiach Oculosis

The disease's progression in Ramtronephiach Oculosis can be classified into four stages:
1. Early symptoms: This stage involves mild symptoms such as blurry vision, light sensitivity, and occasional eye irritation.
2. Acute symptoms: This stage involves severe symptoms such as eye pain, headaches, and retinal detachment.
3. Chronic symptoms: This stage involves severe symptoms such as complete loss of vision and permanent damage to the optic nerve.
4. Severe symptoms: This stage involves severe symptoms such as complete blindness and loss of vision.


In [23]:
# Ask question 2 through the retrieval chain, then show the question and its answer.
sol = bot(ques[2])
print(ques[2], sol['result'], sep="\n")

what is mortality rate in Wallmic Pulmora

The mortality rate of Wallmic Pulmora is not known, as the disease is relatively rare.


In [24]:
# Ask question 3 through the retrieval chain, then show the question and its answer.
sol = bot(ques[3])
print(ques[3], sol['result'], sep="\n")

is Numpalactics incubation period short?

No.
The incubation period for Numpalactic is relatively short, ranging from a few hours to a few days. However, the symptoms of the disease are not always immediately noticeable, and it can take days or even weeks for them to appear.


In [25]:
# Ask question 4 through the retrieval chain, then show the question and its answer.
sol = bot(ques[4])
print(ques[4], sol['result'], sep="\n")

 what is Numpalactic
 
Numpalactic is a viral disease that primarily affects children. It is caused by a rare virus that is characterized by its preference for developing immune systems of children. The disease is characterized by its varied transmission pathways, ranging from direct contact to airborne exposure. It is a complex disease with a short incubation period and measures tailored to children's unique physiological and immunological profiles. The disease is a heart-wrenching reminder of the vulnerability of the youngest members of society in the face of infectious diseases. The urgency to unravel its mysteries and develop effective interventions underscores the importance of pediatric research and preparedness, particularly in the face of emerging infectious disease threats.


In [26]:
# Ask question 5 through the retrieval chain, then show the question and its answer.
sol = bot(ques[5])
print(ques[5], sol['result'], sep="\n")

 What are the symptoms of a disease that causes blindness?
  The symptoms of Ramtronephiach Oculosis include eye pain, vision loss, headaches, and difficulty seeing.


## QUERYING WITHOUT USING DATA VECTORISATION AND INDEX CREATION

In [52]:
# Question for the model called directly, i.e. without the retrieval chain.
question = "What is the main subject of the PDF?"

In [53]:
# Call the model directly with the question; no retriever is involved in this call.
answer = llm(question)

In [55]:
# Show the question next to the answer returned by the direct model call.
print("Question:", question)
print("Answer:", answer)

Question: What is the main subject of the PDF?
Answer: 
I'm sorry, I cannot answer that without more context. Can you provide more information about the PDF you are referring to?


In [56]:
# Direct model call, without the retrieval chain.
# NOTE(review): the question list used with the retrieval chain spells this name
# 'Numpalactic'; this query uses 'numpalatic' - confirm the spelling is intended.
question = "What is numpalatic?"
answer = llm(question)

In [57]:
# Show the question next to the answer returned by the direct model call.
print("Question:", question)
print("Answer:", answer)

Question: What is numpalatic?
Answer: 
I'm sorry, I cannot provide a definition as there is no known term called 'numnatic'.


In [58]:
# Direct model call, without the retrieval chain, for a disease name that was
# also put to the retrieval chain earlier in the notebook.
question = "What is Nampdicta?"
answer = llm(question)

In [59]:
# Show the question next to the answer returned by the direct model call.
print("Question:", question)
print("Answer:", answer)

Question: What is Nampdicta?
Answer: 
Nampdicta is a command-line tool for generating and processing NAMD output files, designed to be easy to use for scientists who need to generate large numbers of output files at a time.


## After usage of Data Vectorisation and Index creation

In [27]:
# Ask question 6 through the retrieval chain, then show the question and its answer.
sol = bot(ques[6])
print(ques[6], sol['result'], sep="\n")

what are the origins of Ramtronephiach Oculosis

Ramtronephiach Oculosis is a rare and elusive disease that is difficult to diagnose, as it is often mistaken for other eye conditions. Its origins remain a mystery, and its mechanisms of transmission are not yet fully understood. Scientists and medical professionals continue to research the disease in an effort to uncover its secrets and develop effective treatments.


**important links**

how gpt is trained from scratch
 https://medium.com/@venkata_sai/journey-to-unleashing-intelligent-conversations-a-comprehensive-overview-to-training-chatgpt-95eb97c005da

how an llm can be finetuned
 https://medium.com/@venkata_sai/unleashing-the-potential-of-peft-parameter-efficient-fine-tuning-in-training-large-language-b7a87e8a4eb9

how to make an llm a qa bot using LangChain (documentation)
https://python.langchain.com/docs/modules/memory/conversational_customization

why instruct embeddings are used instead of other embeddings
https://www.youtube.com/watch?v=vg38cq3KJ6M

other embeddings to explore: OpenAI embeddings
https://youtu.be/ySus5ZS0b94?si=LRQhgoYTtxWSvvAv

ray embeddings
https://youtu.be/hGnZajytlac?si=Fvga4bAWjIuMsaHD