<a href="https://colab.research.google.com/github/jyothikumargoud/ML/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- 0. Setup ---
!pip install llama-cloud-services llama-index-core llama-index-readers-file python-dotenv chromadb langchain-community sentence-transformers
!pip install nest_asyncio

Collecting llama-cloud-services
  Downloading llama_cloud_services-0.6.12-py3-none-any.whl.metadata (3.4 kB)
Collecting llama-index-core
  Downloading llama_index_core-0.12.30-py3-none-any.whl.metadata (2.6 kB)
Collecting llama-index-readers-file
  Downloading llama_index_readers_file-0.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting chromadb
  Downloading chromadb-1.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-cloud<0.2.0,>=0.1.18 (from llama-cloud-services)
  Downloading llama_cloud-0.1.18-py3-none-any.whl.metadata (902 bytes)
Collecting banks<3.0.0,>=2.0.0 (from llama-index-core)
  Downloading banks-2.1.1-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json (from llama-index-core)
  Downloading dataclasses_json-0.6.7-py3-none

In [4]:
from llama_cloud_services import LlamaParse
from google.colab import userdata
import nest_asyncio

nest_asyncio.apply()

parser = LlamaParse(api_key=userdata.get('LLAMAPARSE_API_KEY'), result_type="markdown", premium_mode=True)

file_url = "https://raw.githubusercontent.com/sahajsoft/rag-workshop/main/Documents/September%202023%20Report.pdf"

parsed_docs = parser.load_data(file_url)



Started parsing the file under job_id 28ae78ba-7d6e-49f6-a85c-233252387b47


In [8]:
for i in range(len(parsed_docs)):
    print(parsed_docs[i].text)

# WEEKLY OUTBREAK REPORT
Disease Alerts/Outbreaks reported and responded to by States/UTs through Integrated Disease Surveillance Program (IDSP)
(27th February 2023 to 5th March 2023)

## 9th Week

District wise disease alerts/outbreaks reported in the 9th week 2023

### 9th Week Map

[Map of India with disease outbreaks marked in different regions]

Raisen- Chickenpox

Durg- Fever with Rash

Aurangabad- Measles

Ahmednagar- Fever

Cuttack- Food Poisoning

Malappuram- Cholera

Rayagada- ADD

Thrissur- Adeno Virus

Palakkad- Chickenpox(2), Food Poisoning

Ernakulam- Adeno Virus

Tiruvannamalai- Chickenpox

Salem- Chickenpox

Kottayam- Adeno Virus

### Reporting Status of States/UT's

| Description | Number |
| - | - |
| No. of States/UT's Submitted outbreak report ( including NIL report) | 15 |
| No. of States/UT's Submitted 'NIL' outbreak report | 03 |


INTEGRATED DISEASE SURVEILLANCE PROGRAMME
NATIONAL CENTRE FOR DISEASE CONTROL
22 Sham Nath Marg, Civil Lines, Delhi-110054
Tel No. 23

In [10]:
# --- 2. Chunking the Document ---
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document

In [23]:
# making an langchsain doc

docs = [Document(page_content = doc.text) for doc in parsed_docs]

In [24]:
MakrDown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=
                                               [("#","Heading1"),
                                               ("##","Heading2"),
                                               ("###","Heading3")])

In [25]:
chunks = []

for d in docs:
    chunks.extend(MakrDown_splitter.split_text(d.page_content))

In [27]:
chunks

[Document(metadata={'Heading1': 'WEEKLY OUTBREAK REPORT'}, page_content='Disease Alerts/Outbreaks reported and responded to by States/UTs through Integrated Disease Surveillance Program (IDSP)\n(27th February 2023 to 5th March 2023)'),
 Document(metadata={'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week'}, page_content='District wise disease alerts/outbreaks reported in the 9th week 2023'),
 Document(metadata={'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week', 'Heading3': '9th Week Map'}, page_content='[Map of India with disease outbreaks marked in different regions]  \nRaisen- Chickenpox  \nDurg- Fever with Rash  \nAurangabad- Measles  \nAhmednagar- Fever  \nCuttack- Food Poisoning  \nMalappuram- Cholera  \nRayagada- ADD  \nThrissur- Adeno Virus  \nPalakkad- Chickenpox(2), Food Poisoning  \nErnakulam- Adeno Virus  \nTiruvannamalai- Chickenpox  \nSalem- Chickenpox  \nKottayam- Adeno Virus'),
 Document(metadata={'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9t

In [39]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

emb_model = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')


In [42]:
vector_store = Chroma.from_documents(documents=chunks, embedding=emb_model, persist_directory="db")
vector_store.persist()

In [47]:
all_doc = vector_store.get()['documents']
all_doc

['Disease Alerts/Outbreaks reported and responded to by States/UTs through Integrated Disease Surveillance Program (IDSP)\n(27th February 2023 to 5th March 2023)',
 'District wise disease alerts/outbreaks reported in the 9th week 2023',
 '[Map of India with disease outbreaks marked in different regions]  \nRaisen- Chickenpox  \nDurg- Fever with Rash  \nAurangabad- Measles  \nAhmednagar- Fever  \nCuttack- Food Poisoning  \nMalappuram- Cholera  \nRayagada- ADD  \nThrissur- Adeno Virus  \nPalakkad- Chickenpox(2), Food Poisoning  \nErnakulam- Adeno Virus  \nTiruvannamalai- Chickenpox  \nSalem- Chickenpox  \nKottayam- Adeno Virus',
 "| Description | Number |\n| - | - |\n| No. of States/UT's Submitted outbreak report ( including NIL report) | 15 |\n| No. of States/UT's Submitted 'NIL' outbreak report | 03 |  \nINTEGRATED DISEASE SURVEILLANCE PROGRAMME\nNATIONAL CENTRE FOR DISEASE CONTROL\n22 Sham Nath Marg, Civil Lines, Delhi-110054\nTel No. 23913148, Fax No. 23922677; www.idsp.nic.in",
 '| 

In [54]:
for i,doc in enumerate(all_doc):
    print(f"Document:   {i}")
    print(doc[:50])
    print("------------------------------------------------------")

Document:   0
Disease Alerts/Outbreaks reported and responded to
------------------------------------------------------
Document:   1
District wise disease alerts/outbreaks reported in
------------------------------------------------------
Document:   2
[Map of India with disease outbreaks marked in dif
------------------------------------------------------
Document:   3
| Description | Number |
| - | - |
| No. of States
------------------------------------------------------
Document:   4
| Disease/alerts | No. of alerts/outbreaks |
| - |
------------------------------------------------------
Document:   5
| Disease | 2020 | 2021 | 2022 | 2023 |
| - | - | 
------------------------------------------------------
Document:   6
| Unique ID. | Name of State/UT | Name of District
------------------------------------------------------
Document:   7
COVID-19 was declared as a pandemic on 11ᵗʰ March,
------------------------------------------------------
Document:   8
District wise disease aler

In [57]:
vector_metadat = vector_store.get()['metadatas']
vector_metadat

[{'Heading1': 'WEEKLY OUTBREAK REPORT'},
 {'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week'},
 {'Heading3': '9th Week Map',
  'Heading2': '9th Week',
  'Heading1': 'WEEKLY OUTBREAK REPORT'},
 {'Heading2': '9th Week',
  'Heading1': 'WEEKLY OUTBREAK REPORT',
  'Heading3': "Reporting Status of States/UT's"},
 {'Heading2': '9th Week Map',
  'Heading3': 'Disease alerts/outbreaks reported during this week (N=15)'},
 {'Heading3': 'Disease wise no of outbreaks reported under IDSP during 2020-2023 till 9th wk of Every Year',
  'Heading2': '9th Week Map'},
 {'Heading1': 'DISEASE OUTBREAKS OF PREVIOUS WEEKS REPORTED LATE'},
 {'Heading2': 'COVID-19 STATUS'},
 None,
 None,
 None,
 None,
 None,
 {'Heading1': 'WEEKLY OUTBREAK REPORT'},
 {'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week'},
 {'Heading2': '9th Week',
  'Heading1': 'WEEKLY OUTBREAK REPORT',
  'Heading3': '9th Week Map'},
 {'Heading2': '9th Week',
  'Heading1': 'WEEKLY OUTBREAK REPORT',
  'Heading3': "Reporting Stat

In [58]:
for i in range(len(vector_metadat)):
    print(vector_metadat[i])

{'Heading1': 'WEEKLY OUTBREAK REPORT'}
{'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week'}
{'Heading3': '9th Week Map', 'Heading2': '9th Week', 'Heading1': 'WEEKLY OUTBREAK REPORT'}
{'Heading2': '9th Week', 'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading3': "Reporting Status of States/UT's"}
{'Heading2': '9th Week Map', 'Heading3': 'Disease alerts/outbreaks reported during this week (N=15)'}
{'Heading3': 'Disease wise no of outbreaks reported under IDSP during 2020-2023 till 9th wk of Every Year', 'Heading2': '9th Week Map'}
{'Heading1': 'DISEASE OUTBREAKS OF PREVIOUS WEEKS REPORTED LATE'}
{'Heading2': 'COVID-19 STATUS'}
None
None
None
None
None
{'Heading1': 'WEEKLY OUTBREAK REPORT'}
{'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading2': '9th Week'}
{'Heading2': '9th Week', 'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading3': '9th Week Map'}
{'Heading2': '9th Week', 'Heading1': 'WEEKLY OUTBREAK REPORT', 'Heading3': "Reporting Status of States/UT's"}
{'Heading3': 'Disease alerts/outb

In [59]:
!pip install huggingface-hub llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.8-cp311-cp311-linux_x86_64.whl size=5959641 sha256=2cb848f1a1ebb973ed97f

In [70]:
retriever = vector_store.as_retriever(search_type = "similarity",search_kwargs={'k':3})
query = "How many instances of chickenpox were observed"
relevant_chunks = retriever.get_relevant_documents(query)
for i, chunk in enumerate(relevant_chunks):
    print(f"\nChunk {i+1}:")
    print(chunk.page_content)


Chunk 1:
| Disease | 2020 | 2021 | 2022 | 2023 |
| - | - | - | - | - |
| Food Poisoning | ++++++++++++ | +++++ | ++ | ++++++++++++ |
| Acute Diarrheal Disease | +++++ | ++++++++ | + | ++++++++ |
| Chickenpox | +++++++++++++++ | ++ | + | +++++ |
| Measles | + | + | | ++++ |
| Viral Hepatitis | ++ | + | | ++ |
| Dengue | + | + | | + |
| Chikungunya | + | ++ | | + |
| Acute Encephalitis Syndrome (AES) | | | | + |
| Mumps | ++ | | | |
| Diphtheria | | | | + |
| Leptospirosis | | | | + |
| Malaria | + | | | |
| Rabies | | | | + |  
Note: Each + symbol represents approximately 10 outbreaks.

Chunk 2:
| Disease | 2020 | 2021 | 2022 | 2023 |
| - | - | - | - | - |
| Food Poisoning | ++++++++++++ | +++++ | ++ | ++++++++++++ |
| Acute Diarrheal Disease | +++++ | ++++++++ | + | ++++++++ |
| Chickenpox | +++++++++++++++ | ++ | + | +++++ |
| Measles | + | + | | ++++ |
| Viral Hepatitis | ++ | + | | ++ |
| Dengue | + | + | | + |
| Chikungunya | + | ++ | | + |
| Acute Encephalitis Syndrome (AES) | | 

In [89]:
from llama_cpp import Llama

# Load local LLaMA model
llm = Llama.from_pretrained(
    repo_id="prithivMLmods/Llama-Sentient-3.2-3B-Instruct-GGUF",
    filename="Llama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf",
    verbose=False
)



llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


{'id': 'cmpl-21c6309e-9031-48a5-91a1-4e51cd4f30c7', 'object': 'text_completion', 'created': 1744366463, 'model': '/root/.cache/huggingface/hub/models--prithivMLmods--Llama-Sentient-3.2-3B-Instruct-GGUF/snapshots/011b69eb6ceecb2ead6f1609fff329c8fe8a780a/./Llama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf', 'choices': [{'text': ' Individuals who were most affected by the disease were those who were less than 15 years old. The most affected age group was children. The age distribution of the', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 237, 'completion_tokens': 32, 'total_tokens': 269}}


In [None]:
from llama_cpp import Llama

# Load local LLaMA model
llm = Llama.from_pretrained(
    repo_id="prithivMLmods/Llama-Sentient-3.2-3B-Instruct-GGUF",
    filename="Llama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf",
    verbose=False
)

# Question
question = "What is the age of the individuals who were most affected?"

# Extract text from relevant chunks
context_text = "\n".join([doc.page_content for doc in relevant_chunks])

# Run inference
output = llm(
    prompt = f"""
    You are a helpful assistant. Use the context below to answer the question.

    Context:
    {context_text[:500]}

    Question:
    {question}

    Answer:
    """,
    max_tokens=32,
    stop=["Q:", "\n"],
)

# Print just the model's generated text
print(output['choices'][0]['text'].strip())


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
