<a href="https://colab.research.google.com/github/hivani/-Question-Answering-system-using-an-LLM-/blob/main/steps_AI_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ***Advanced Chunking of Data***

In [None]:
import json
from sentence_transformers import SentenceTransformer, util

# Load the scraped pages. The encoding is passed explicitly so the JSON text is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('/content/scraped_data.json', encoding='utf-8') as f:
    data = json.load(f)

# Sentence-embedding model shared by chunking, indexing, retrieval and re-ranking.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to chunk data based on semantic similarity
def chunk_data(doc, min_community_size=2):
    """Group the sentences of one document into semantically related chunks.

    The text is split on '. ', every non-empty sentence is embedded with the
    module-level ``model``, and ``util.community_detection`` clusters the
    embeddings. Each detected community becomes one chunk.

    Args:
        doc: Document text to split.
        min_community_size: Forwarded to ``util.community_detection``.
            Defaults to 2, the value this notebook has always used.

    Returns:
        A list of chunk strings, one per detected community. Sentences that
        are not assigned to any community do not appear in the output. An
        empty or whitespace-only document yields an empty list.
    """
    # Skip empty fragments so blank text is never embedded or turned into a chunk.
    sentences = [part.strip() for part in doc.split('. ') if part.strip()]
    if not sentences:
        return []

    embeddings = model.encode(sentences)
    clusters = util.community_detection(embeddings, min_community_size=min_community_size)

    # Community members are not returned in document order, so sort the
    # indices to keep the original reading order, and re-insert the '. '
    # separator that split() removed so sentences stay delimited.
    return ['. '.join(sentences[idx] for idx in sorted(cluster)) for cluster in clusters]

# Build one record per chunk across every scraped entry; each record keeps
# the chunk text plus the source link (or 'N/A' when the entry has none).
all_chunks = [
    {
        'text': piece,
        'meta': {'web_link': page.get('web_link', 'N/A')}
    }
    for page in data
    for piece in chunk_data(page['content'])
]

print(f"Total chunks created: {len(all_chunks)}")


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Total chunks created: 5


# ***Convert Data to Embedding Vectors***

In [None]:
# Collect the text of every chunk record, then embed them all in one encode call.
chunk_texts = [record['text'] for record in all_chunks]
chunk_embeddings = model.encode(chunk_texts)


In [None]:
pip install grpcio==1.63.0



In [None]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


## **Create Vector Database**

In [None]:
import faiss
import numpy as np

# FAISS is fed float32 data, so cast the embedding matrix before indexing.
chunk_embeddings = np.array(chunk_embeddings, dtype='float32')

# HNSW index sized to the embedding width (second axis of the matrix),
# with 32 neighbors and efConstruction set to 200.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexHNSWFlat(dimension, 32)
index.hnsw.efConstruction = 200

# Insert every chunk vector into the index.
index.add(chunk_embeddings)

# Metadata lives in a plain in-memory list built in the same order as
# all_chunks (a database or file could replace it).
metadata = [{'web_link': record['meta']['web_link']} for record in all_chunks]


# **Retrieval and Re-ranking**

In [None]:
def expand_query(query, synonyms=None):
    """Expand a query by appending extra terms (simple synonym expansion).

    Args:
        query: The query text.
        synonyms: Optional iterable of terms to append. When omitted, the
            notebook's placeholder terms ("synonym1", "synonym2") are used,
            so single-argument calls behave exactly as before.

    Returns:
        The query followed by the terms, separated by single spaces. With no
        terms the query is returned without a trailing space.
    """
    if synonyms is None:
        # NOTE(review): placeholder terms; they are appended verbatim to the
        # text that later gets embedded. Replace with real synonyms.
        synonyms = ["synonym1", "synonym2"]
    return ' '.join([str(query), *synonyms])

# Expand the query
query = "example query"
expanded_query = expand_query(query)


In [None]:
from transformers import pipeline

# NOTE(review): expand_query is also defined in the preceding cell; this later
# definition is the one in effect after a top-to-bottom run. Consider keeping
# only one of the two.
def expand_query(query, synonyms=None):
    """Expand a query by appending extra terms (simple synonym expansion).

    Args:
        query: The query text.
        synonyms: Optional iterable of terms to append. When omitted, the
            notebook's placeholder terms ("synonym1", "synonym2") are used,
            so single-argument calls behave exactly as before.

    Returns:
        The query followed by the terms, separated by single spaces. With no
        terms the query is returned without a trailing space.
    """
    if synonyms is None:
        # NOTE(review): placeholder terms; they are appended verbatim to the
        # text that later gets embedded. Replace with real synonyms.
        synonyms = ["synonym1", "synonym2"]
    return ' '.join([str(query), *synonyms])

# Expand the query
query = "example query"
expanded_query = expand_query(query)


# **Hybrid Retrieval Methods**

In [None]:

# Encode the expanded query as a float32 batch of one for the FAISS search.
query_embedding = model.encode([expanded_query]).astype('float32')

# Never ask for more neighbors than the index holds: FAISS fills the missing
# result slots with -1, and -1 used as a Python list index would silently
# return the last chunk again instead of failing.
k = min(10, index.ntotal)
if k > 0:
    distances, indices = index.search(query_embedding, k)
    # Keep only real hits; -1 marks "no neighbor found" for that slot.
    hit_ids = [int(i) for i in indices[0] if i >= 0]
else:
    hit_ids = []

# Extract results
retrieved_chunks = [chunk_texts[i] for i in hit_ids]
retrieved_metadata = [metadata[i] for i in hit_ids]

print(f"Retrieved Chunks: {retrieved_chunks}")
print(f"Retrieved Metadata: {retrieved_metadata}")


Retrieved Chunks: ['This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging Hopper GPU Architectureâ\x80\x99s features.</p>\r\n</dd>\r\n<dt><a class="reference internal" href="ada-tuning-guide/index.html"><span class="doc">Ada Tuning Guide</span></a></dt><dd><p>The NVIDIAÂ® Ada GPU architecture is NVIDIAâ\x80\x99s latest architecture for CUDAÂ® compute applications This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging NVIDIA Ampere GPU Architectureâ\x80\x99s features.</p>\r\n</dd>\r\n<dt><a class="reference internal" href="hopper-tuning-guide/index.html"><span class="doc">Hopper Tuning Guide</span></a></dt><dd><p>Hopper GPU Architecture is NVIDIAâ\x80\x99s 9th-generation architecture for CUDA compute applications This guide summarizes the ways that applications can be fine-tuned to gain additional speedups by leveraging Turing architectural features.</p>\r\n</dd>\r\n<dt><a class=

In [None]:
# Score the retrieved chunks against the original (unexpanded) query.
query_vector = model.encode(query)
candidate_vectors = model.encode(retrieved_chunks)
reranked_results = util.semantic_search(query_vector, candidate_vectors, top_k=10)

# Only one query was searched, so take its hit list and map each hit's
# corpus_id back to the chunk text.
reranked_chunks = [retrieved_chunks[hit['corpus_id']] for hit in reranked_results[0]]


In [None]:
from transformers import pipeline

# Question-answering pipeline backed by the deepset/roberta-base-squad2 model.
qa_pipeline = pipeline(
    task='question-answering',
    model='deepset/roberta-base-squad2',
)


In [None]:
# The best-ranked chunk is the context the QA model reads the answer from.
context = reranked_chunks[0]
qa_inputs = {'question': query, 'context': context}
answer = qa_pipeline(qa_inputs)

print(f"Answer: {answer['answer']}")


Answer: NVIDIA Ampere GPU Architecture


In [None]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [None]:
import streamlit as st

# Streamlit UI: read a query, run retrieval -> re-ranking -> question
# answering, and display the answer.
st.title('Question Answering System')

query = st.text_input('Enter your query:')
if query:
    expanded_query = expand_query(query)
    query_embedding = model.encode([expanded_query]).astype('float32')

    # Never ask for more neighbors than the index holds: FAISS fills the
    # missing result slots with -1, and -1 used as a Python list index would
    # silently return the last chunk again instead of failing.
    k = min(10, index.ntotal)
    if k > 0:
        distances, indices = index.search(query_embedding, k)
        # Keep only real hits; -1 marks "no neighbor found" for that slot.
        hit_ids = [int(i) for i in indices[0] if i >= 0]
    else:
        hit_ids = []

    # Extract results
    retrieved_chunks = [chunk_texts[i] for i in hit_ids]
    retrieved_metadata = [metadata[i] for i in hit_ids]

    if retrieved_chunks:
        # Re-rank the retrieved results against the original query
        reranked_results = util.semantic_search(model.encode(query), model.encode(retrieved_chunks), top_k=10)
        reranked_chunks = [retrieved_chunks[idx['corpus_id']] for idx in reranked_results[0]]

        # Use the top re-ranked result for question answering
        context = reranked_chunks[0]
        answer = qa_pipeline({'question': query, 'context': context})

        st.write(f"Answer: {answer['answer']}")
    else:
        # Nothing came back from the index, so there is no context to answer from.
        st.write("No matching content found.")
