<h4>📥 Load scholarship data from Hugging Face dataset (parquet format)</h4>

In [None]:
! pip install pandas

In [None]:
import pandas as pd
from pprint import pprint

# Location of the scholarship parquet file on the Hugging Face Hub
PARQUET_URL = "hf://datasets/NetraVerse/indian-govt-scholarships/data/train-00000-of-00001.parquet"

# Load the file, then keep just the two columns used by the rest of the notebook
df = pd.read_parquet(PARQUET_URL)
df = df.loc[:, ['label', 'text']]

# One dict per row, keyed by column name
data = df.to_dict(orient='records')

print(f"Loaded {len(data)} scholarship documents")
pprint(data[:2])

<h4>✅ Prepare the dataset: optionally split documents into overlapping chunks</h4>

In [None]:
# ============================================
# CHUNKING (Enable/Disable by setting flag)
# ============================================
# Feature flag read by the `if ENABLE_CHUNKING:` branch further down in this cell:
#   True  -> each document is split into overlapping chunks
#   False -> documents are used whole
ENABLE_CHUNKING = True  # Change to False to disable chunking

def chunk_text(text, chunk_size=500, overlap=100):
    '''Split text into overlapping, fixed-size character chunks.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If chunk_size/overlap would prevent the window from
            advancing through the text.
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the step zero or negative (endless loop)
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already reaches the end of the text; stepping again
            # would only emit a tail that the current chunk fully contains.
            break
        start += step
    return chunks

# Apply (or skip) chunking, then preview the first record.
# `data` is reassigned to the chunked records because later cells read `data`.
# Records that already carry a 'chunk_id' were produced by an earlier run of this cell.
already_chunked = any('chunk_id' in doc for doc in data)

if ENABLE_CHUNKING:
    if not already_chunked:
        # Only split un-chunked records, so re-running this cell does not
        # chunk the chunks a second time.
        chunked_data = []
        for doc in data:
            chunks = chunk_text(doc['text'], chunk_size=500, overlap=100)
            chunked_data.extend(
                {
                    'label': doc['label'],
                    'text': chunk,
                    'chunk_id': i,
                    'total_chunks': len(chunks),
                }
                for i, chunk in enumerate(chunks)
            )

        # Replace original data with chunked data
        data = chunked_data

    print(f"\n{'='*50}")
    print("✅ CHUNKING ENABLED")
    print(f"Chunked into {len(data)} pieces")
    print(f"{'='*50}\n")

    if data:
        # Display first chunk example - FULL TEXT
        print("FIRST CHUNK EXAMPLE:")
        print(f"Label: {data[0]['label']}")
        print(f"Chunk ID: {data[0]['chunk_id']} of {data[0]['total_chunks']}")
        print(f"Text Length: {len(data[0]['text'])} characters")
        print(f"FULL TEXT:\n{data[0]['text']}")
        print(f"\n{'='*50}\n")
    else:
        print("No chunks were produced - check that the dataset loaded correctly.")
else:
    print(f"\n{'='*50}")
    print("❌ CHUNKING DISABLED - Using full documents")
    print(f"Total documents: {len(data)}")
    print(f"{'='*50}\n")

    if already_chunked:
        # The flag was switched off after a chunking run; `data` still holds chunks.
        print("Note: `data` is still chunked from an earlier run - re-run the load cell to restore full documents.")

    if data:
        print("FIRST DOCUMENT EXAMPLE:")
        print(f"Label: {data[0]['label']}")
        print(f"Text Length: {len(data[0]['text'])} characters")
        print(f"FULL TEXT:\n{data[0]['text']}")
        print(f"\n{'='*50}\n")
    else:
        print("No documents loaded - check that the dataset loaded correctly.")

<h4>📦 Install required dependencies for vector database, embeddings, and deep learning</h4>

In [None]:
! pip install qdrant-client
! pip install sentence-transformers
! pip install torch

<h4>🔧 Initialize Qdrant vector database client and SentenceTransformer embedding encoder</h4>

In [None]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorize both documents and queries
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
encoder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Vector database client backed by an in-memory Qdrant instance
qdrant = QdrantClient(":memory:")

<h4>🗄️ Create vector collection for storing scholarship embeddings with cosine similarity</h4>

In [None]:
# Name of the collection that stores the scholarship vectors
collection_name = "scholarships"

# recreate_collection() is deprecated in qdrant-client, so do the same thing
# explicitly: drop any existing collection (keeps this cell re-runnable),
# then create it fresh.
if qdrant.collection_exists(collection_name):
    qdrant.delete_collection(collection_name)

qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by the embedding model
        distance=models.Distance.COSINE,
    ),
)

<h4>⬆️ Generate embeddings for each document and upload to vector database</h4>

In [None]:
# Embed every text in one batched encode() call instead of one model call per
# record; the encoder batches internally and can show a progress bar.
texts = [doc["text"] for doc in data]  # 'text' field holds the scholarship content
vectors = encoder.encode(texts, show_progress_bar=True)

# One point per record: position in `data` as id, embedding as vector,
# and the full record as payload so it is returned with search hits.
points_to_upload = [
    models.PointStruct(id=idx, vector=vector.tolist(), payload=doc)
    for idx, (doc, vector) in enumerate(zip(data, vectors))
]

# vectorize!
qdrant.upload_points(
    collection_name=collection_name,
    points=points_to_upload,
)

<h4>⬆️ Check the embeddings</h4>

In [None]:
# Re-embed the first record so its text and vector can be inspected together
first_doc = data[0]
first_text = first_doc['text']
first_vector = encoder.encode(first_text).tolist()

# Text preview first, then the vector's size and leading values
print(
    "DOCUMENT TEXT:",
    f"Text (first 500 chars): {first_text[:500]}...",
    sep="\n",
)
print(
    "EMBEDDING VECTOR:",
    f"Vector dimension: {len(first_vector)}",
    f"First 20 values: {first_vector[:20]}",
    sep="\n",
)


<h4>💬 Define user query for testing the RAG system</h4>

In [None]:
# Sample question used by the retrieval and generation cells that follow
user_prompt = "what is the percentage reservations for women in NSPG Scheme"

<h4>🔍 Convert user query into embedding vector for semantic search</h4>

In [None]:
# Embed the question with the same encoder that embedded the documents;
# .tolist() converts the result to a plain Python list for the Qdrant query.
query_vector = encoder.encode(user_prompt).tolist()

<h4>🎯 Search vector database for top 5 most relevant scholarship documents</h4>

In [None]:
# Retrieve the 5 stored points closest to the query vector
# (the collection was created with cosine distance).
# NOTE(review): the two imports below are not used in this cell, and
# QdrantClient is already imported earlier in the notebook - confirm before removing.
from qdrant_client import QdrantClient
from qdrant_client.models import SearchParams, ScoredPoint

hits = qdrant.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=5
)

<h4>📄 Display retrieved search results with metadata and similarity scores</h4>

In [None]:
# The query response keeps its results in `.points`; pretty-print each one
for scored_point in hits.points:
    pprint(scored_point)

<h4>🤖 Load TinyLlama model and generate response WITHOUT retrieval context (baseline)</h4>

In [None]:
# For Hugging Face models
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login


# Log in to the Hugging Face Hub only when a token is supplied through the
# HF_TOKEN environment variable - never paste a token into the notebook itself.
# You can get a token from https://huggingface.co/settings/tokens
# NOTE(review): on Colab, a secret must be exported to the environment first
# (or rely on huggingface_hub's own secret lookup) - confirm for your setup.
try:
    hf_token = os.environ.get("HF_TOKEN", "")
    if hf_token:
        login(token=hf_token)
        print("Successfully logged into Hugging Face Hub.")
    else:
        print("Warning: HF_TOKEN environment variable is not set. Some models might require authentication")
except Exception as e:
    # Best effort: a failed login should not stop the notebook
    print(f"Error during Hugging Face login: {e}. Some models might not load.")


# Set up device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load TinyLlama model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Baseline prompt: the user question only, with no retrieved documents attached
prompt = [
    {"role": "system", "content": "You are a helpful chatbot specializing in Indian government scholarships. Your top priority is to help users find relevant scholarship information and guide them with their queries. ONLY use information from the retrieved documents"},
    {"role": "user", "content": user_prompt},
]
inputs = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Prompt plus generated tokens must fit TinyLlama's 2048-token context, so the
# generation budget is capped at 500 (same budget as the RAG cell, for a fair comparison).
outputs = model.generate(**inputs, max_new_tokens=500)
pprint("Response without RAG and with TinyLlama:")

# Decode only the newly generated tokens and drop markers such as </s>
prompt_length = inputs["input_ids"].shape[-1]
pprint(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))

<h4>📋 Extract payload data from search results for RAG augmentation</h4>

In [None]:
# Keep only the payload of each hit (the record uploaded alongside its vector);
# this list is what the RAG prompt embeds as retrieved context.
search_results = [hit.payload for hit in hits.points]

<h4>✨ Generate response WITH retrieval context (RAG-enhanced)</h4>

In [None]:
# Use the already-loaded model and tokenizer from the previous cell
# No need to reload the model - just create a new prompt with RAG context

# System prompt carries the retrieved payloads; the user turn is the same question
prompt = [
    {"role": "system", "content": f"You are a helpful chatbot specializing in Indian government scholarships. Use the following retrieved documents to answer the user's question accurately. ONLY use information from the retrieved documents.\n\nRetrieved Documents:\n{str(search_results)}"},
    {"role": "user", "content": user_prompt},
]
inputs = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
pprint("Response with RAG and with TinyLlama:")

outputs = model.generate(**inputs, max_new_tokens=500)

# Decode only the newly generated tokens and drop markers such as </s>
prompt_length = inputs["input_ids"].shape[-1]
pprint(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))

In [None]:
# ============================================
# DEEP DIVE: Analyze Retrieved Chunks
# ============================================
# For every retrieved payload, print its label, chunk position (when present),
# text length, and the beginning and end of its text.

print("🔍 RETRIEVED CHUNKS ANALYSIS:")
print("=" * 80)

for i, result in enumerate(search_results, 1):
    text = result['text']
    print(f"\n📄 CHUNK {i}:")
    print(f"Label: {result['label']}")
    # chunk metadata is absent when chunking was disabled, hence .get() with a fallback
    print(f"Chunk ID: {result.get('chunk_id', 'N/A')} of {result.get('total_chunks', 'N/A')}")
    print(f"Text length: {len(text)} characters")
    print("\n📝 FIRST 500 CHARACTERS OF TEXT:")
    print(text[:500])
    print("\n📝 LAST 200 CHARACTERS OF TEXT:")
    print(text[-200:])
    print("-" * 80)

In [None]:
! pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset using Hugging Face datasets library
print("Loading dataset from Hugging Face...")
dataset = load_dataset("NetraVerse/indian-govt-scholarships", split="train")

# Summarize the split: row count, column schema, and the first record
print("\n📊 DATASET INFORMATION:")
print("=" * 80)
print(f"Number of rows: {len(dataset)}")
print(f"Features/Columns: {dataset.features}")
print("\n🔍 First record:")
print("-" * 80)
pprint(dataset[0])
print("\n" + "=" * 80)

<h4>🔍 Verify RAG Output - Check for Hallucinations</h4>
<p>Compare the retrieved documents with the model's response to ensure accuracy</p>

<h4>🌐 Launch interactive Gradio chatbot interface with full RAG pipeline</h4>

In [None]:
import gradio as gr

def scholarship_chatbot(message, history):
    """Answer a scholarship question using retrieval-augmented generation.

    Embeds the message, fetches the 3 closest records from the Qdrant
    collection, and asks the loaded chat model to answer with those records
    placed in the system prompt.

    Args:
        message: The user's latest chat message.
        history: Earlier turns passed in by gr.ChatInterface; not used, so
            every question is answered independently.

    Returns:
        The model's reply as plain text, or a short hint when the message
        is empty.
    """
    # Skip retrieval and generation entirely for blank input
    if not message or not message.strip():
        return "Please enter a question about Indian government scholarships."

    # Encode user query
    query_vector = encoder.encode(message).tolist()

    # Search for relevant scholarships
    hits = qdrant.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=3,
    )
    search_results = [hit.payload for hit in hits.points]

    # Generate response with LLM
    prompt = [
        {"role": "system", "content": f"You are a helpful chatbot specializing in Indian government scholarships. Use the following retrieved documents to answer accurately:\n\nRetrieved Documents:\n{str(search_results)}"},
        {"role": "user", "content": message},
    ]

    inputs = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=250)

    # Decode only the newly generated tokens; skip_special_tokens keeps
    # markers such as </s> out of the chat window.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

# Launch Gradio interface
# ChatInterface calls scholarship_chatbot(message, history) for every user turn;
# the examples are offered as ready-made questions in the UI.
demo = gr.ChatInterface(
    scholarship_chatbot,
    title="🎓 Indian Government Scholarship Chatbot",
    description="Ask me about Indian government scholarships!",
    examples=[
        "What scholarships are available for engineering students?",
        "Tell me about AICTE scholarships",
        "Are there scholarships for women in STEM?",
    ],
)

demo.launch()