# **Install Required Libraries**

In [1]:
!pip install llama-index llama-index-core llama-index-embeddings-huggingface pinecone


Collecting llama-index
  Downloading llama_index-0.12.36-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-core
  Downloading llama_index_core-0.12.36-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.4-py3-none-any.whl.metadata (458 bytes)
Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.7-py3-none-any.whl.metadata (438 bytes)
Collecting llama-index-cli<0.5,>=0.4.1 (from llama-index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-embeddings-openai<0.4,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.11-py3-non

In [2]:
# [REDACTED] Pinecone API key removed — never keep credentials in notebook source, and rotate the key that was committed here.
# multilingual-chatbot

# **Import Necessary Modules**

In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from pinecone import Pinecone

# **Configure Pinecone**

In [33]:
import os
from getpass import getpass

# Never hard-code the API key: read it from the PINECONE_API_KEY environment
# variable and fall back to an interactive (non-echoing) prompt when it is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
PINECONE_ENV = "us-east-1-aws"

# Initialize the Pinecone client used by every index operation in this notebook
pc = Pinecone(api_key=PINECONE_API_KEY)


In [34]:
from pinecone import ServerlessSpec

In [35]:
# Name of the Pinecone index that the reset/creation cells below operate on.
index_name = "mental-health-chatbot"

In [37]:
# Optional reset: drop the index when one with this name already exists
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

In [38]:
# Create the serverless Pinecone index (384-dimensional vectors, cosine metric)
# and keep a handle to it for the upload that follows.
pc.create_index(
    index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index = pc.Index(index_name)

# **Load Multilingual PDFs (English + Urdu)**

In [39]:
# Source PDFs to ingest (file names indicate one English and one Urdu document)
pdf_paths = [
    "/content/mental_health_en.pdf",
    "/content/mental_health_ur.pdf",
]
reader = SimpleDirectoryReader(input_files=pdf_paths)
documents = reader.load_data()


# **Parse and Chunk Text**

**Step A: Chunking Strategy 1 – Fixed-Length Chunking**

In [40]:
# Split the loaded documents into nodes with chunk_size=512 and chunk_overlap=30
# (units are presumably tokens — TODO confirm against the LlamaIndex node-parser docs).
node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=30)
nodes = node_parser.get_nodes_from_documents(documents)


**Step B: Chunking Strategy 2 – Sentence-Based Chunking**

In [47]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Sentence-based chunking (e.g., 3 sentences per chunk)
def sentence_based_chunking(text, n=3):
    """Split ``text`` into chunks of ``n`` consecutive sentences.

    Sentences are detected with NLTK's ``sent_tokenize`` and the sentences of
    each chunk are joined with a single space. The last chunk holds fewer than
    ``n`` sentences when the sentence count is not a multiple of ``n``.

    Args:
        text: The raw text to split.
        n: Number of sentences per chunk; must be at least 1.

    Returns:
        A list of chunk strings (empty when no sentences are found).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Validate up front: n == 0 would otherwise surface as an opaque range()
    # error and a negative n would silently yield no chunks at all.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # NOTE(review): sent_tokenize is called with its default language here;
    # Urdu sentence boundaries ("۔") may not be detected — verify on the Urdu PDF.
    sentences = sent_tokenize(text)
    return [' '.join(sentences[i:i + n]) for i in range(0, len(sentences), n)]

# Chunk every loaded document three sentences at a time and pool the results
sentence_chunks = []
for doc in documents:
    sentence_chunks += sentence_based_chunking(doc.text, n=3)

print(f"Total sentence-based chunks: {len(sentence_chunks)}")


Total sentence-based chunks: 21


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Embed with Both Models and Upload (Fixed-Length Chunks)**

**1. Sentence-BERT Embedding and Upload**

In [42]:
# Embed every fixed-length chunk with multilingual Sentence-BERT and store it in Pinecone.
sbert_model = HuggingFaceEmbedding("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Resolve the target index by name so the upload does not depend on whatever the
# shared `index` variable happens to point to when this cell is (re-)run.
sbert_index = pc.Index(index_name)

sbert_upsert = []
for i, node in enumerate(nodes):
    text = node.get_content()
    node.embedding = sbert_model.get_text_embedding(text)
    node.metadata = {
        # Keep only a preview of the chunk as metadata
        "source_text": text[:300],
        # Heuristic: the chunk is tagged Urdu when it contains either of these common Urdu words
        "language": "urdu" if "ہے" in text or "کو" in text else "english",
        "model": "sentence-bert",
        "chunking": "fixed"
    }
    sbert_upsert.append({
        "id": f"sbert-{i}",
        "values": node.embedding,
        "metadata": node.metadata
    })

# Upload in bounded batches: a single request carrying every vector can exceed
# Pinecone's per-request limits once the corpus grows.
SBERT_UPSERT_BATCH_SIZE = 100
for start in range(0, len(sbert_upsert), SBERT_UPSERT_BATCH_SIZE):
    sbert_index.upsert(vectors=sbert_upsert[start:start + SBERT_UPSERT_BATCH_SIZE])
print("✅ SBERT embeddings stored successfully!")

✅ SBERT embeddings stored successfully!


**2. DistilBERT Embedding and Upload**

In [43]:
# Name of the second Pinecone index, which the DistilBERT reset/creation cells below operate on.
index_name2 = "distilbert-index"

In [44]:
# Optional reset: drop the DistilBERT index when one with this name already exists
existing_index_names = pc.list_indexes().names()
if index_name2 in existing_index_names:
    pc.delete_index(index_name2)

In [45]:
# Create the serverless Pinecone index for the DistilBERT vectors
# (768-dimensional, cosine metric).
pc.create_index(index_name2, dimension=768, metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"))

# Bind the handle to `distil_index`, the name the DistilBERT upload cell and
# similarity_search() read. Reusing the generic `index` name here would clobber
# the handle to the 384-dimensional index and leave `distil_index` undefined
# on a fresh top-to-bottom run.
distil_index = pc.Index(index_name2)

In [46]:
# Embed every fixed-length chunk with multilingual DistilBERT and store it in Pinecone.
distil_model = HuggingFaceEmbedding("distilbert-base-multilingual-cased")

# Resolve the target index here so this cell runs on a fresh kernel: it must not
# rely on a `distil_index` variable created by a cell further down the notebook.
distil_index = pc.Index(index_name2)

distil_upsert = []
for i, node in enumerate(nodes):
    text = node.get_content()
    node.embedding = distil_model.get_text_embedding(text)
    node.metadata = {
        # Keep only a preview of the chunk as metadata
        "source_text": text[:300],
        # Heuristic: the chunk is tagged Urdu when it contains either of these common Urdu words
        "language": "urdu" if "ہے" in text or "کو" in text else "english",
        "model": "distilbert",
        "chunking": "fixed"
    }
    distil_upsert.append({
        "id": f"distil-{i}",
        "values": node.embedding,
        "metadata": node.metadata
    })

# Upload in bounded batches: a single request carrying every vector can exceed
# Pinecone's per-request limits once the corpus grows.
DISTIL_UPSERT_BATCH_SIZE = 100
for start in range(0, len(distil_upsert), DISTIL_UPSERT_BATCH_SIZE):
    distil_index.upsert(vectors=distil_upsert[start:start + DISTIL_UPSERT_BATCH_SIZE])
print("✅ DistilBERT embeddings stored successfully!")



✅ DistilBERT embeddings stored successfully!


# **Similarity Search Testing Script**

In [56]:
def similarity_search(query, model_name="sbert", top_k=3, language=None):
    """Embed ``query``, look up its nearest chunks in Pinecone and print them.

    Args:
        query: Text to search for.
        model_name: ``"sbert"`` or ``"distilbert"``; selects both the embedding
            model and the index that is queried.
        top_k: Number of matches requested from the index.
        language: When given, only chunks whose ``language`` metadata equals
            this value are searched; otherwise no metadata filter is applied.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    # Reject unknown models before touching any model or index
    if model_name not in ("sbert", "distilbert"):
        raise ValueError("Unsupported model")

    # The model/index pairs are module-level globals defined in other cells
    if model_name == "sbert":
        query_vector = sbert_model.get_text_embedding(query)
        target_index = sbert_index
    else:
        query_vector = distil_model.get_text_embedding(query)
        target_index = distil_index

    metadata_filter = None
    if language:
        metadata_filter = {"language": language}

    response = target_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter,
    )

    print(f"\n🧠 Top {top_k} results for query: '{query}' using {model_name.upper()}:\n")
    separator = "-" * 60
    for rank, hit in enumerate(response['matches'], start=1):
        print(f"Result #{rank} | Score: {hit['score']:.4f}")
        print("Chunk:", hit['metadata']['source_text'])
        print(separator)


In [57]:
# Handles to the two Pinecone indexes; similarity_search() reads these as globals.
# The literal names match index_name and index_name2 defined earlier in the notebook.
sbert_index = pc.Index("mental-health-chatbot")
distil_index = pc.Index("distilbert-index")

In [61]:
# Run the same English and Urdu question against each model, filtering the
# search to chunks tagged with the matching language.
for display_name, model_key in (("SBERT", "sbert"), ("DistilBERT", "distilbert")):
    for lang, question in (
        ("english", "What are symptoms of anxiety?"),
        ("urdu", "ذہنی دباؤ کی علامات کیا ہیں؟"),
    ):
        print(f"{display_name} -> {lang.upper()}")
        similarity_search(question, model_name=model_key, language=lang)

SBERT -> ENGLISH

🧠 Top 3 results for query: 'What are symptoms of anxiety?' using SBERT:

Result #1 | Score: 0.1734
Chunk: School-based social and emotional learning programmes are among the most 
effective promotion strategies for countries at all income levels. 
Promoting and protecting mental health at work is a growing area of interest and can be 
supported through legislation and regulation, organizational strategi
------------------------------------------------------------
Result #2 | Score: 0.1529
Chunk: parenting and physical punishment is known to undermine child health and bullying is a leading 
risk factor for mental health conditions. 
Protective factors similarly occur throughout our lives and serve to strengthen resilience. They 
include our individual social and emotional skills and attribut
------------------------------------------------------------
Result #3 | Score: 0.1386
Chunk: Mental health 
 
Key facts 
• Affordable, effective and feasible strategies exist to p