**install required libraries**

In [1]:
!pip install pinecone-client open-clip-torch transformers PyPDF2 torch


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting open-clip-torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl.metadata (31 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading open_clip_torch-2.29.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5

In [2]:
import os
import PyPDF2

def extract_text_from_pdfs(pdf_folder):
    pdf_texts = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
                pdf_texts.append((filename, text))
    return pdf_texts

# Example: Set the path to your folder containing PDFs
pdf_folder = "/content"
pdf_data = extract_text_from_pdfs(pdf_folder)


In [3]:
def split_into_chunks(text, chunk_size=500):
    """
    Split the text into smaller chunks for better embedding generation.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

processed_data = []
for filename, content in pdf_data:
    chunks = split_into_chunks(content)
    for i, chunk in enumerate(chunks):
        processed_data.append({"id": f"{filename}-{i}", "text": chunk})


In [8]:
!pip install --upgrade sympy


Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.13.3


In [None]:
import torch
import open_clip
from pinecone import Pinecone

# Initialize the open_clip model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, _, _ = open_clip.create_model_and_transforms(
    'hf-hub:laion/CLIP-ViT-B-32-laion2B-s34B-b79K'
)
tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-32-laion2B-s34B-b79K')
model = model.to(device)

# Generate embeddings
def generate_embeddings(texts, model, tokenizer, device):
    model.eval()
    with torch.no_grad():
        tokenized = tokenizer(texts)
        embeddings = model.encode_text(tokenized.to(device))
        return embeddings.cpu().numpy()

# Initialize Pinecone

# Upload data
for data in processed_data:
    embedding = generate_embeddings([data["text"]], model, tokenizer, device)[0]
    index.upsert([(data["id"], embedding, {"text": data["text"]})])


 **Initialize API Keys**

In [None]:
import openai

# Set up OpenAI API Key
#openai.api_key = "XYZ"

def query_chatbot(query, model, tokenizer, index, device):
    # Step 1: Generate query embedding
    query_embedding = generate_embeddings([query], model, tokenizer, device)[0].tolist()  # Convert to Python list

    # Step 2: Query Pinecone to retrieve relevant documents
    results = index.query(
        vector=query_embedding,
        top_k=5,
        include_metadata=True
    )

    # Step 3: Print and inspect the metadata structure of the matches
    for match in results["matches"]:
        print(match["metadata"])  # Inspect the metadata keys

    # Step 4: Construct the prompt for ChatGPT
    prompt = "You are an expert on epilepsy. Based on the following documents, answer the question.\n\n"
    for match in results["matches"]:
        # Replace 'text' with the correct key from your metadata
        prompt += f"Document: {match['metadata']['text']}\n\n"  # Modify based on the inspection result
    prompt += f"Question: {query}\nAnswer:"

    # Step 5: Query ChatGPT for a response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a medical assistant with expertise in epilepsy."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,
        temperature=0.7
    )

    # Step 6: Extract and return the response text
    return response['choices'][0]['message']['content'].strip()


In [17]:
response = query_chatbot(
    query="What are the symptoms of epilepsy?",
    model=model,
    tokenizer=tokenizer,
    index=index,
    device=device
)
print(response)


{'content': 'mildest types of epilepsy. • It is almost always outgrown by puberty and is never diagnosed in adults. • Seizures tend to occur during sleep and are most often simple partial motor seizures that involve the face and secondarily gener-alized (grand mal) seizures. Type 3: Symptomatic Generalized Epilepsy • This is caused by widespread brain damage, and injury during birth is the most common cause. • When the cause of symptomatic general epilepsy cannot be identi- ﬁ ed, the disorder may be referred to as cryptogenic epilepsy. • Speciﬁ c, inherited brain diseases, such as adrenoleukodystrophy (ADL, which was featured in the movie “Lorenzo’s Oil”), or brain infections (such as meningitis and encephalitis) can also cause symptomatic generalized epilepsy. • In addition to seizures, these patients often have other neurologi- cal problems, such as developmental delay, mental retardation or cerebral palsy. • These epilepsies include different subtypes—the most typical is the Lennox-

KeyError: 'text'