In [2]:
%pwd

'f:\\Generative AI\\LangChain Projects\\Medical-Chatbot-Generative-AI\\research'

In [3]:
import os
os.chdir('../')
%pwd

'f:\\Generative AI\\LangChain Projects\\Medical-Chatbot-Generative-AI'

In [4]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                    glob='*.pdf',
                    loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents
    

In [6]:
extracted_data = load_pdf_file(data = 'Data/')

In [7]:
#extracted_data

In [8]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [9]:
text_chunks = text_split(extracted_data)
len(text_chunks)

6970

In [10]:
#text_chunks

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
def download_huggingface_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [13]:
embeddings = download_huggingface_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
query_vector = embeddings.embed_query("hello World")
print("length",len(query_vector))
print(query_vector)

length 384
[-0.03447727486491203, 0.03102317824959755, 0.006734970025718212, 0.026108985766768456, -0.03936202451586723, -0.16030244529247284, 0.06692401319742203, -0.006441489793360233, -0.0474504791200161, 0.014758856035768986, 0.07087527960538864, 0.05552763119339943, 0.019193334504961967, -0.026251312345266342, -0.01010954286903143, -0.02694045566022396, 0.022307461127638817, -0.022226648405194283, -0.14969263970851898, -0.017493007704615593, 0.00767625542357564, 0.05435224249958992, 0.0032543970737606287, 0.031725890934467316, -0.0846213847398758, -0.02940601296722889, 0.05159561336040497, 0.04812406003475189, -0.0033148222137242556, -0.058279167860746384, 0.04196927323937416, 0.022210685536265373, 0.1281888335943222, -0.022338971495628357, -0.011656315997242928, 0.06292839348316193, -0.032876335084438324, -0.09122604131698608, -0.031175347045063972, 0.0526994913816452, 0.04703482985496521, -0.08420311659574509, -0.030056199058890343, -0.02074483036994934, 0.009517835453152657, -0

In [15]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
#OPENAI_API_KEY

In [16]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import  ServerlessSpec

index_name = 'medicalbot'
pc = Pinecone(api_key=PINECONE_API_KEY)
pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-65y5jbg.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [17]:
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [18]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [19]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [20]:
#docsearch

In [21]:
retreiver = docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})

In [22]:
retrieved_doc = retreiver.invoke("what is Acne?")

In [23]:
retrieved_doc

[Document(id='ea50c75e-ffb7-435e-895c-59bccfddd3e0', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 425.0, 'page_label': '426', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\MEDICINE_SECOND_EDITION.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteriod—A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(id='f93206e6-dee0-4aff-912f-80cdcc199ef2', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords':

In [24]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4,max_tokens=500)

In [25]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompts = (
    "You are an assistant for Q/A tasks."
    "Use the following peace of retreived for answers"
    "the questions,If you don't know the answer,say that"
    "you don't know,use 3 sentence fmaximum"
    "and keep the naswer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
    ('system',system_prompts),
    ('human',"{input}"),
    ]
)

In [26]:
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain=create_retrieval_chain(retreiver,question_answer_chain)

In [31]:
import time
from openai import RateLimitError

max_retries = 5
retry_delay = 2  # seconds

for i in range(max_retries):
    try:
        response = rag_chain.invoke({"input": "what is Acne?"})
        break
    except RateLimitError as e:
        print(f"Rate limited. Retry {i + 1}/{max_retries} in {retry_delay} seconds.")
        time.sleep(retry_delay)
        retry_delay *= 2  # exponential backoff


# Usage
response


Rate limited. Retry 1/5 in 2 seconds.
Rate limited. Retry 2/5 in 4 seconds.
Rate limited. Retry 3/5 in 8 seconds.
Rate limited. Retry 4/5 in 16 seconds.
Rate limited. Retry 5/5 in 32 seconds.


NameError: name 'response' is not defined