In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Set API keys
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

# Langsmith Tracking
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING_V2"] = "true"  # Correctly setting the tracing variable
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

In [10]:
! nvidia-smi

Tue Sep 17 17:22:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...    Off | 00000000:01:00.0 Off |                  N/A |
| N/A   47C    P8               4W /  60W |   1209MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
import os
from langchain_community.vectorstores import FAISS
#import fitz
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
#from langchain_chroma import Chroma
#from langchain_openai import OpenAIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
import torch

In [4]:
pdf_file_folder="/home/swaroop/Documents/Coding/LLMS/Ayurveda Bot/Data2"
os.listdir(pdf_file_folder)

['Amrita Sondhi - The Modern Ayurvedic Cookbook_ Healthful, Healing Recipes for Life (2006, Arsenal Pulp Press) - libgen.li_removed.pdf']

In [5]:
import torch
print(torch.cuda.is_available())

True


In [6]:
def HFEmbeddings():
    model_name = "sentence-transformers/all-mpnet-base-v2"
    model_kwargs = {'device': 'cuda'}
    encode_kwargs = {'normalize_embeddings': False}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    return embeddings
    

In [7]:
def create_vector_database(pdf_folder_path):
    # Constants
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    loader = PyPDFDirectoryLoader(pdf_file_folder)
    docs=loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len)
    splits = text_splitter.split_documents(docs)
    vectorstore = FAISS.from_documents(documents=splits, embedding=HFEmbeddings())
    vectorstore.save_local("./faiss_index")
    retriever = vectorstore.as_retriever()
    return retriever 

In [8]:
os.getcwd()

'/home/swaroop/Documents/Coding/LLMS/Ayurveda Bot'

In [9]:
retriever=create_vector_database(pdf_file_folder)

  from tqdm.autonotebook import tqdm, trange


In [11]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="cuda", 
                                             torch_dtype="auto", 
                                             trust_remote_code=True)
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10
)
llm = HuggingFacePipeline(pipeline=pipe)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [13]:
def Question_answer(retriever,llm,query):
    # Define the RAG prompt template
    # RAG
    template = """Answer the following question based on this context.If you feel like you don't have enough information to answer the question, say "Data Not Available":
    {context}
    
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    #prompt = hub.pull("rlm/rag-prompt")
    rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
    )
    response=rag_chain.invoke(query)#.replace("\n"," ")
    return response
    
    

In [15]:
query="what are the symptoms for fever?"
resp=Question_answer(retriever,llm,query)
print(resp)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 7.75 GiB of which 1024.00 KiB is free. Including non-PyTorch memory, this process has 7.74 GiB memory in use. Of the allocated memory 7.57 GiB is allocated by PyTorch, and 43.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)