In [8]:
# Pinned Python dependencies for the Chainlit + LangChain app that a later cell writes to app.py.
!pip install transformers==4.41.2
!pip install bitsandbytes==0.43.1
!pip install accelerate==0.31.0
!pip install langchain==0.2.5
!pip install langchainhub==0.1.20
!pip install langchain-chroma==0.1.1
!pip install langchain-community==0.2.5
!pip install langchain_huggingface==0.0.3
!pip install python-dotenv==1.0.1
!pip install pypdf==4.2.0
# NOTE(review): numpy is the only Python package in this cell without a version pin — consider pinning it for reproducibility.
!pip install numpy
!pip install chainlit==1.1.304


# Install npm packages
# NOTE(review): localtunnel is installed here, but the cells visible in this notebook expose the app through pyngrok — confirm it is still needed.
!npm install -g localtunnel
!npm install -g npm


Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cublas_

In [1]:
%%writefile app.py

import chainlit as cl
import torch

from chainlit.types import AskFileResponse

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer , AutoModelForCausalLM , pipeline
from langchain_huggingface.llms import HuggingFacePipeline

from langchain.memory import ConversationBufferMemory
from langchain_community.chat_message_histories import ChatMessageHistory

from langchain.chains import ConversationalRetrievalChain

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader , TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub


# Module-level helpers created once at import time and reused by process_file / get_vector_db.
# Chunks are capped at chunk_size=1000 with a 100-unit overlap between neighbours
# (presumably measured in characters, the splitter's default length function — confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size =1000, chunk_overlap = 100)
# No model name is passed, so the library's default embedding model is used.
embedding = HuggingFaceEmbeddings()


def process_file(file: AskFileResponse):
    """Load an uploaded text or PDF file and split it into tagged chunks.

    Args:
        file: The uploaded file. Its ``type`` (MIME type) selects the loader
            and its ``path`` is handed to that loader.

    Returns:
        The list of chunks produced by ``text_splitter.split_documents``.
        Every chunk has ``metadata["source"]`` set to ``"source_<index>"``.
        The list is empty when the splitter yields no chunks.

    Raises:
        ValueError: If ``file.type`` is neither ``text/plain`` nor
            ``application/pdf``.
    """
    loaders_by_type = {
        "text/plain": TextLoader,
        "application/pdf": PyPDFLoader,
    }
    loader_cls = loaders_by_type.get(file.type)
    if loader_cls is None:
        # Fail with a clear message instead of hitting an unbound local below.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    documents = loader_cls(file.path).load()
    docs = text_splitter.split_documents(documents)

    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"

    # Return only after *every* chunk has been tagged; returning from inside
    # the loop would stop after the first chunk and yield None for no chunks.
    return docs


def get_vector_db(file: AskFileResponse):
    """Index an uploaded file in a Chroma vector store.

    The file is chunked with ``process_file``; the chunks are stored on the
    Chainlit user session under the key ``"docs"`` and then embedded with the
    module-level ``embedding`` model.

    Args:
        file: The uploaded file to index.

    Returns:
        The Chroma store built from the file's chunks.
    """
    chunks = process_file(file)
    # Stash the chunks on the session before building the index.
    cl.user_session.set("docs", chunks)
    return Chroma.from_documents(documents=chunks, embedding=embedding)


def get_huggingface_llm(model_name: str = "lmsys/vicuna-7b-v1.5", max_new_token: int = 512):
    """Build a LangChain LLM backed by a 4-bit quantized Hugging Face model.

    Args:
        model_name: Hugging Face model identifier used for both the model
            and its tokenizer.
        max_new_token: Passed to the pipeline as ``max_new_tokens``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.
    """
    # 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        low_cpu_mem_usage=True,
    )
    tok = AutoTokenizer.from_pretrained(model_name)

    # The end-of-sequence token doubles as the padding token id.
    pipeline_options = {
        "model": causal_lm,
        "tokenizer": tok,
        "max_new_tokens": max_new_token,
        "pad_token_id": tok.eos_token_id,
        "device_map": "auto",
    }
    generator = pipeline("text-generation", **pipeline_options)

    return HuggingFacePipeline(pipeline=generator)

# Load the model once when app.py is imported; on_chat_start passes this same
# instance to every chain it builds, so no chat session reloads the weights.
LLM = get_huggingface_llm()


# Prompt shown by the file-upload request that on_chat_start sends at the start of a chat.
welcome_message = """Welcome to the PDF QA! To get started:
1. Upload a PDF or text file
2. Ask a question about the file
"""


@cl.on_chat_start
async def on_chat_start():
    """Set up a chat session: request a file, index it, and build the QA chain.

    The upload prompt is re-sent until it returns a result. The first uploaded
    file is indexed via ``get_vector_db`` and a ``ConversationalRetrievalChain``
    with conversation memory is stored on the user session under ``"chain"``.
    """
    uploads = None
    while uploads is None:
        # send() yielding None re-issues the prompt.
        upload_request = cl.AskFileMessage(
            content=welcome_message,
            accept=["text/plain", "application/pdf"],
            max_size_mb=20,
            timeout=180,
        )
        uploads = await upload_request.send()

    uploaded = uploads[0]

    status = cl.Message(content=f"Processing '{uploaded.name}'...", disable_feedback=True)
    await status.send()

    # get_vector_db is synchronous; wrap it so it can be awaited here.
    build_index = cl.make_async(get_vector_db)
    index = await build_index(uploaded)

    conversation_memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=ChatMessageHistory(),
        return_messages=True,
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=LLM,
        chain_type="stuff",
        retriever=index.as_retriever(search_type="mmr", search_kwargs={'k': 3}),
        memory=conversation_memory,
        return_source_documents=True,
    )

    # Reuse the status message to announce that the file is ready.
    status.content = f"'{uploaded.name}' processed. You can now ask questions!"
    await status.update()

    cl.user_session.set("chain", qa_chain)


@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user message with the session's retrieval chain.

    The reply contains the chain's ``answer`` followed by either the list of
    attached source names or ``No sources found``. Each retrieved document is
    attached as a ``cl.Text`` element named ``source_<index>``.

    Args:
        message: The incoming user message; its ``content`` is the question.
    """
    chain = cl.user_session.get("chain")
    if chain is None:
        # No chain has been stored for this session yet, so there is nothing to query.
        await cl.Message(content="Please upload a file before asking questions.").send()
        return

    cb = cl.AsyncLangchainCallbackHandler()
    # NOTE(review): Chain.ainvoke may only read callbacks from
    # config={"callbacks": [...]}; confirm this keyword is honoured.
    res = await chain.ainvoke(message.content, callbacks=[cb])

    answer = res["answer"]
    source_documents = res["source_documents"]

    text_elements = [
        cl.Text(content=source_doc.page_content, name=f"source_{source_idx}")
        for source_idx, source_doc in enumerate(source_documents or [])
    ]

    # Branch on whether anything was retrieved, so the "no sources" note is
    # reachable when the retriever returns nothing.
    if text_elements:
        source_names = [text_el.name for text_el in text_elements]
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()

Writing app.py


## MAIN


In [9]:
# pyngrok supplies the ngrok client used by the following cells.
# NOTE(review): unlike the other installs in this notebook, this one is not version-pinned.
!pip install pyngrok -q

In [10]:
from pyngrok import ngrok
# Register your ngrok authtoken: replace the <url_token> placeholder with your own token.
# Do not commit the real token with the notebook.
!ngrok config add-authtoken <url_token>

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Open an ngrok tunnel to local port 8000 and print its public URL.
# NOTE(review): presumably 8000 is the port `chainlit run` serves on — confirm.
public_url = ngrok.connect(8000).public_url
print(public_url)

In [12]:
# Launch the Chainlit app that the %%writefile cell saved as app.py.
!chainlit run app.py

2024-07-05 06:22:24 - Created default config file at /content/.chainlit/config.toml
2024-07-05 06:22:24 - Created default translation directory at /content/.chainlit/translations
2024-07-05 06:22:24 - Created default translation file at /content/.chainlit/translations/en-US.json




2024-07-05 06:22:34.618600: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 06:22:34.618654: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 06:22:34.763450: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-05 06:22:35.070532: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




2024-07-05 06:22:46 - Use pytorch device_name: cuda
2024-07-05 06:22:46 - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
modules.json: 100% 349/349 [00:00<00:00, 1.81MB/s]
config_sentence_transformers.json: 100% 116/116 [00:00<00:00, 725kB/s]
README.md: 100% 10.6k/10.6k [00:00<00:00, 45.9MB/s]
sentence_bert_config.json: 100% 53.0/53.0 [00:00<00:00, 341kB/s]
config.json: 100% 571/571 [00:00<00:00, 3.58MB/s]
model.safetensors: 100% 438M/438M [00:02<00:00, 149MB/s] 
tokenizer_config.json: 100% 363/363 [00:00<00:00, 2.21MB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 50.8MB/s]
tokenizer.json: 100% 466k/466k [00:00<00:00, 725kB/s]
special_tokens_map.json: 100% 239/239 [00:00<00:00, 1.35MB/s]
1_Pooling/config.json: 100% 190/190 [00:00<00:00, 1.25MB/s]
config.json: 100% 615/615 [00:00<00:00, 3.79MB/s]
pytorch_model.bin.index.json: 100% 26.8k/26.8k [00:00<00:00, 96.3MB/s]
Downloading shards:   0% 0/2 [00:00<?, ?it/s]
pytorch_model-00001-of-00002.bin:   0% 0.00/9.98G