<a href="https://colab.research.google.com/github/highplainscomputing/Mistral-Gradio-fine-tuning/blob/main/RAG_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
# Install everything the notebook imports below: LangChain and its community
# integrations, the embedding / vector-store packages, the Hugging Face model
# stack, and Gradio for the UI. Each line is an IPython shell escape (`!`).
# NOTE(review): most packages are unpinned and transformers/accelerate are
# installed from the git main branch, so results may differ between runs.
!pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai chromadb bs4
!pip install -q sentence-transformers
# !pip install -q faiss-cpu
!pip install -q torch datasets
!pip install -q pypdf
!pip install -q tqdm
!pip install -q faiss-gpu
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q gradio
!pip install -q trl==0.4.7
!pip install -q ipywidgets==7.7.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m807.5/807.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.9/256.9 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

# Import Dependencies

In [2]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader
from io import StringIO
from datasets import load_dataset, Dataset
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain import HuggingFaceHub
import gradio as gr
import os
import shutil
from langchain.llms import HuggingFacePipeline
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
from huggingface_hub import login
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Global Variables

In [18]:
# Quantization settings forwarded to BitsAndBytesConfig when the model is loaded.
LOAD_IN_4BIT = True
BNB_4BIT_USE_DOUBLE_QUANT = True
BNB_4BIT_QUANT_TYPE = "nf4"

# Tokenizer settings forwarded to AutoTokenizer.from_pretrained.
ADD_EOS_TOKEN = True
PADDING_SIDE = "left"
ADD_BOS_TOKEN = True

# Read the token from the environment when available so a real secret never has
# to be saved inside the notebook; the placeholder keeps the old behaviour.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY", "Your_Huggingface_API_KEY")

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the cell was executed a second time in the same runtime.
os.makedirs("data", exist_ok=True)

# Parameters

In [5]:
class RAG_Parameters:
  """Holds every user-tunable setting of the RAG demo.

  The values are plain attributes; each one has a matching ``set_<name>``
  method so it can be used directly as a UI callback.
  """

  def __init__(
      self,
      files = None,
      question_col_name = "Context",
      answer_col_name = "Response",
      model_name = "mistralai/Mistral-7B-v0.1",
      prompt = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.",
      embedding_model = "sentence-transformers/all-mpnet-base-v2",
      temperature = 0.7,
      max_new_tokens = 128,
      repetition_penalty = 1.15,
      top_k = 1,
      top_p = 0.75,
      k_context = 4,
      chunk_size = 512,
      chunk_overlap = 100,
  ):
    """Store the given values (or their defaults) on the instance."""
    # Dataset
    self.files, self.question_col_name, self.answer_col_name = (
        files, question_col_name, answer_col_name)

    # Model
    self.model_name = model_name

    # Prompt and embedding
    self.prompt, self.embedding_model = prompt, embedding_model

    # Generation
    self.temperature, self.max_new_tokens = temperature, max_new_tokens
    self.repetition_penalty = repetition_penalty
    self.top_k, self.top_p = top_k, top_p

    # Retrieval and chunking
    self.k_context = k_context
    self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap

  # ---- setters: one per attribute ----

  def set_files(self, files):
    """Replace the stored file list."""
    self.files = files

  def set_question_col_name(self, question_col_name):
    """Replace the stored question column name."""
    self.question_col_name = question_col_name

  def set_answer_col_name(self, answer_col_name):
    """Replace the stored answer column name."""
    self.answer_col_name = answer_col_name

  def set_model_name(self, model_name):
    """Replace the stored model id."""
    self.model_name = model_name

  def set_prompt(self, prompt):
    """Replace the stored prompt."""
    self.prompt = prompt

  def set_embedding_model(self, embedding_model):
    """Replace the stored embedding model id."""
    self.embedding_model = embedding_model

  def set_temperature(self, temperature):
    """Replace the stored temperature."""
    self.temperature = temperature

  def set_max_new_tokens(self, max_new_tokens):
    """Replace the stored max_new_tokens value."""
    self.max_new_tokens = max_new_tokens

  def set_repetition_penalty(self, repetition_penalty):
    """Replace the stored repetition penalty."""
    self.repetition_penalty = repetition_penalty

  def set_top_k(self, top_k):
    """Replace the stored top_k value."""
    self.top_k = top_k

  def set_top_p(self, top_p):
    """Replace the stored top_p value."""
    self.top_p = top_p

  def set_k_context(self, k_context):
    """Replace the stored k_context value."""
    self.k_context = k_context

  def set_chunk_size(self, chunk_size):
    """Replace the stored chunk size."""
    self.chunk_size = chunk_size

  def set_chunk_overlap(self, chunk_overlap):
    """Replace the stored chunk overlap."""
    self.chunk_overlap = chunk_overlap

  def success_msg_params(self):
    """Return the fixed confirmation text for the parameters step."""
    return "Parameters set successfully"



# Model

In [6]:
class Model(RAG_Parameters):
  """Adds loading of the causal language model and its tokenizer."""

  def __init__(self):
    """Initialise the inherited parameters and an empty ``config`` slot."""
    super().__init__()
    self.config = None

  def set_config(self):
    """Build the bitsandbytes config from the module-level constants.

    The result is stored on ``self.bnb_config``; the compute dtype is
    ``torch.bfloat16``.
    """
    quantization_options = {
        "load_in_4bit": LOAD_IN_4BIT,
        "bnb_4bit_use_double_quant": BNB_4BIT_USE_DOUBLE_QUANT,
        "bnb_4bit_quant_type": BNB_4BIT_QUANT_TYPE,
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    self.bnb_config = BitsAndBytesConfig(**quantization_options)

  def load_model(self):
    """Rebuild the quantization config and load ``self.model_name`` with it."""
    self.set_config()
    self.model = AutoModelForCausalLM.from_pretrained(
        self.model_name,
        quantization_config=self.bnb_config,
        low_cpu_mem_usage=True,
    )

  def load_tokenizer(self):
    """Load the tokenizer for ``self.model_name`` into ``self.tokenizer``."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        self.model_name,
        padding_side=PADDING_SIDE,
        add_eos_token=ADD_EOS_TOKEN,
        add_bos_token=ADD_BOS_TOKEN,
    )
    # The EOS token doubles as the padding token.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    self.tokenizer = loaded_tokenizer

  def success_msg_load(self):
    """Return the fixed confirmation text for the model-loading step."""
    return "Successfully load Model and Tokenizer"

# Data

In [8]:
class Data(Model):
  """Stages the uploaded files in a working directory and splits them into chunks."""

  def __init__(self):
    """Initialise the inherited state and the directory that receives the files."""
    super().__init__()
    self.directory = "/content/data"

  def loading_dataset(self):
    """Move every path in ``self.files`` into ``self.directory``.

    Duplicated paths are dropped (first occurrence wins, order preserved).
    A file that is no longer at its source path but is already present in the
    target directory is skipped, so the step can be triggered more than once.

    Raises:
      ValueError: if no files have been set.
    """
    if not self.files:
      raise ValueError("No files to load. Please upload files and set the parameters first.")
    if isinstance(self.files, str):
      # A single path instead of a list of paths.
      self.files = [self.files]

    print(f"Files duplicates : {self.files}")
    # dict.fromkeys removes duplicates while keeping the upload order
    # (a plain set() would shuffle the paths).
    self.files = list(dict.fromkeys(self.files))
    print(f"Files without duplicates : {self.files}")
    # Kept for backward compatibility; despite its name this is the first path.
    self.file_extension = self.files[0]

    # shutil.move needs the target directory to exist.
    os.makedirs(self.directory, exist_ok=True)

    for file_path in self.files:
      file_name = os.path.basename(file_path)
      destination_path = os.path.join(self.directory, file_name)
      if not os.path.exists(file_path) and os.path.exists(destination_path):
        print(f"{file_name} is already in {self.directory}")
        continue
      shutil.move(file_path, destination_path)
      print(f"Moved {file_name} to {self.directory}")

  def get_file_types(self):
    """Summarise the contents of ``self.directory``.

    Returns:
      A tuple ``(file_types_string, number_of_files)``: the lower-cased
      extensions of all regular files joined with ``", "`` (sorted), and the
      number of entries in the directory. Both values are also stored on the
      instance.
    """
    entries = os.listdir(self.directory)
    file_types = set()
    for filename in entries:
      if os.path.isfile(os.path.join(self.directory, filename)):
        extension = os.path.splitext(filename)[1].lstrip(".").lower()
        if extension:
          file_types.add(extension)

    # Computed once, after the whole directory has been scanned; sorted so the
    # string does not depend on set iteration order.
    self.file_types_string = ", ".join(sorted(file_types))
    self.number_of_files = len(entries)

    return self.file_types_string, self.number_of_files

  def create_vector_store_index(self):
    """Load every supported file in ``self.directory`` and split it into chunks.

    Supported extensions are ``md``, ``pdf`` and ``txt``; a directory holding a
    mix of them is handled by loading each type in turn.

    Returns:
      The list of split documents, also stored on ``self.documents``.

    Raises:
      ValueError: if the directory contains no supported documents.
    """
    loader_by_extension = {
        "md": TextLoader,
        "pdf": PyPDFLoader,
        "txt": TextLoader,
    }

    file_types_string, _ = self.get_file_types()
    present_types = {t for t in file_types_string.split(", ") if t}

    pages = []
    for extension, loader_cls in loader_by_extension.items():
      if extension not in present_types:
        continue
      loader = DirectoryLoader(
          self.directory,
          glob=f"*.{extension}",
          loader_cls=loader_cls,
          show_progress=True,
      )
      pages.extend(loader.load())

    if not pages:
      raise ValueError(
          f"No supported documents (md, pdf, txt) found in {self.directory}."
      )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=int(self.chunk_size),      # UI sliders may deliver floats
        chunk_overlap=int(self.chunk_overlap),
    )
    self.documents = text_splitter.split_documents(pages)
    return self.documents

  def success_msg_data(self):
    """Return the fixed confirmation text for the document step."""
    return "Documents created successfully"


# Create Database

In [9]:
class Create_vector(Data):
  """Adds the embedding model and the FAISS vector store."""

  def __init__(self):
    """Initialise the inherited state only."""
    super().__init__()

  def load_embedding_model(self):
    """Instantiate the embedding model named by ``self.embedding_model``."""
    embedding_model_name = self.embedding_model
    self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

  def create_db(self):
    """Build the FAISS store from ``self.documents`` and return it."""
    self.vectordb = FAISS.from_documents(self.documents, self.embeddings)
    return self.vectordb

  def success_msg_data_db(self):
    """Return the fixed confirmation text for the database step."""
    return "Embedding Model and Vector Database created successfully"

# Pipeline and LLM

In [28]:
class Pipeline_and_llm(Create_vector):
  """Wires the loaded model, the retriever and the prompt into a QA chain."""

  def __init__(self):
    """Initialise the inherited state."""
    super().__init__()
    self.nothing = None  # unused placeholder, kept for backward compatibility

  def _build_prompt_template(self):
    """Return a PromptTemplate built from the configured ``self.prompt``.

    The configured text is used as the instruction block and the
    ``{context}`` / ``{question}`` slots are appended to it. A PromptTemplate
    already stored in ``self.prompt`` is returned unchanged.
    """
    if isinstance(self.prompt, PromptTemplate):
      return self.prompt
    # Escape braces so free text typed by the user is never parsed as a
    # template variable.
    instructions = str(self.prompt).replace("{", "{{").replace("}", "}}")
    template = instructions + "\n{context}\nQuestion: {question}\nHelpful Answer:"
    return PromptTemplate.from_template(template)

  def create_pipeline(self):
    """Create the text-generation pipeline and the retriever.

    Requires ``self.model``, ``self.tokenizer`` and ``self.vectordb`` to have
    been created by the earlier steps. ``self.prompt`` is left untouched so
    the text configured by the user is what ``llm_chain`` ends up using.
    """
    pipe = pipeline(
        model=self.model,
        task='text-generation',
        tokenizer=self.tokenizer,
        temperature=self.temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=int(self.max_new_tokens),  # max number of tokens to generate in the output
        repetition_penalty=self.repetition_penalty,  # without this output begins repeating
        top_k=int(self.top_k),  # must be an integer; UI sliders may deliver floats
        top_p=self.top_p,
    )
    self.pipeline = HuggingFacePipeline(pipeline=pipe)

    # Built here so that every parameter can be set in the UI beforehand.
    self.retriever = self.vectordb.as_retriever(search_kwargs={"k": int(self.k_context)})

  def llm_chain(self):
    """Create the LLM chain from the pipeline and the configured prompt."""
    self.chain = LLMChain(llm=self.pipeline, prompt=self._build_prompt_template())

  def retriever_chain(self):
    """Compose retriever and LLM chain: the query feeds both context and question."""
    self.retrieval_chain = (
      {"context": self.retriever, "question": RunnablePassthrough()}
      | self.chain
    )

  def success_msg_pipeline(self):
    """Return the fixed confirmation text for the pipeline step."""
    return "Successfully created Pipeline and LLM chain"

  def inference(self, query):
    """Answer ``query`` using the retrieval chain.

    Returns:
      A tuple ``(context_processed, meta_data, text)``: the retrieved chunks
      with newlines replaced by spaces, their metadata, and the chain's
      ``"text"`` output.

    Raises:
      ValueError: if ``query`` is None or blank.
    """
    self.query = query

    if self.query is None or not str(self.query).strip():
        raise ValueError("Query is not set. Please provide a query.")

    answer = self.retrieval_chain.invoke(self.query)
    context = answer.get("context") or []
    context_processed = [doc.page_content.replace("\n", " ") for doc in context]
    meta_data = [doc.metadata for doc in context]
    return context_processed, meta_data, answer.get("text")

# Instance

In [29]:
# Single shared instance: the UI callbacks below are bound methods of this
# object, so every step reads and writes the same state.
Chatbot = Pipeline_and_llm()

# Login

In [32]:
class login_setup:
  """Thin wrapper around ``huggingface_hub.login``."""

  def HF_login(self, hf_token):
    """Log in with ``hf_token`` and return a confirmation message.

    The token is kept on ``self.hf_token`` and is also added to the git
    credential helper.
    """
    self.hf_token = hf_token
    login(add_to_git_credential=True, token=hf_token)
    return "Successfully Login into HuggingFace"

In [33]:
# Log in to Hugging Face with the token defined in the "Global Variables" cell.
LOGIN = login_setup()
LOGIN.HF_login(HUGGINGFACE_API_KEY)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


'Successfully Login into HuggingFace'

# UI

In [37]:
def _apply_parameters(files, question_col_name, answer_col_name, model_name, prompt,
                      embedding_model, temperature, max_new_tokens, repetition_penalty,
                      top_k, top_p, k_context, chunk_size, chunk_overlap):
  """Copy every value of the "Parameters" tab onto the shared Chatbot.

  All setters are called from one callback so that no widget is silently left
  out. Count-like values are coerced to int because sliders may deliver
  floats.

  Returns:
    The confirmation message produced by ``Chatbot.success_msg_params``.

  Raises:
    gr.Error: if the chunk overlap is not smaller than the chunk size.
  """
  chunk_size, chunk_overlap = int(chunk_size), int(chunk_overlap)
  if chunk_overlap >= chunk_size:
    raise gr.Error("Chunk Overlap must be smaller than Chunk Size.")

  Chatbot.set_files(files)
  Chatbot.set_question_col_name(question_col_name)
  Chatbot.set_answer_col_name(answer_col_name)
  Chatbot.set_model_name(model_name)
  Chatbot.set_prompt(prompt)
  Chatbot.set_embedding_model(embedding_model)
  Chatbot.set_temperature(temperature)
  Chatbot.set_max_new_tokens(int(max_new_tokens))
  Chatbot.set_repetition_penalty(repetition_penalty)
  Chatbot.set_top_k(int(top_k))
  Chatbot.set_top_p(top_p)
  Chatbot.set_k_context(int(k_context))
  Chatbot.set_chunk_size(chunk_size)
  Chatbot.set_chunk_overlap(chunk_overlap)
  return Chatbot.success_msg_params()


with gr.Blocks(theme = "ParityError/anime") as demo:
  # ---- Tab 1: collect all parameters and push them onto the Chatbot ----
  with gr.Tab("Parameters"):

    with gr.Row():
      with gr.Column():
        gr.Markdown("Files must be PDF, TXT or Markdown")
        files_upload = gr.File(type='filepath', file_count="directory")
      with gr.Column():
        question_column_name = gr.components.Textbox(label = "First Column", info = "Please provide a valid column name from given dataset. i.e for this dataset Context.", value = "Questions")
        answer_column_name = gr.components.Textbox(label = "Second Column", info = "Please provide a valid column name from given dataset. i.e for this dataset Response.", value = "Answers")

    with gr.Row():
      # UI for Model Parameters
      model_name = gr.components.Textbox(label = "Model name", info = "Please provide a valid model id from HuggingFace. i.e mistralai/Mistral-7B-v0.1", value = "mistralai/Mistral-7B-v0.1")

    with gr.Row():
      with gr.Column():
        prompt = gr.components.Textbox(label = "Prompt", info = "Please provide your own prompt template if you want.", value = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.")

    with gr.Row():
      with gr.Column():
        embedding_model = gr.components.Textbox(label = "Embedding Model", info = "Please provide a embedding model from Huggingface.", value = "sentence-transformers/all-mpnet-base-v2")


    with gr.Row():
      with gr.Column():
        temperature = gr.Slider(minimum = 0, maximum = 1, label = "Temperature", info = " Close to 1 general, close to zero specific to data.", value = 0.7, step = 0.1)
        max_new_tokens = gr.Slider(minimum = 100, maximum = 1024, label = "Max New Tokens", info = "Number of max new token you wan.", value = 100, step = 1)
        repetition_penalty = gr.Slider(minimum = 0.1, maximum = 100, label = "Repetition Penalty", info = "", value = 1.15, step = 0.01)

    with gr.Row():
      with gr.Column():
        # top_k is a token count (integer) and top_p a probability mass (0-1];
        # both previously carried the repetition-penalty range by mistake.
        top_k = gr.Slider(minimum = 1, maximum = 100, label = "Top k", info = "", value = 1, step = 1)
        top_p = gr.Slider(minimum = 0.05, maximum = 1, label = "Top p", info = "", value = 0.75, step = 0.05)
        k_context = gr.Slider(minimum = 1, maximum = 10, label = "K context", info = "", value = 3, step = 1)

    with gr.Row():
      with gr.Column():
        chunk_size = gr.Slider(minimum = 100, maximum = 2048, label = "Chunk Size", info = "", value = 1024, step = 1)
        chunk_overlap = gr.Slider(minimum = 0, maximum = 2048, label = "Chunk Overlap", info = "", value = 100, step = 1)

    with gr.Row():
      with gr.Column():
        success_message = gr.components.Textbox(label = "Message")

    parameters_btn = gr.Button("Set Parameters")
    # One callback sets everything, including model name, top p and K context,
    # which were never forwarded to the Chatbot before.
    parameters_btn.click(
        _apply_parameters,
        inputs=[files_upload, question_column_name, answer_column_name, model_name, prompt,
                embedding_model, temperature, max_new_tokens, repetition_penalty,
                top_k, top_p, k_context, chunk_size, chunk_overlap],
        outputs=[success_message],
    )

  # ---- Tab 2: run the build steps in order, one button per step ----
  with gr.Tab("Data, Model and DataBase"):

    with gr.Row():
        with gr.Column():
            model_and_tokenizer_load = gr.components.Textbox(label = "Message")

    model_and_tokenizer_load_btn = gr.Button("Load Model and Tokenizer")
    model_and_tokenizer_load_btn.click(Chatbot.set_config).then(Chatbot.load_model).then(Chatbot.load_tokenizer
                                      ).then(Chatbot.success_msg_load, outputs=model_and_tokenizer_load, show_progress=True)

    with gr.Row():
        with gr.Column():
            data_success_msg_data = gr.components.Textbox(label = "Message")
    data_setup_btn = gr.Button("Documents")
    data_setup_btn.click(Chatbot.loading_dataset).then(Chatbot.create_vector_store_index).then(Chatbot.success_msg_data, outputs=data_success_msg_data, show_progress=True)

    with gr.Row():
        with gr.Column():
            data_success_msg_db = gr.components.Textbox(label = "Message")
            vector_display = gr.components.Textbox(label = "Message", lines = 8)
    database_setup_btn = gr.Button("Database")
    database_setup_btn.click(Chatbot.load_embedding_model).then(Chatbot.create_db, outputs = vector_display, show_progress=True).then(Chatbot.success_msg_data_db, outputs=data_success_msg_db, show_progress=True)

    with gr.Row():
        with gr.Column():
            data_success_msg_pipeline = gr.components.Textbox(label = "Message")
    pipeline_setup_btn = gr.Button("Pipeline")
    pipeline_setup_btn.click(Chatbot.create_pipeline).then(Chatbot.llm_chain).then(Chatbot.retriever_chain).then(Chatbot.success_msg_pipeline, outputs=data_success_msg_pipeline, show_progress=True)

  # ---- Tab 3: ask questions against the indexed documents ----
  with gr.Tab("Inference"):
      with gr.Row():
          with gr.Column():
              inference_input = gr.components.Textbox(label = "Submit", info = "Write query related to your docs.")
              inference_output_question = gr.components.Textbox(label = "Relevant Text", lines=5)
              inference_output_context = gr.components.Textbox(label = "Metadata", lines=3)
              inference_output_text = gr.components.Textbox(label = "Answer")

      btn = gr.Button("Generate")
      btn.click(Chatbot.inference, inputs=[inference_input], outputs=[inference_output_question, inference_output_context, inference_output_text], show_progress=True)

if __name__ == "__main__":
  # share=True exposes a public URL; debug=True keeps the cell running so that
  # errors and logs stay visible.
  demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://463da40eb0df97e1cc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Files duplicates : ['/tmp/gradio/78235aaf28039bbe83f2f10d6660ecb991c88eb9/1506.02640.pdf', '/tmp/gradio/4ba72887b96b1caf8086f203185c5bf940fd63f1/1706.03762.pdf']
Files without duplicates : ['/tmp/gradio/78235aaf28039bbe83f2f10d6660ecb991c88eb9/1506.02640.pdf', '/tmp/gradio/4ba72887b96b1caf8086f203185c5bf940fd63f1/1706.03762.pdf']
Moved 1506.02640.pdf to /content/data
Moved 1706.03762.pdf to /content/data


100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://463da40eb0df97e1cc.gradio.live
