# RAG Pipeline

## Set up
### Import Packages and API keys 

In [None]:
# !pip install transformers datasets torch pinecone-client langchain-community faiss-cpu sentence-transformers
from getpass import getpass
from dotenv import load_dotenv
import os
from pathlib import Path

env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

huggingface_api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if not huggingface_api_token:
    huggingface_api_token = getpass("Enter your Hugging Face Hub API token: ")

### Model selection

In [None]:
from langchain_community.llms import HuggingFaceHub
from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
# model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b-instruct")

# I will be using T5 model from open source huggingface library
model_name = "tiiuae/falcon-7b-instruct"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "google/flan-t5-xxl"

llm = HuggingFaceHub(repo_id=model_name, model_kwargs={"temperature":0.5, "max_length":1024, "max_new_tokens":200})

## Template-based Prompting

In [None]:
# I will be using Langchain

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import pipeline
# pipeline = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=200,
# )

# llm = HuggingFacePipeline(pipeline=pipeline)

template= """
Try to be helpful as you can in a Computer Science context.
Question: {question}
Response:
"""

prompt = PromptTemplate(template=template, input_variables=["question"])
# # llm_chain = LLMChain(prompt=prompt, llm=llm)
# # llm_chain = load_qa_chain(llm, chain_type="stuff")
llm_chain = LLMChain(prompt=prompt, llm=llm)



### Chat Interface

In [None]:
import gradio as gr
# def chat_interface(textbox, chat):
#     input_dict = {'question': textbox}
#     response = llm_chain.run(input_dict)

#     print("user:", textbox)
#     print("bot:", response)
#     return response

def chat_interface(textbox, chat):
    input_dict = {'question': textbox}
    response_dict = llm_chain.invoke(input_dict)
    text = response_dict['text']  # Extract the text from the dictionary
    # Split the text based on "Response:" and extract the part after it
    response_text = text.split("Response:")[1].strip()
    return response_text

gr.ChatInterface(
    fn=chat_interface,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Chatbot",
    description="Ask Chatbot any question",
    theme="soft",
    examples=["What does AI stand for?", "What is Software Engineering?", "What is Cybersecurity?"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()

#I am a final year Computer Science student seeking to find a graduate role in __. What are practical skills required for a career in __?
#I am a beginner that wants to get into __, where should I start?


### Evaluation
Evaluating the model with prompting

In [None]:
# Load standardized test set
    # IT Consultant, Cloud Engineer...

# ROGUE? BLUE?

## RAG from synthetic data set

In [1]:
# Use langchain packages to help with implementing retrieval augmentation generation
from datasets import load_dataset
from langchain.document_loaders.csv_loader import CSVLoader
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from operator import itemgetter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

loader = CSVLoader(file_path="rag_sample.csv")
documents = loader.load()  # Load data for retrieval

# Step 2: Split Documents
text_split = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=150)
# text_split = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
d = text_split.split_documents(documents)

# Step 3: Create a FAISS Index
model_name = "sentence-transformers/gtr-t5-base"
modelPath = model_name

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

llm = SentenceTransformer('sentence-transformers/gtr-t5-base')

text = "This is a test document."
query_result = embeddings.embed_query(text)

db = FAISS.from_documents(d, embedding=embeddings)

# Step 5: Perform RAG
template= """
Try to be helpful as you can in a Computer Science context.
Question: {question}
Response:
"""

prompt = ChatPromptTemplate.from_template(template)

# Specify the model name you want to use
model_name = "tiiuae/falcon-7b-instruct"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "google/flan-t5-xxl"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a text generation pipeline using the model and tokenizer
hf = pipeline(
    "text-generation", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=hf,
    model_kwargs={"temperature": 0.7, "max_length": 512, "max_new_tokens": 20},
)

# hf = HuggingFacePipeline.from_model_id(
#     model_id=model_name,
#     task="text-generation",
#     pipeline_kwargs={"max_new_tokens": 10},
# )

# llm_chain = (
#     {
#         "context": itemgetter("question") | db.as_retriever(),
#         "question": itemgetter("question"),
#     }
#     | prompt
#     | hf
#     | StrOutputParser()
# )
# chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         chain_type=chain_type,
#         retriever=docsearch.as_retriever(),
#         return_source_documents=True,
#         chain_type_kwargs={"prompt":prompt}
#     )

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [02:06<00:00, 63.35s/it]


In [2]:
from langchain.chains import RetrievalQA

retriever = db.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)


In [3]:
import gradio as gr

def chat_interface(textbox, chat):
    # input_dict = {'question': textbox}
    # # response = llm_chain.invoke(input_dict)
    # # response_text = response['text'].split("Response:")[1].strip()
    # result = qa.run({"query": question})
    # print(result["result"])
    # return result["result"]
    # Access the user's question from the textbox parameter
    question = textbox
    
    # Run the QA pipeline with the user's question
    result = qa.run({"query": question})
    
    # Extract and return the response from the QA result
    response_text = result["result"]
    return response_text
# def chat_interface(textbox, chat):
#     docs = db.similarity_search(textbox)
#     vectorstore = FAISS.from_texts(
#     [textbox], embedding=embeddings
#     )
#     retriever = vectorstore.as_retriever()
#     # input_dict = {'question': textbox, 'input_documents': docs }
#     input_dict = {'question': textbox}
#     response = llm_chain.invoke(input_dict)
#     response_text = text.split("Response:")[1].strip()
#     return response_text

gr.ChatInterface(
    fn=chat_interface,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Chatbot",
    description="Ask Chatbot any question",
    theme="soft",
    examples=["What does AI stand for?", "What is Software Engineering?", "What is Cybersecurity?"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Traceback (most recent call last):
  File "/Users/huishingchong/agile_llm/venv/lib/python3.11/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/huishingchong/agile_llm/venv/lib/python3.11/site-packages/gradio/route_utils.py", line 232, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/huishingchong/agile_llm/venv/lib/python3.11/site-packages/gradio/blocks.py", line 1561, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/huishingchong/agile_llm/venv/lib/python3.11/site-packages/gradio/blocks.py", line 1177, in call_function
    prediction = await fn(*processed_input)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/huishingchon

### Evaluation

## Fine Tuning

In [None]:
# MAYBE DO THIS FIRST? AND SEE THE DOWNSIDE, AND LEARN THAT IT IS NOT REQUIRED (doesn't solve hallucinations and timely context!)
# Fine-tune with input and output example data sets

# Compare with different models (one fine-tuned one just pre-trained)

### Evaluation

In [None]:
# Load test set
# Find that fine-tuning is not needed?

## Full adapted model (combined of all approaches)

In [None]:
# Knowledge retrieved
# Augmented Prompt
# Fine-tuned/pre-trained LLM

import gradio as gr
def chat_interface(textbox, chat):
    # docs = db.similarity_search(textbox)
    # input_dict = {'question': textbox, 'input_documents': docs }
    input_dict = {'question': textbox}
    response = llm_chain.run(input_dict)

    print("user:", textbox)
    print("bot:", response)
    return response

gr.ChatInterface(
    fn=chat_interface,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
    title="Chatbot",
    description="Ask Chatbot any question",
    theme="soft",
    examples=["What does AI stand for?", "What is Software Engineering?", "What is Cybersecurity?"],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()