<a href="https://colab.research.google.com/github/lehuong240823/rag-company-knowledge-consultant-chatbot/blob/main/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get Started

In [None]:
# @title Install Libraries
# First line: the Pinecone client and llama-cpp-python (local GGUF inference).
# Second line: the LangChain integration packages and LangSmith.
!pip -q install pinecone llama-cpp-python
!pip -q install langchain_community langchain_openai langchain_pinecone langchain-huggingface langsmith

In [None]:
# @title Import Libraries
import os, yaml, inspect, json, datetime, pytz, hashlib
import pandas as pd
from os.path import join
from google.colab import userdata
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatLlamaCpp
from langchain.document_loaders import TextLoader, CSVLoader, GithubFileLoader
from langchain.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
from langchain.llms import LlamaCpp
from langchain.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_openai import ChatOpenAI
from langchain_pinecone.vectorstores import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langsmith import Client, traceable

In [None]:
# @title Configuration
# Notebook-wide settings parsed from the inline YAML below.
#   repo / branch     : default GitHub source used by Loader().
#   chat_model        : GGUF file loaded by init_global_var() as the answering LLM.
#   embedding_model   : GGUF file loaded by init_global_var() for embeddings.
#   grader_model      : GGUF file for the evaluator LLM.
#   *.source          : download URLs consumed by the "Download Quantized Model" cell.
# NOTE(review): `ref_data` and `format` are not read anywhere in the visible code
# (Loader() hard-codes equivalent defaults) — confirm whether they are still needed.
config = yaml.safe_load('''
repo: lehuong240823/rag-company-knowledge-consultant-chatbot
branch: main
ref_data: reference_data
format:
  - .txt
  - .csv
chat_model:
  name: Qwen3-0.6B-Q8_0-GGUF
  path: /content/Qwen3-0.6B-Q8_0.gguf
  source: https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf
embedding_model:
  name: Qwen3-Embedding-0.6B-Q8_0-GGUF
  path: /content/Qwen3-Embedding-0.6B-Q8_0.gguf
  source: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B-GGUF/resolve/main/Qwen3-Embedding-0.6B-Q8_0.gguf
grader_model:
  name: 'Llama-3.2-1B-Instruct-Q8_0-GGUF'
  path: '/content/llama-3.2-1b-instruct-q8_0.gguf'
  source: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/resolve/main/llama-3.2-1b-instruct-q8_0.gguf'

''')

In [None]:
# @title Download Quantized Model
# Fetch the three GGUF files named in `config` into the current working directory.
# `-N` only re-downloads when the remote file is newer; `-q` silences progress output.
# NOTE(review): the `path` entries in `config` point at /content/..., so this
# presumably relies on the working directory being /content — confirm.
!wget -N -q {config['chat_model']['source']}
!wget -N -q {config['embedding_model']['source']}
!wget -N -q {config['grader_model']['source']}

In [None]:
# @title Set Environment Variables
# Copy secrets from Colab `userdata` into the process environment so the
# client libraries used below can read them from there.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['LANGSMITH_API_KEY'] = userdata.get('LANGSMITH_API_KEY')
os.environ['LANGSMITH_ENDPOINT'] = 'https://api.smith.langchain.com'
# NOTE(review): the LangSmith project name is taken from the PINECONE_INDEX_NAME
# secret — looks like a deliberate reuse of one name for both, but confirm.
os.environ['LANGSMITH_PROJECT'] = userdata.get('PINECONE_INDEX_NAME')
os.environ['LANGSMITH_TRACING'] = 'true'
# The OpenAI-compatible settings point at the Hugging Face router and reuse the
# HF token; the trailing comments keep the OpenRouter alternative.
os.environ['OPENAI_API_BASE'] = 'https://router.huggingface.co/v1' #'https://openrouter.ai/api/v1'
os.environ['OPENAI_API_KEY'] = userdata.get('HF_TOKEN') #userdata.get('OPENROUTER_API_KEY')
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')
os.environ['PINECONE_INDEX_NAME'] = userdata.get('PINECONE_INDEX_NAME')
os.environ['GITHUB_PERSONAL_ACCESS_TOKEN'] = userdata.get('GITHUB_PERSONAL_ACCESS_TOKEN')
# Local project root and the sub-directories used in 'local' mode.
root_path =  userdata.get('ROOT_PATH')
embedding_path = join(root_path, 'embedding_data')
reference_path = join(root_path, 'reference_data')
evaluators_path = join(root_path, 'evaluators')

# Method

### Utility Functions

In [None]:
# @title Timestamp and Hashing Utilities
def get_timestamp():
  '''Return the current time in the Asia/Ho_Chi_Minh timezone as an ISO 8601 string.'''
  return datetime.datetime.now(pytz.timezone('Asia/Ho_Chi_Minh')).isoformat()

def md5(text):
  '''Return the hexadecimal MD5 digest of `text` (encoded as UTF-8).

  Used as a content fingerprint for change tracking.
  '''
  hasher = hashlib.md5()
  hasher.update(text.encode('utf-8'))
  return hasher.hexdigest()

In [None]:
# @title Pinecone Utilities
def get_index(index_name=os.environ['PINECONE_INDEX_NAME'], dimension=1024, metric='cosine', cloud='aws', region='us-east-1'):
  '''Return a handle to the Pinecone index `index_name`, creating it if missing.

  Args:
    index_name: Name of the index. The default is read from the
      PINECONE_INDEX_NAME environment variable once, when this function is
      defined, so the environment cell must run before this one.
    dimension: Vector dimension used only when the index has to be created.
      It must match the output size of the embedding model in use.
    metric: Similarity metric used only when the index has to be created.
    cloud: Cloud provider for the serverless spec of a newly created index.
    region: Region for the serverless spec of a newly created index.

  Returns:
    The result of `Pinecone().Index(index_name)`.
  '''
  # Pinecone() is built without arguments; the notebook exports
  # PINECONE_API_KEY beforehand for the client to pick up.
  pc = Pinecone()

  # An existing index is reused as-is: dimension/metric/cloud/region are NOT
  # applied to (or checked against) an index that already exists.
  if not pc.has_index(index_name):
    pc.create_index(
      name=index_name,
      dimension=dimension,
      metric=metric,
      spec=ServerlessSpec(cloud=cloud, region=region),
    )

  return pc.Index(index_name)

In [None]:
# @title Document Loading and Processing Utilities
def create_text_splitter(chunk_size=100, chunk_overlap=0):
  '''Build a RecursiveCharacterTextSplitter with the given chunk size and overlap.'''
  splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return splitter

def Loader(file=None, mode='github', repo=config['repo'], branch=config['branch'], dest_dir='reference_data', format=('.txt', '.csv')):
  '''Return a LangChain document loader for a GitHub directory or a local file.

  Two combinations are supported:
    * mode='github' with file=None: a GithubFileLoader over `repo`/`branch`
      that keeps only paths starting with `dest_dir` and ending with one of
      the suffixes in `format`.
    * mode='local' with a `file` path: a CSVLoader for '.csv' files or a
      TextLoader for '.txt' files.

  Args:
    file: Local file path (local mode only).
    mode: 'github' or 'local'.
    repo: GitHub repository as 'owner/name' (github mode).
    branch: Branch to read from (github mode).
    dest_dir: Path prefix a repository file must have to be loaded (github mode).
    format: Suffix, or tuple/list of suffixes, a repository file must end
      with (github mode).

  Returns:
    An unloaded loader; call `.load()` on it to get documents.

  Raises:
    ValueError: If the file/mode combination or the local file extension is
      not supported. (Previously this surfaced as an UnboundLocalError.)
  '''
  if mode == 'github' and file is None:
    # str.endswith accepts a str or a tuple, but not a list.
    suffixes = tuple(format) if isinstance(format, list) else format
    return GithubFileLoader(
      repo=repo,
      branch=branch,
      file_filter=lambda file_path: (file_path.endswith(suffixes) and file_path.startswith(dest_dir))
    )

  if mode == 'local' and file is not None:
    if file.endswith('.csv'):
      return CSVLoader(file)
    if file.endswith('.txt'):
      return TextLoader(file)
    raise ValueError(f'Unsupported local file type: {file!r} (expected .csv or .txt).')

  raise ValueError(
    f'Unsupported loader arguments: mode={mode!r}, file={file!r}. '
    "Use mode='github' with file=None, or mode='local' with a file path."
  )

def add_texts(text):
  '''Split `text` with the global `text_splitter` and add the chunks to the global `vectorstore`.'''
  chunks = text_splitter.split_text(text)
  vectorstore.add_texts(chunks)
  chunk_count = len(chunks)
  print(f'Added {chunk_count} text chunks to vector store.')

def add_documents(path=None, ids=None, mode='github'):
  '''Load documents, split them into chunks and upsert the chunks into the global `vectorstore`.

  Each chunk gets `timestamp` and `md5` metadata and a deterministic ID, so
  re-running the function writes to the same IDs instead of generating new ones.

  Args:
    path: Local file to load (mode='local'); None loads from GitHub.
    ids: Optional ID prefix. When given, chunk IDs are '<ids>_<chunk index>'.
      When None, IDs are '<file stem>_<part>', where `part` restarts at 0
      for every new source file.
    mode: 'github' or 'local'; passed through to Loader().
  '''
  loader = Loader(file=path, mode=mode)
  documents = loader.load()
  src = '\n'.join([doc.metadata['path'] for doc in documents]) if path is None else path

  print(f'Loaded {len(documents)} document(s) from: {src}')
  docs = text_splitter.split_documents(documents)

  part = 0
  for idx, each in enumerate(docs):
    # Restart the per-file counter whenever the source file changes.
    same_source = idx > 0 and each.metadata['source'] == docs[idx-1].metadata['source']
    part = part + 1 if same_source else 0

    each.metadata['timestamp'] = get_timestamp()
    each.metadata['md5'] = md5(each.page_content)

    if ids is None:
      # Only GitHub-loaded documents carry 'path'; locally loaded ones expose
      # the file location under 'source', so fall back to that.
      origin = each.metadata.get('path', each.metadata['source'])
      chunk_id = origin.split('/')[-1].split('.')[0] + f'_{part}'
    else:
      chunk_id = f'{ids}_{str(idx)}'
    vectorstore.add_documents([each], ids=[chunk_id])

  print(f'Added {len(docs)} text chunk(s) to vector store.')

In [None]:
# @title LangSmith Dataset Utilities
def _create_dataset_from_examples(dataset_name, examples, origin, overwrite=False):
  '''Create LangSmith dataset `dataset_name` from `examples` unless it already exists.'''
  exists = client.has_dataset(dataset_name=dataset_name)
  # Only delete what is actually there; deleting a missing dataset would fail.
  if exists and overwrite:
    client.delete_dataset(dataset_name=dataset_name)
    exists = False

  if exists:
    print(f'Dataset {dataset_name} already exists.')
    return

  client.create_dataset(dataset_name=dataset_name)
  client.create_examples(dataset_name=dataset_name, examples=examples)
  print(f'Create {dataset_name} successfully from {origin}!')


def create_dataset(file_name=None, dataset_name=None, provider='langsmith', mode='local', overwrite=False):
  '''Create LangSmith dataset(s) from JSON example files.

  Args:
    file_name: JSON file name inside `<embedding_path>/<provider>` (local mode).
      Nothing happens in local mode when this is None.
    dataset_name: Dataset name for local mode; defaults to `file_name` without
      its extension. Ignored in github mode, where each dataset is named after
      its JSON file.
    provider: Sub-directory of the embedding data folder to read from.
    mode: 'local' reads one file from disk; 'github' loads every JSON file
      under `embedding_data/<provider>` in the repository.
    overwrite: When True, an existing dataset with the same name is deleted
      and recreated; when False, existing datasets are left untouched.
  '''
  if mode=='local' and file_name is not None:
    if dataset_name is None:
      dataset_name = os.path.splitext(file_name)[0]
    path = join(embedding_path, provider, file_name)
    # Parse first so a malformed file cannot leave an empty dataset behind.
    with open(path, encoding='utf-8') as f:
      examples = json.load(f)
    _create_dataset_from_examples(dataset_name, examples, path, overwrite=overwrite)

  if mode=='github':
    json_loader = Loader(mode='github', dest_dir=f'embedding_data/{provider}', format=('json'))
    for each in json_loader.load():
      name = each.metadata['path'].split('/')[-1].split('.')[0]
      examples = json.loads(each.page_content)
      _create_dataset_from_examples(name, examples, each.metadata['source'], overwrite=overwrite)


In [None]:
# @title Evaluator Utilities
def get_evaluator_path(evaluator_name, mode='github'):
  '''Return the directory that holds the files of evaluator `evaluator_name`.

  Args:
    evaluator_name: Name of the evaluator sub-directory.
    mode: 'local' resolves under the global `evaluators_path`;
      'github' resolves to the repository-relative 'evaluators/<name>'.

  Raises:
    ValueError: If `mode` is neither 'local' nor 'github'.
  '''
  if mode=='local':
    return join(evaluators_path, evaluator_name)
  if mode=='github':
    return join('evaluators', evaluator_name)
  raise ValueError(f"Unsupported mode: {mode!r} (expected 'local' or 'github').")

def load_chat_prompt(evaluator, mode='github'):
  '''Build a ChatPromptTemplate from the evaluator's `prompt.yaml`.

  The YAML is read from disk (mode='local') or fetched through Loader()
  (mode='github'). Its `messages` list is turned into one message template
  per entry, chosen by the entry's `role` ('system', 'user' or 'ai') and
  filled from the entry's `template`.
  '''
  prompt_file = join(get_evaluator_path(evaluator, mode), 'prompt.yaml')

  if mode=='local':
    with open(prompt_file, 'r') as handle:
      spec = yaml.safe_load(handle)
  if mode=='github':
    fetched = Loader(mode=mode, dest_dir=prompt_file, format=('.yaml')).load()
    # The first matching repository file is used.
    spec = yaml.safe_load(fetched[0].page_content)

  template_by_role = {
    'system': SystemMessagePromptTemplate,
    'user': HumanMessagePromptTemplate,
    'ai': AIMessagePromptTemplate,
  }
  return ChatPromptTemplate.from_messages([
    template_by_role[entry['role']].from_template(entry['template'])
    for entry in spec['messages']
  ])

def load_output_schema(evaluator, mode='github'):
  '''Return the parsed content of the evaluator's `output_schema.yaml`.

  The file is read from disk (mode='local') or fetched through Loader()
  (mode='github'), in which case the first matching repository file is used.
  '''
  schema_file = join(get_evaluator_path(evaluator, mode), 'output_schema.yaml')

  if mode=='local':
    with open(schema_file, 'r') as handle:
      schema = yaml.safe_load(handle)
  if mode=='github':
    fetched = Loader(mode=mode, dest_dir=schema_file, format=('.yaml')).load()
    schema = yaml.safe_load(fetched[0].page_content)

  return schema

def create_structured_grader_llm(temperature=0, evaluator=None):
  '''Create the local grader LLM, constrained to the evaluator's output schema.

  Args:
    temperature: Sampling temperature for the grader model.
    evaluator: Evaluator name; its `output_schema.yaml` defines the
      structured output. Required despite the None default.

  Returns:
    A ChatLlamaCpp model wrapped with `with_structured_output(...)`.

  Raises:
    ValueError: If `evaluator` is None.
  '''
  if evaluator is None:
    raise ValueError('evaluator is required to look up the output schema.')

  grader = ChatLlamaCpp(
    temperature=temperature,
    # Same file as before, now taken from the shared configuration.
    model_path=config['grader_model']['path'],
    n_ctx=2048,
    n_batch=256,
    max_tokens=200,
    top_p=0.8,
    top_k=20,
    repeat_penalty=1.5,
    verbose=False,
    streaming=False,
  )
  # Hosted alternative kept for reference:
  #   ChatOpenAI(model=model, temperature=temperature).with_structured_output(
  #     load_output_schema(evaluator), method='json_schema', strict=True)
  return grader.with_structured_output(
    load_output_schema(evaluator),
  )

## Evaluators

### Evaluation Criteria

In [None]:
# @title Correctness
def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
  '''Grade the answer against the reference answer and return the grader's `correct` field.

  The evaluator name is this function's own name; it selects the prompt and
  output schema loaded for the grader.
  '''
  name = inspect.currentframe().f_code.co_name
  grader = create_structured_grader_llm(evaluator=name)
  prompt_text = load_chat_prompt(name).format(
    question=inputs['question'],
    reference=reference_outputs['answer'],
    answer=outputs['answer'],
  )
  verdict = grader.invoke(prompt_text)
  return verdict['correct']

In [None]:
# @title Relevance
def relevance(inputs: dict, outputs: dict) -> bool:
  '''Grade whether the answer addresses the question and return the grader's `relevant` field.

  The evaluator name is this function's own name; it selects the prompt and
  output schema loaded for the grader.
  '''
  name = inspect.currentframe().f_code.co_name
  grader = create_structured_grader_llm(evaluator=name)
  prompt_text = load_chat_prompt(name).format(
    question=inputs['question'],
    answer=outputs['answer'],
  )
  verdict = grader.invoke(prompt_text)
  return verdict['relevant']

In [None]:
# @title Groundedness
def groundedness(inputs: dict, outputs: dict) -> bool:
  '''Grade the answer against the retrieved documents and return the grader's `grounded` field.

  The page contents of `outputs['documents']` are joined with newlines and
  passed to the prompt as `facts`. `inputs` is not used.
  '''
  name = inspect.currentframe().f_code.co_name
  grader = create_structured_grader_llm(evaluator=name)
  prompt_text = load_chat_prompt(name).format(
    facts='\n'.join(doc.page_content for doc in outputs['documents']),
    answer=outputs['answer'],
  )
  verdict = grader.invoke(prompt_text)
  return verdict['grounded']

In [None]:
# @title Retrieval relevance
def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
  '''Grade the retrieved documents against the question and return the grader's `relevant` field.

  The page contents of `outputs['documents']` are joined with newlines and
  passed to the prompt as `facts`.
  '''
  name = inspect.currentframe().f_code.co_name
  grader = create_structured_grader_llm(evaluator=name)
  prompt_text = load_chat_prompt(name).format(
    facts='\n'.join(doc.page_content for doc in outputs['documents']),
    question=inputs['question'],
  )
  verdict = grader.invoke(prompt_text)
  return verdict['relevant']

### Target and Experiment

In [None]:
# @title Target
@traceable()
def rag_bot(question: str) -> dict:
  '''Answer `question` with the global `qa_chain`, using an empty chat history.

  Returns a dict with the chain's `answer` and, under `documents`, its
  `source_documents`.
  '''
  chain_input = {'question': question, 'chat_history': []}
  result = qa_chain.invoke(chain_input)
  return {
    'answer': result['answer'],
    'documents': result['source_documents'],
  }

def target(inputs: dict) -> dict:
  '''Evaluation target: run rag_bot on the example's `question` field.'''
  question = inputs['question']
  return rag_bot(question)

In [None]:
# @title Experiment
def experiment(chunk_size, overlap, k, experiment_prefix='rag-doc-relevance', data='fqa_mini'):
  '''Run a LangSmith evaluation of `target` with all four evaluators.

  Args:
    chunk_size: Chunk size the vector store namespace was built with.
    overlap: Chunk overlap the namespace was built with.
    k: Number of documents the retriever is configured to return.
    experiment_prefix: Prefix for the experiment name in LangSmith.
    data: Name of the LangSmith dataset to evaluate on.

  Returns:
    The experiment results object returned by `client.evaluate`.

  Note:
    chunk_size / overlap / k do not configure anything here (that is done by
    init_global_var); they are recorded as experiment metadata so each run
    documents the settings it was produced with.
  '''
  return client.evaluate(
    target,
    data=data,
    evaluators=[correctness, relevance, groundedness, retrieval_relevance],
    experiment_prefix=experiment_prefix,
    metadata={'chunk_size': chunk_size, 'chunk_overlap': overlap, 'k': k},
  )

### Tuning

In [None]:
# @title Dump Experiments to CSV
def save_experiment(df, path=join(root_path, 'experiments', 'tuning_experiments')):
  '''Append the rows of `df` to the CSV file at `path`, creating it if needed.

  The header row is written only when the file does not exist yet or is
  empty, so repeated calls accumulate rows under a single header.

  Args:
    df: pandas DataFrame to persist.
    path: Destination CSV file. The default is computed once, when this
      function is defined, from the global `root_path`.
  '''
  parent_dir = os.path.dirname(path)
  if parent_dir:
    # to_csv does not create missing directories.
    os.makedirs(parent_dir, exist_ok=True)

  needs_header = not os.path.isfile(path) or os.path.getsize(path) == 0
  df.to_csv(
    path,
    index=False,
    mode='w' if needs_header else 'a',
    header=needs_header,
    encoding='utf-8',
  )
  # Previously unreachable because both branches returned before this line.
  print(f'Save {path} successfully!')

In [None]:
# @title Chunk Size
def chunk_size_tunning(start=200, end=500, step=100, chunk_overlap=0, k=2, sync_embedded_data=False, save_results=False):
  '''Run one evaluation experiment per chunk size in `start..end` (step `step`).

  For every chunk size the global components are re-initialised on the
  namespace 'chunk<size>_overlap<overlap>' and an experiment named
  'tune_<namespace>_k<k>' is run.

  Args:
    start: First chunk size.
    end: Upper bound of the sweep; sizes are `range(start, end+step, step)`.
    step: Increment between chunk sizes.
    chunk_overlap: Overlap passed to the text splitter.
    k: Number of documents to retrieve.
    sync_embedded_data: When True, (re)embed the GitHub reference data into
      the namespace before each experiment.
    save_results: When True, append each experiment's results (plus the
      experiment/chunk/overlap/k columns) to the CSV managed by
      save_experiment(). When False, results are only kept by LangSmith.
  '''
  chunk_sizes = range(start, end+step, step)
  total_runs = len(chunk_sizes)

  for run_number, chunk_size in enumerate(chunk_sizes, 1):
    namespace = f'chunk{chunk_size}_overlap{chunk_overlap}'
    prefix = f'tune_{namespace}_k{k}'
    print(f'Begin chunk tunning {prefix}')

    init_global_var(namespace=namespace, chunk_size=chunk_size, chunk_overlap=chunk_overlap, k=k)

    if sync_embedded_data:
      add_documents(mode='github')

    experiment_results = experiment(chunk_size=chunk_size, overlap=chunk_overlap, k=k, experiment_prefix=prefix)

    if save_results:
      pd_results = experiment_results.to_pandas()
      pd_results['experiment'] = prefix
      pd_results['chunk'] = chunk_size
      pd_results['overlap'] = chunk_overlap
      pd_results['k'] = k
      save_experiment(pd_results)

    # Count finished runs, so the last run reports 100% (it used to stop short).
    print(f'Complete chunk tunning {prefix}. TOTAL: {run_number/total_runs*100}%')

In [None]:
# @title Initialize Components
def init_global_var(namespace=None, chunk_size=100, chunk_overlap=0, k=2, model='Qwen/Qwen3-8B:featherless-ai'):
  '''(Re)build the notebook's global components for one configuration.

  Sets the globals `index`, `client`, `llm`, `embeddings`, `vectorstore`,
  `text_splitter` and `qa_chain`. Every call constructs the chat and
  embedding models again from their GGUF files.

  Args:
    namespace: Pinecone namespace to read from and write to.
    chunk_size: Chunk size for the text splitter.
    chunk_overlap: Chunk overlap for the text splitter.
    k: Number of documents the retriever returns per query.
    model: Model id for the hosted ChatOpenAI alternative; unused while the
      local LlamaCpp model is active.
  '''
  global index, client, llm, embeddings, vectorstore, text_splitter, qa_chain

  index = get_index()

  client = Client()

  # Hosted alternative: llm = ChatOpenAI(model=model)
  llm = LlamaCpp(
    model_path=config['chat_model']['path'],
    temperature=0,
    n_ctx=1024,
    max_tokens=100,
    n_batch=256,
    top_p=0.8,
    top_k=20,
    repeat_penalty=1.5,
    stop=["\n"],
    streaming=False,
    verbose=False,
  )

  # Hosted alternative: embeddings = HuggingFaceEndpointEmbeddings(model='Qwen/Qwen3-Embedding-8B')
  embeddings = LlamaCppEmbeddings(
    model_path=config['embedding_model']['path'],
    verbose=False
  )

  vectorstore = PineconeVectorStore(index=index, embedding=embeddings, namespace=namespace)

  text_splitter = create_text_splitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

  # `k` must travel inside search_kwargs: passed as a direct keyword to
  # as_retriever() it never reaches the similarity search, and the
  # retriever keeps its default number of results.
  retriever = vectorstore.as_retriever(search_kwargs={'k': k, 'namespace': namespace})

  qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    combine_docs_chain_kwargs={'prompt': load_chat_prompt('qa_chain')},
    return_source_documents=True
  )


In [None]:
# Smoke test: initialise the components on one namespace and run a single experiment.
# NOTE(review): the namespace is named for chunk size 200, while init_global_var
# falls back to chunk_size=100 and the experiment is given chunk_size=100 — confirm
# which value is intended.
init_global_var(namespace='chunk200_overlap0', )
experiment_results = experiment(chunk_size=100, overlap=0, k=2, experiment_prefix='TEST')

In [None]:
# Create the LangSmith datasets from the JSON files in the repository, keeping existing ones.
# NOTE(review): this uses the global `client` set by init_global_var, and the experiments
# need the dataset to exist — presumably the cells are run in that order; confirm.
create_dataset(mode='github', overwrite=False)

In [None]:
# Run the chunk-size sweep with its default arguments.
chunk_size_tunning()

In [None]:
# Print the module-level `experiment_results` (assigned by the 'TEST' experiment cell) as a Markdown table.
print(experiment_results.to_pandas().to_markdown())