<a href="https://colab.research.google.com/github/lehuong240823/rag-company-knowledge-consultant-chatbot/blob/main/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get Started

In [106]:
# @title Install Libraries
!pip -q install pinecone
!pip -q install langchain_community
!pip -q install langchain_openai
!pip -q install langchain_pinecone
!pip -q install langchain-huggingface
!pip -q install langsmith

In [206]:
# @title Import Libraries
import os, yaml, inspect, json
from os.path import join
from google.colab import userdata
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate
from langchain_pinecone.vectorstores import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langsmith import Client, traceable

In [121]:
# @title Set Environment Variables
# API tokens and names are read from Colab's `userdata` secret store and
# exported as environment variables for the client libraries used below.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['LANGSMITH_API_KEY'] = userdata.get('LANGSMITH_API_KEY')
os.environ['LANGSMITH_ENDPOINT'] = 'https://api.smith.langchain.com'
# NOTE(review): the LangSmith project name is taken from the
# PINECONE_INDEX_NAME secret — confirm this reuse is intentional.
os.environ['LANGSMITH_PROJECT'] = userdata.get('PINECONE_INDEX_NAME')
os.environ['LANGSMITH_TRACING'] = 'true'
# The OpenAI-compatible settings point at the Hugging Face router and reuse the
# HF token as the API key; the trailing comments keep the OpenRouter alternative.
os.environ['OPENAI_API_BASE'] = 'https://router.huggingface.co/v1' #'https://openrouter.ai/api/v1'
os.environ['OPENAI_API_KEY'] = userdata.get('HF_TOKEN') #userdata.get('OPENROUTER_API_KEY')
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')
os.environ['PINECONE_INDEX_NAME'] = userdata.get('PINECONE_INDEX_NAME')
# Data directories, all derived from the ROOT_PATH secret.
root_path =  userdata.get('ROOT_PATH')
embedding_path = join(root_path, 'embedding_data')
reference_path = join(root_path, 'reference_data')
# NOTE(review): the directory is literally named 'evaluators_path' — confirm
# that matches the folder on disk.
evaluators_path = join(root_path, 'evaluators_path')

# Methods

In [None]:
# @title Get Pinecone Index
def get_index(index_name=None, dimension=4096, metric='cosine', cloud='aws', region='us-east-1'):
  """Return a handle to a Pinecone index, creating the index first if needed.

  Args:
    index_name: Name of the index. When None, the PINECONE_INDEX_NAME
      environment variable is read at call time (not at definition time), so
      a later change to the variable is picked up.
    dimension: Vector dimension used only when the index has to be created.
    metric: Similarity metric used only when the index has to be created.
    cloud: Serverless cloud provider used only when the index has to be created.
    region: Serverless region used only when the index has to be created.

  Returns:
    The object returned by `Pinecone().Index(index_name)`.

  Raises:
    KeyError: If index_name is None and PINECONE_INDEX_NAME is not set.
  """
  if index_name is None:
    index_name = os.environ['PINECONE_INDEX_NAME']

  pc = Pinecone()
  if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region)
    )

  return pc.Index(index_name)

In [None]:
# @title Create Text Splitter
def create_text_splitter(chunk_size=1000, chunk_overlap=0):
  """Build a CharacterTextSplitter configured with the given size and overlap.

  Args:
    chunk_size: Passed through as the splitter's `chunk_size`.
    chunk_overlap: Passed through as the splitter's `chunk_overlap`.

  Returns:
    The configured CharacterTextSplitter instance.
  """
  splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
  return CharacterTextSplitter(**splitter_options)

In [213]:
# @title Add Documents to Vector Store
def Loader(file):
  """Return a document loader chosen from the file's extension.

  Args:
    file: Path of the file to load; must end in '.csv' or '.txt'.

  Returns:
    A CSVLoader for '.csv' files or a TextLoader for '.txt' files.

  Raises:
    ValueError: If the extension is not supported. Without this check an
      unsupported file would fail on an unassigned local variable instead.
  """
  if file.endswith('.csv'):
    return CSVLoader(file)
  if file.endswith('.txt'):
    return TextLoader(file)
  raise ValueError(f'Unsupported file type for {file!r}; expected a .csv or .txt file.')

def add_documents(path, ids):
  """Load a file, split it into chunks and add the chunks to the vector store.

  Uses the notebook-level `text_splitter` and `vectorstore` objects, so the
  cell that creates them must have been run before this function is called.

  Args:
    path: Path of the file to load (see `Loader` for the supported types).
    ids: Prefix for the chunk ids; chunk number i is stored under '{ids}_{i}'.
  """
  documents = Loader(path).load()
  print(f'Loaded {len(documents)} document(s) from {path}')

  docs = text_splitter.split_documents(documents)
  if not docs:
    # Nothing to store; say so instead of reporting a successful add.
    print(f'No text chunks produced from {path}; nothing added to vector store.')
    return

  # Ids are derived from the prefix and chunk position, so loading the same
  # file again produces the same ids.
  chunk_ids = [f'{ids}_{i}' for i in range(len(docs))]
  vectorstore.add_documents(docs, ids=chunk_ids)
  print(f'Added {len(docs)} text chunk(s) to vector store.')

In [None]:
# Ad-hoc check: build a loader for one text file and print what it loads.
# NOTE(review): Loader selects TextLoader for this '.txt' name, but the argument
# is an https URL rather than a local path — confirm TextLoader can read it;
# otherwise the file needs to be downloaded first.
loader = Loader('https://github.com/lehuong240823/rag-company-knowledge-consultant-chatbot/landing_doanh_nghiep.txt')
documents = loader.load()
print(documents)

In [None]:

# Ingest every file found under reference_path into the vector store.
for root, dirs, files in os.walk(reference_path):
  for name in files:
    path = join(root, name)
    # Use the file name without its extension as the chunk-id prefix;
    # splitext handles extensions of any length, unlike slicing off 4 characters.
    add_documents(path, os.path.splitext(name)[0])



In [None]:
# @title Add Texts to Vector Store
def add_texts(text):
  """Split a raw string into chunks and add them to the vector store.

  Uses the notebook-level `text_splitter` and `vectorstore` objects.

  Args:
    text: The text to split and store.
  """
  chunks = text_splitter.split_text(text)
  vectorstore.add_texts(chunks)
  summary = f'Added {len(chunks)} text chunks to vector store.'
  print(summary)

In [None]:
# @title Get Evaluator Path
def get_evaluator_path(evaluator_name):
  """Return '<evaluators_path>/<evaluator_name>', the directory of one evaluator.

  Args:
    evaluator_name: Name of the evaluator's sub-directory.

  Returns:
    The notebook-level `evaluators_path` and the name joined with a '/'.
  """
  path_parts = (evaluators_path, evaluator_name)
  return '{}/{}'.format(*path_parts)

In [None]:
# @title Load Output Schema
def load_output_schema(evaluator_name):
  """Load the output schema of an evaluator from its 'output_schema.yaml'.

  Args:
    evaluator_name: Name of the evaluator whose directory holds the file.

  Returns:
    The parsed YAML content as returned by `yaml.safe_load`.

  Raises:
    FileNotFoundError: If the schema file does not exist.
  """
  file_path = f'{get_evaluator_path(evaluator_name)}/output_schema.yaml'
  # Read as UTF-8 explicitly so parsing does not depend on the platform's
  # default text encoding.
  with open(file_path, 'r', encoding='utf-8') as f:
    return yaml.safe_load(f)

In [None]:
# @title Load Chat Prompt
def load_chat_prompt(evaluator):
  """Build a ChatPromptTemplate from an evaluator's 'prompt.yaml'.

  The YAML file must have a top-level 'messages' list whose entries each carry
  a 'role' ('system', 'user' or 'ai') and a 'template' string.

  Args:
    evaluator: Name of the evaluator whose directory holds the file.

  Returns:
    A ChatPromptTemplate made of one message template per YAML entry, in order.

  Raises:
    FileNotFoundError: If the prompt file does not exist.
    KeyError: If an entry lacks 'role'/'template' or uses an unknown role.
  """
  file_path = f'{get_evaluator_path(evaluator)}/prompt.yaml'
  # Read as UTF-8 explicitly so non-ASCII prompt text does not depend on the
  # platform's default text encoding.
  with open(file_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

  role_map = {'system': SystemMessagePromptTemplate, 'user': HumanMessagePromptTemplate, 'ai': AIMessagePromptTemplate}

  messages = []
  for msg_config in config['messages']:
    role = msg_config['role']
    if role not in role_map:
      # Same exception type as a plain lookup, but it names the file and the valid roles.
      raise KeyError(f'Unknown role {role!r} in {file_path}; expected one of {sorted(role_map)}')
    messages.append(role_map[role].from_template(msg_config['template']))

  return ChatPromptTemplate.from_messages(messages)

In [112]:
# @title Create LangSmith Dataset
def create_dataset(file_name, dataset_name=None, provider='langsmith'):
  """Create a LangSmith dataset from a JSON file of examples if it is missing.

  The examples are read from '<embedding_path>/<provider>/<file_name>' and
  uploaded through the notebook-level `client`.

  Args:
    file_name: Name of the JSON file that holds the examples.
    dataset_name: Name of the dataset; defaults to file_name without its extension.
    provider: Sub-directory of `embedding_path` that contains the file.

  Returns:
    The newly created dataset, or None when a dataset with that name already
    exists (in which case nothing is read or uploaded).
  """
  if dataset_name is None:
    dataset_name = os.path.splitext(file_name)[0]

  if client.has_dataset(dataset_name=dataset_name):
    return None

  # Parse the file before creating anything so that a missing or malformed
  # file does not leave an empty dataset behind.
  with open(join(embedding_path, provider, file_name), encoding='utf-8') as f:
    examples = json.load(f)

  # The original returned right after create_dataset, so the examples were
  # never uploaded; keep the dataset handle and upload before returning.
  dataset = client.create_dataset(dataset_name=dataset_name)
  client.create_examples(
    dataset_id=dataset.id,
    examples=examples
  )
  return dataset

In [None]:
# @title Create Structured Grader LLM
def create_structured_grader_llm(model='qwen/qwen3-8b:free', temperature=0, evaluator=None):
  """Create a chat model whose output is constrained to an evaluator's schema.

  Args:
    model: Model id passed to ChatOpenAI.
    temperature: Sampling temperature passed to ChatOpenAI.
    evaluator: Name of the evaluator whose output schema is loaded with
      `load_output_schema`. Required despite the None default.

  Returns:
    The result of `ChatOpenAI(...).with_structured_output(...)` using the
    'json_schema' method in strict mode.

  Raises:
    ValueError: If evaluator is None; otherwise the schema lookup would be
      attempted for an evaluator literally named 'None'.
  """
  if evaluator is None:
    raise ValueError('evaluator is required: it names the directory that holds output_schema.yaml')

  # NOTE(review): the default model id differs from the one the notebook's main
  # `llm` uses ('Qwen/Qwen3-8B:featherless-ai') — confirm the configured
  # OPENAI_API_BASE serves this id.
  return ChatOpenAI(model=model, temperature=temperature).with_structured_output(
    load_output_schema(evaluator),
    method='json_schema', strict=True
  )

# Setup and Evaluators

In [None]:
# @title Initialize Components
# Creates the notebook-level objects (index, client, llm, embeddings,
# vectorstore, text_splitter, retriever, qa_chain) that the helper functions
# and evaluator cells refer to by name. Run this cell before calling them.
# NOTE(review): get_index() creates a missing index with dimension 4096 —
# confirm that matches the output size of the embedding model chosen below.
index = get_index()

# LangSmith client used for datasets and experiments.
client = Client()

#llm = ChatOpenAI(model='qwen/qwen3-8b:free')
llm = ChatOpenAI(model='Qwen/Qwen3-8B:featherless-ai')

embeddings = HuggingFaceEndpointEmbeddings(model='Qwen/Qwen3-Embedding-8B')

vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

text_splitter = create_text_splitter()

retriever = vectorstore.as_retriever()

# Conversational RAG chain: answers are combined with the 'qa_chain' prompt
# loaded from disk, and the retrieved documents are returned with the answer.
qa_chain = ConversationalRetrievalChain.from_llm(
  llm,
  retriever=retriever,
  combine_docs_chain_kwargs={'prompt': load_chat_prompt('qa_chain')},
  return_source_documents=True
)

# Make sure the dataset built from FQA.json exists (named 'FQA' by default).
create_dataset('FQA.json')

In [None]:
# Smoke test: ask the chain one question with an empty chat history and print
# the documents it retrieved.
chat_history = []
# The query is Vietnamese, roughly "which ward is this company in".
query = 'công ty này ở phường nào'
response = qa_chain.invoke({'question': query, 'chat_history': chat_history})

print(response['source_documents'])

[Document(id='46b8f2d3-955d-4652-82d6-b98c9b66f8ac', metadata={'source': '/content/drive/MyDrive/Colab Notebooks/public/RAG_ChatBot/dataset/introduction.txt'}, page_content='Mã số thuế\t\n2301025890 - Ngày cấp: 17/04/2018\nTên đơn vị\t\nCÔNG TY TNHH THƯƠNG MẠI VÀ ĐẦU TƯ TỔNG HỢP ANH PHÁT\nĐịa chỉ theo CQT\t\nNR ông Nguyễn Văn Trường, xóm Rừng, khu Bồ Sơn, Phường Võ Cường, Tỉnh Bắc Ninh, Việt Nam\nĐịa chỉ sau sáp nhập\t\nHệ thống tìm thấy 1 kết quả địa chỉ mới liên quan của MST 2301025890:\n\n- Địa chỉ 1: NR ông Nguyễn Văn Trường, xóm Rừng, khu Bồ Sơn, Phường Võ Cường, Tỉnh Bắc Ninh, Việt Nam\n\n- Căn cứ:\n\nPhường Võ Cường: Sắp xếp toàn bộ diện tích tự nhiên, quy mô dân số của các phường Đại Phúc, Phong Khê và Võ Cường thành phường mới có tên gọi là phường Võ Cường.\n(Thông tin mang tính tham khảo, để có thông tin chính xác vui lòng tra cứu từ website của Cục thuế hoặc Cổng thông tin doanh nghiệp quốc gia trước khi lập/xuất hóa đơn, chứng từ điện tử)\n\nTrạng thái\tNNT đang hoạt động

In [None]:
# @title Correctness
def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
  """Grade an answer against the reference answer with the grader LLM.

  Args:
    inputs: Example inputs; 'question' is read.
    outputs: Target outputs; 'answer' is read.
    reference_outputs: Reference outputs; 'answer' is read.

  Returns:
    The 'correct' field of the grader's structured output.
  """
  # The evaluator's name (used to locate its prompt and schema) is this
  # function's own name.
  evaluator = inspect.currentframe().f_code.co_name
  grader_llm = create_structured_grader_llm(evaluator=evaluator)
  prompt_template = load_chat_prompt(evaluator)
  prompt_values = {
    'question': inputs['question'],
    'reference': reference_outputs['answer'],
    'answer': outputs['answer'],
  }
  verdict = grader_llm.invoke(prompt_template.format(**prompt_values))
  return verdict['correct']

In [None]:
# @title Relevance
def relevance(inputs: dict, outputs: dict) -> bool:
  """Grade whether an answer is relevant to its question with the grader LLM.

  Args:
    inputs: Example inputs; 'question' is read.
    outputs: Target outputs; 'answer' is read.

  Returns:
    The 'relevant' field of the grader's structured output.
  """
  # The evaluator's name (used to locate its prompt and schema) is this
  # function's own name.
  evaluator = inspect.currentframe().f_code.co_name
  grader_llm = create_structured_grader_llm(evaluator=evaluator)
  prompt_template = load_chat_prompt(evaluator)
  prompt_values = {
    'question': inputs['question'],
    'answer': outputs['answer'],
  }
  verdict = grader_llm.invoke(prompt_template.format(**prompt_values))
  return verdict['relevant']

In [None]:
# @title Groundedness
def groundedness(inputs: dict, outputs: dict) -> bool:
  """Grade whether an answer is grounded in the retrieved documents.

  Args:
    inputs: Example inputs; not read by this evaluator.
    outputs: Target outputs; 'documents' (objects with `page_content`) and
      'answer' are read.

  Returns:
    The 'grounded' field of the grader's structured output.
  """
  # The evaluator's name (used to locate its prompt and schema) is this
  # function's own name.
  evaluator = inspect.currentframe().f_code.co_name
  grader_llm = create_structured_grader_llm(evaluator=evaluator)
  prompt_template = load_chat_prompt(evaluator)
  # One retrieved document per line.
  facts = '\n'.join(doc.page_content for doc in outputs["documents"])
  grading_prompt = prompt_template.format(facts=facts, answer=outputs['answer'])
  verdict = grader_llm.invoke(grading_prompt)
  return verdict['grounded']

In [None]:
# @title Retrieval relevance
def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
  """Grade the retrieved documents with the 'retrieval_relevance' grader.

  Args:
    inputs: Example inputs; not read by this evaluator.
    outputs: Target outputs; 'documents' (objects with `page_content`) and
      'answer' are read.

  Returns:
    The 'relevant' field of the grader's structured output.
  """
  # The evaluator's name (used to locate its prompt and schema) is this
  # function's own name.
  evaluator = inspect.currentframe().f_code.co_name
  grader_llm = create_structured_grader_llm(evaluator=evaluator)
  prompt_template = load_chat_prompt(evaluator)
  # One retrieved document per line.
  facts = '\n'.join(doc.page_content for doc in outputs["documents"])
  # NOTE(review): the documents are compared with the generated answer and the
  # question is never passed — confirm against the prompt template whether the
  # question was intended here.
  grading_prompt = prompt_template.format(facts=facts, answer=outputs['answer'])
  verdict = grader_llm.invoke(grading_prompt)
  return verdict['relevant']

In [None]:
# @title Target for experiment
@traceable()
def rag_bot(question: str) -> dict:
  """Answer one question with the QA chain, using an empty chat history.

  Args:
    question: The question to ask.

  Returns:
    A dict with 'answer' (the chain's answer) and 'documents' (the chain's
    source documents).
  """
  chain_inputs = {'question': question, 'chat_history': []}
  chain_result = qa_chain.invoke(chain_inputs)
  answer = chain_result['answer']
  documents = chain_result['source_documents']
  return {'answer': answer, 'documents': documents}

def target(inputs: dict) -> dict:
  """Experiment target: run `rag_bot` on the example's 'question'.

  Args:
    inputs: Example inputs; 'question' is read.

  Returns:
    Whatever `rag_bot` returns for that question.
  """
  question = inputs['question']
  return rag_bot(question)

In [None]:
# @title Experiment
def experiment():
  experiment_results = client.evaluate(
    target,
    data='FQA',
    evaluators=[correctness, relevance, groundedness, retrieval_relevance],
    experiment_prefix='rag-doc-relevance',
  )

In [None]:
experiment()

View the evaluation results for experiment: 'rag-doc-relevance-299079d8' at:
https://smith.langchain.com/o/00fa7863-e8c4-446a-9682-0dec08610f9c/datasets/ae8be8f8-25cb-40c3-aeae-5ef8b9c93069/compare?selectedSessions=22118175-c703-43bd-aed6-7f2c943ba064




0it [00:00, ?it/s]

ERROR:langsmith.evaluation._runner:Error running target function: Error code: 429 - {'error': {'message': 'Rate limit exceeded: free-models-per-day. Add 10 credits to unlock 1000 free model requests per day', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '50', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1757548800000'}, 'provider_name': None}}, 'user_id': 'user_31uNc4Fy7pr9cnTj6uhRqBKDZdV'}
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/langsmith/evaluation/_runner.py", line 1924, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "/tmp/ipython-input-2994525380.py", line 8, in target
    return rag_bot(inputs['question'])
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-2994525380.py", line 4, in rag_bot
    ai_msg = qa_chain.invoke({'question': question, 'chat_history': []})
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lan

In [None]:
# Print the experiment results as a Markdown table via a pandas DataFrame.
# NOTE(review): this needs a notebook-level `experiment_results`; make sure
# the cell that runs the experiment assigns it.
print(experiment_results.to_pandas().to_markdown())

|    | inputs.question                                                      | outputs.answer                                                                                                                                                                                                                                                 | outputs.documents                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        