In [None]:
!pip install -qq scrapy
!pip install langchain
!pip install -qU langchain-community faiss-cpu
!pip install -qU langchain-openai
!pip install --upgrade --quiet  rank_bm25
!pip install langchain_experimental
!pip install -U FlagEmbedding
!pip install --upgrade --quiet  langchain-google-genai
!pip install pyngrok
!pip install flask_cors

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (python-dotenv) into the process
# environment, then read the two API tokens. Each is None when unset.
load_dotenv()

ngrok_token, gemini_token = (
    os.environ.get(variable_name)
    for variable_name in ('NGROK_TOKEN', 'GEMINI_TOKEN')
)

# Data Crawling

In [None]:
import requests
import xml.etree.ElementTree as ET
import json

# Sitemaps to read product-page URLs from (add more entries as needed).
sitemap_urls = ['https://www.tiemtraannhien.vn/product-sitemap.xml']

# Filled in by fetch_sitemap(); reset here so re-running the cell starts clean.
all_urls = list()

# Function to fetch and parse XML
def fetch_sitemap(url, timeout=30):
    """Download one sitemap and append every ``<loc>`` URL to ``all_urls``.

    Args:
        url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. URLs are appended to the module-level ``all_urls`` list.
        Network, HTTP and XML-parsing errors are printed and swallowed so
        that the remaining sitemaps are still processed.
    """
    loc_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    try:
        # Without a timeout a stalled server would hang the cell forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        root = ET.fromstring(response.content)
    except (requests.RequestException, ET.ParseError) as e:
        print(f"Error fetching or parsing {url}: {e}")
        return

    # Extract all <loc> elements that contain the URLs
    for url_element in root.iter(loc_tag):
        # An empty <loc/> has text None, and pretty-printed sitemaps may pad
        # the URL with whitespace; neither may reach the crawler's start_urls.
        loc = (url_element.text or '').strip()
        if loc:
            all_urls.append(loc)

# Walk every configured sitemap; fetch_sitemap() appends into all_urls.
for sitemap_url in sitemap_urls:
    fetch_sitemap(sitemap_url)

# Persist the collected URLs as a JSON array for the crawling step.
with open('all_urls.json', 'w') as urls_file:
    urls_file.write(json.dumps(all_urls, indent=4))

url_total = len(all_urls)
print(f"Extracted {url_total} URLs and saved to all_urls.json")


In [None]:
import pandas as pd
import json
import re
# Specify the filename. Relative on purpose: this must be the same file the
# sitemap step wrote ('all_urls.json' in the working directory), not a
# Colab-only absolute path.
filename = 'all_urls.json'

# Load the all_urls list from the JSON file
with open(filename, 'r') as file:
    all_urls = json.load(file)

print(f"All URLs loaded from {filename}")


import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

class CustomSpider(scrapy.Spider):
    """Crawl every product page in ``all_urls`` and emit one item per page.

    Each yielded item is a dict with the keys ``url``, ``content``,
    ``price``, ``title`` and ``image_urls``. A page is only yielded when both
    a title and a non-empty description were found.
    """
    name = 'custom_spider'
    start_urls = all_urls

    # Direct children of the description panel whose text becomes one line each.
    TEXT_TAGS = ('h2', 'h3', 'p')

    # Number of responses handled so far (reported in the log).
    request_count = 0

    def parse(self, response):
        """Extract title, price, description and gallery images from one page."""
        self.request_count += 1  # Increment the counter with each request

        # Product title: text of the <h1 class="product-title"> element.
        title = (response.css('h1.product-title::text').get() or '').strip()
        price = self._extract_price(response)
        description = self._extract_description(response)
        image_urls = self._extract_image_urls(response)

        if title and description:
            yield {
                "url": response.url,  # Add the URL of the request
                "content": description,
                "price": price,
                "title": title,
                "image_urls": image_urls,
            }

        # Per-page dumps are debug-level so a full crawl does not flood the
        # notebook output; set LOG_LEVEL to DEBUG to see them.
        self.logger.debug(
            "title=%r price=%r image_urls=%r description=%r",
            title, price, image_urls, description,
        )
        self.logger.info(f"Number of requests done: {self.request_count}")
        self.logger.info(f"Crawled: {response.url}")

    def _extract_price(self, response):
        """Return the price: the plain amount if the regex matches, otherwise
        the raw HTML of the first price element ('' when there is none)."""
        price_html = (response.css('span.woocommerce-Price-amount').get() or '').strip()
        match = re.search(r'>([\d.,]+₫)<', price_html)
        return match.group(1) if match else price_html

    def _extract_description(self, response):
        """Return the description panel as newline-terminated text lines."""
        lines = []
        for panel in response.css('div.woocommerce-Tabs-panel--description'):
            # Only direct children are inspected; other tags are ignored.
            for element in panel.xpath('./*'):
                tag = element.root.tag
                if tag in self.TEXT_TAGS:
                    lines.append(' '.join(element.css('::text').getall()).strip())
                elif tag == 'ul':
                    # One "- " bullet line per list item
                    for li_tag in element.css('li'):
                        lines.append(f"- {' '.join(li_tag.css('::text').getall()).strip()}")
        return ''.join(line + "\n" for line in lines)

    def _extract_image_urls(self, response):
        """Return the 'data-large_image' URL of each gallery image that has one."""
        candidates = (
            element.css('img::attr(data-large_image)').get()
            for element in response.css('div.woocommerce-product-gallery__image')
        )
        return [image_url for image_url in candidates if image_url]


# Initialize the Scrapy crawler process
process = CrawlerProcess({
    'LOG_LEVEL': 'INFO',
    'FEEDS': {
        'output.json': {
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'fields': None,
            'indent': 4,
            # Local-file feeds append by default; a second JSON array appended
            # to an existing output.json makes the file unparseable, so start
            # from a clean file on every run.
            'overwrite': True,
        },
    },
    # Value is in seconds; this is large enough to act as "no time limit".
    'CLOSESPIDER_TIMEOUT': 60000000000,
    # Base delay of 3 seconds between consecutive requests to the site.
    'DOWNLOAD_DELAY': 3,
})

# Start the spider. start() blocks until the crawl has finished.
# NOTE(review): Twisted's reactor normally cannot be started twice in one
# kernel, so restart the kernel before re-running this cell.
process.crawl(CustomSpider)
process.start()


# Data Preprocessing

In [4]:
import json

# Read the scraped product records written by the crawler.
with open('output.json', mode='r', encoding="utf8") as scraped_file:
    data = json.loads(scraped_file.read())

In [5]:
len(data)

47

In [None]:
data[0]['content']

In [7]:
# Collapse every run of non-space whitespace (newlines, tabs, ...) in the
# product descriptions into a single space.
import re
non_space_whitespace = re.compile(r'[^\S ]+')
for record in data:
  record['content'] = non_space_whitespace.sub(' ', record['content'])

In [8]:
introduction = "Tiệm trà An Nhiên chuyên cung cấp các sản phẩm chăm sóc sắc đẹp và sức khỏe có thành phần từ thiên nhiên với công nghệ sản xuất khác biệt, trong đó có thể kể đến các sản phẩm như trà dưỡng tâm an thần, trà gừng đường nâu thảo mộc, trà dưỡng nhan, trà gạo lứt đậu đen, bột cần tây nguyên chất, bột củ sen ngũ cốc dinh dưỡng,…xuất xứ tự nhiên không chứa chất bảo quản, không hóa chất độc hại và rất lành tính. Những nguyên liệu của Tiệm trà An Nhiên đều được tuyển chọn kỹ càng để có thể mang đến cho khách hàng trải những trải nghiệm tốt nhất!"

In [None]:
# Remove the shop introduction from every description that contains it.
# `count` is how many records were changed; `exception` collects the
# records that did not contain the introduction.
exception = []
count = 0
for record in data:
  text = record['content']
  if introduction not in text:
    exception.append(record)
    continue
  record['content'] = text.replace(introduction, '')
  count += 1
print(count)

In [10]:
warranty = "TIỆM TRÀ AN NHIÊN – NUÔI DƯỠNG VẺ ĐẸP TỪ SÂU BÊN TRONG  ⚫ Cam kết 100% nguyên liệu từ thiên nhiên, không chất phụ gia, chất bảo quản, chất hóa học.  ⚫ Sản phẩm có giấy tờ chứng nhận đầy đủ  ⚫ Nhiệt tình hỗ trợ, tư vấn khách hàng 24/7.  ⚫ Miễn phí trả hàng 07 ngày theo quy định của Shopee. QUY ĐỊNH HỖ TRỢ ĐỔI TRẢ HÀNG CỦA TIỆM TRÀ AN NHIÊN:  ❶ Điều kiện áp dụng (trong vòng 07 ngày kể từ khi nhận sản phẩm)  – Hàng hoá vẫn còn mới, chưa qua sử dụng  – Hàng hoá bị lỗi hoặc hư hỏng do vận chuyển hoặc do nhà sản xuất ❷ Trường hợp được chấp nhận:  – Hàng không đúng loại như quý khách đặt hàng  – Không đủ số lượng như trong đơn hàng ❸ Trường hợp không đủ điều kiện áp dụng chính sách:  – Quá 07 ngày kể từ khi Quý khách nhận hàng  – Gửi lại hàng không đúng mẫu mã, không phải sản phẩm của Tiệm trà An Nhiên"

In [None]:
# Remove the warranty / return-policy text from every description that
# contains it. `count` is how many records were changed; `exception`
# collects the records that did not contain the text.
exception = []
count = 0
for record in data:
  text = record['content']
  if warranty not in text:
    exception.append(record)
    continue
  record['content'] = text.replace(warranty, '')
  count += 1
print(count)

# Convert all the records into LangChain's Document format

In [12]:
from uuid import uuid4
from bs4 import BeautifulSoup
from langchain_core.documents import Document

In [13]:
list_of_documents = []
for item in data:
  # Extract the price value from HTML tag
  soup = BeautifulSoup(item['price'], 'html.parser')
  # The crawler stores the raw price markup, an already-extracted plain
  # amount, or '' when the page had no price. Only the first form contains a
  # <bdi> tag, so fall back to the whole text instead of crashing on None.
  bdi = soup.bdi
  price = (bdi.get_text() if bdi is not None else soup.get_text()).strip()
  content = item['title'] + ' - Giá tiền: ' + price + ' - ' + item['content']
  # Convert the records to LangChain's Document format and append them to a list
  list_of_documents.append(Document(page_content=content, metadata={"source": item['url'],
                                                                    "image_urls":item['image_urls']}))

In [14]:
#append introduction and warranty
# Guarded so that re-running this cell does not add the same two documents
# a second time (duplicates would also be indexed twice later on).
existing_texts = {doc.page_content for doc in list_of_documents}
for shop_text in (introduction, warranty):
  if shop_text not in existing_texts:
    list_of_documents.append(Document(page_content=shop_text))
    existing_texts.add(shop_text)

In [15]:
from uuid import uuid4

# One random UUID string per document, in document order.
uuids = [str(uuid4()) for _document in list_of_documents]

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model used for the vector index; authenticated with the Gemini token.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=gemini_token,
    model="models/text-embedding-004",
)

In [None]:
# Initialize the FAISS vector store
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy

# Embed a throwaway sentence once to learn the vector size the model
# produces, then create a flat L2 index of that size.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Empty store: documents are added in a later cell.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id=dict(),
    docstore=InMemoryDocstore(),
)

In [None]:
vector_store.add_documents(documents=list_of_documents, ids=uuids)

In [None]:
# Semantic (vector) search example: top 10 documents for a sample question.
example_query = "Cho tôi một loại trà có chiết xuất từ hoa hồng"
semantic_results = vector_store.similarity_search(example_query, k=10)
for hit in semantic_results:
    print(f"* {hit.page_content} [{hit.metadata}]")

In [20]:
# Using BM25 retriever from LangChain
from langchain_community.retrievers import BM25Retriever

In [21]:
# Keyword (BM25) retriever over the same documents, returning 10 hits per query.
bm25_retriever = BM25Retriever.from_documents(list_of_documents, k=10)

In [None]:
# Keyword search example: same sample question as the semantic search.
bm25_query = "Cho tôi một loại trà có chiết xuất từ hoa hồng"
bm25_results = bm25_retriever.invoke(bm25_query)
for hit in bm25_results:
    print(f"* {hit.page_content} [{hit.metadata}]")

In [None]:
# Merge the FAISS hits and the BM25 hits (in that order), keeping only the
# first document seen for each distinct text.
content = set()
retrieval_docs = []

for result in (*semantic_results, *bm25_results):
  if result.page_content in content:
    continue
  content.add(result.page_content)
  retrieval_docs.append(result)

len(retrieval_docs)

In [None]:
# Show the merged candidate list that will be handed to the reranker.
for merged_doc in retrieval_docs:
    print(f"* {merged_doc.page_content} [{merged_doc.metadata}]")

In [None]:
# Score every merged candidate against the sample question with the
# BAAI/bge-reranker-v2-m3 cross-encoder.
from FlagEmbedding import FlagReranker
# use_fp16=True speeds up computation with a slight performance degradation
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
rerank_query = "Cho tôi một loại trà có chiết xuất từ hoa hồng"
pairs = [[rerank_query, candidate.page_content] for candidate in retrieval_docs]
score = reranker.compute_score(pairs, normalize=True)
score

In [26]:
# Put all the methods above into a class called 'Retriever'

class Retriever:
  """Hybrid retriever: semantic search + BM25, de-duplicated and reranked.

  Calling an instance with a query returns a list of document texts:
  every candidate scoring at least ``high_threshold`` if there is one,
  otherwise every candidate scoring at least ``low_threshold``, otherwise
  an empty list.
  """

  def __init__(self, semantic_retriever, bm25_retriever, reranker,
               k=10, high_threshold=0.6, low_threshold=0.1):
    """
    Args:
      semantic_retriever: object with ``similarity_search(query, k=...)``.
      bm25_retriever: object with ``invoke(query)``.
      reranker: object with ``compute_score(pairs, normalize=True)``.
      k: number of hits requested from the semantic retriever.
      high_threshold: minimum score for the high-relevance tier.
      low_threshold: minimum score for the fallback tier.
    """
    self.semantic_retriever = semantic_retriever
    self.bm25_retriever = bm25_retriever
    self.reranker = reranker
    self.k = k
    self.high_threshold = high_threshold
    self.low_threshold = low_threshold

  def __call__(self,query):
    """Return the texts of the most relevant documents for ``query``."""
    semantic_results = self.semantic_retriever.similarity_search(query, k=self.k)
    bm25_results = self.bm25_retriever.invoke(query)

    # Merge both result lists, keeping the first document for each text.
    seen_texts = set()
    retrieval_docs = []
    for result in (*semantic_results, *bm25_results):
      if result.page_content not in seen_texts:
        seen_texts.add(result.page_content)
        retrieval_docs.append(result)

    # Nothing to score: do not call the reranker with an empty batch.
    if not retrieval_docs:
      print('No relevant context')
      return []

    pairs = [[query, doc.page_content] for doc in retrieval_docs]
    scores = self.reranker.compute_score(pairs, normalize=True)
    # The reranker may hand back a bare number instead of a list when there
    # is exactly one pair; normalise so the scores can be zipped below.
    try:
      scores = list(scores)
    except TypeError:
      scores = [scores]

    context_1 = []  # high relevance
    context_2 = []  # medium / lower relevance, used only when tier 1 is empty
    for doc, doc_score in zip(retrieval_docs, scores):
      if doc_score >= self.high_threshold:
        context_1.append(doc.page_content)
      elif doc_score >= self.low_threshold:
        context_2.append(doc.page_content)
      # Anything below low_threshold is treated as irrelevant and dropped.

    if context_1:
      print('Context 1')
      return context_1
    if context_2:
      print('Context 2')
      return context_2
    print('No relevant context')
    return []

In [None]:
# Smoke-test the hybrid retriever with a short keyword query.
retriever = Retriever(
    semantic_retriever=vector_store,
    bm25_retriever=bm25_retriever,
    reranker=reranker,
)
context = retriever("Hoa hồng")
context

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import AIMessage,HumanMessage
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
# Chat model that writes the final answer.
answerModel = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.5,
    google_api_key=gemini_token,
)

In [32]:
# Prompt for the answer model. It is filled with three variables:
# {query} (the customer's question), {chat_history} and
# {source_information} (the retrieved product texts).
answerPrompt = PromptTemplate.from_template("""
     Bạn là chuyên viên tư vấn khách hàng của tiệm trà thảo mộc An Nhiên.
     Câu hỏi của khách hàng: {query}\nChat History: {chat_history}\nTrả lời câu hỏi dựa vào Chat History và các thông tin sản phẩm của cửa hàng dưới đây: {source_information}.
 """)

In [33]:
answerChain = answerPrompt | answerModel

In [None]:
query = 'Tư vấn cho tôi loại trà có nguyên liệu là củ sen'
# Build the prompt inputs the same way the /v1/chat endpoint does: retrieved
# texts joined one per line (not a Python list repr) and an empty list for
# the chat history (not None), so this example shows what the API produces.
source_information = "".join(f"{doc}\n" for doc in retriever(query))
answerChain.invoke({"query":query, "source_information":source_information, "chat_history":[]})

# Expose API

In [37]:
import os
import json
from google.colab import userdata
from pyngrok import ngrok
from flask import Flask, jsonify, request
from flask_cors import CORS

In [None]:

# Web application object; CORS is enabled for the whole app.
app = Flask(__name__)
CORS(app)

# Authenticate the ngrok client so a public tunnel can be opened later.
ngrok.set_auth_token(ngrok_token)

@app.route('/v1/chat', methods=['POST'])
def chat_v1():
    """Answer one customer message using retrieved product information.

    Expected JSON body:
        message: object whose ``human`` key holds the customer's question.
        context: optional chat history (defaults to an empty list).
        stream:  optional flag; when true (the default, matching the previous
                 always-streaming behaviour) the answer is streamed as plain
                 text, otherwise a JSON ``{"response": ...}`` is returned.

    Returns:
        HTTP 400 with an ``error`` field when the body is not a JSON object
        or ``message.human`` is missing or empty; otherwise the model's answer.
    """
    # Validate the request instead of failing with a 500 on a bad payload.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    user_message = payload.get('message')
    question = user_message.get('human') if isinstance(user_message, dict) else None
    if not isinstance(question, str) or not question.strip():
        return jsonify({'error': "Field 'message.human' is required"}), 400

    chat_history = payload.get('context', [])
    stream = bool(payload.get('stream', True))

    print(f'Message: {user_message}')
    print(f'Chat History: {chat_history}')

    # The retriever returns plain document texts; put one per line.
    source_information = "".join(f"{doc}\n" for doc in retriever(question))
    chain_input = {
        "query": question,
        "chat_history": chat_history,
        "source_information": source_information,
    }

    if stream:
        def generate():
            # Forward each model chunk to the client as soon as it arrives.
            for chunk in answerChain.stream(chain_input):
                yield chunk.content
        return app.response_class(generate(), mimetype='text/plain')

    response = answerChain.invoke(chain_input)
    return jsonify({'response': response.content})

if __name__ == '__main__':
    # The tunnel and the Flask server must use the same local port.
    port = 5000

    # Open the public ngrok tunnel first so its address is printed before
    # app.run() starts serving.
    public_url = ngrok.connect(port)
    print(f" * ngrok tunnel: {public_url}")

    # Start Flask app
    app.run(port=port)