<a href="https://colab.research.google.com/github/kavyajeetbora/nlp_rag/blob/master/langchain_masterclass/04_02_RAG_webcrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install -q langchain langchain_community langchain-openai chromadb randomname langchain_huggingface firecrawl-py

In [27]:
import os
import shutil
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from dotenv import load_dotenv

from langchain_community.document_loaders import WebBaseLoader, FireCrawlLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser  import StrOutputParser
import randomname
from glob import glob

In [24]:
if os.path.exists(".env"):
    os.remove(".env")

from google.colab import files
uploaded = files.upload()
if uploaded:
    if load_dotenv(".env"):
        print("Uploaded and Loaded Sucessfully")

Saving .env to .env
Uploaded and Loaded Sucessfully


## Using basic WebLoader

In [32]:
urls = ["https://www.motorola.in/"]

web_loader = WebBaseLoader(urls)
documents = web_loader.load()

In [33]:
for doc in documents:
    print(doc.metadata)
    print("Number of characters:",len(doc.page_content))

{'source': 'https://www.motorola.in/', 'title': 'motorola IN | Android phones & Razr', 'description': '#hellomoto | Discover our new unlocked Android phones from motorola and stay informed about our offers and promotions.', 'language': 'en-IN'}
Number of characters: 1028


## Using FireCrawl AI Scrapper

In [41]:
firecrawl_loader = FireCrawlLoader(url = urls, api_key = os.environ['FIRECRAWL_API_KEY'], mode='scrape')
documents = web_loader.load()

for doc in documents:
    print(doc.metadata)
    print("Number of characters:",len(doc.page_content))

{'source': 'https://www.motorola.in/', 'title': 'motorola IN | Android phones & Razr', 'description': '#hellomoto | Discover our new unlocked Android phones from motorola and stay informed about our offers and promotions.', 'language': 'en-IN'}
Number of characters: 1028


## Splitting the large documents into chunks

In [42]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size= 100,
    chunk_overlap=20
)
chunks = text_splitter.split_documents(documents)

print(f"The documents were splitted into {len(chunks)} chunks")

The documents were splitted into 12 chunks


## Storing the embeddings into vector database

In [43]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [44]:
os.makedirs("db", exist_ok=True)

def create_vector_database(chunks: list, embedding_model, name="vector_store"):

    random_suffix = randomname.get_name()

    persistent_directory = f"db/chroma-{name}-({random_suffix})"

    ## If already there, delete and create a new one

    for folder in glob(f"db/chroma-{name}*"):
        if os.path.exists(folder):
            shutil.rmtree(folder)

    os.mkdir(persistent_directory)

    vector_db = Chroma.from_documents(
        documents = chunks,
        collection_name = "movie_embeddings_v2",
        embedding = embedding_model,
        persist_directory = persistent_directory
    )

    return vector_db

In [45]:
%%time
vector_db = create_vector_database(chunks=chunks, embedding_model=embedding_model, name='assam_vector_store')

CPU times: user 113 ms, sys: 35.1 ms, total: 148 ms
Wall time: 962 ms


## Retrieving the data using a query

In [46]:
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [47]:
query = "List the mobiles that are under 20000 Rs ?"
results = retriever.invoke(query)

for result in results:
    print(result.page_content)
    print("-"*30)

Shipping*Fast Delivery* Republic day saleUNBEATABLE OFFERS ON  BEST SELLING SMARTPHONES
------------------------------
5gDealmoto g34 5g Dealmoto g04s moto g04Dealcomplete familyall phonesEar Budsmoto buds+moto
------------------------------
razrMotorola  razr 50  Motorola  razr 50 ultra motorola razr  40 ultra complete familymotorola
------------------------------


In [39]:
query = "List the mobiles that are under 20000 Rs ?"
results = retriever.invoke(query)

for result in results:
    print(result.page_content)
    print("-"*30)

Shipping*Fast Delivery* Republic day saleUNBEATABLE OFFERS ON  BEST SELLING SMARTPHONES
------------------------------
5gDealmoto g34 5g Dealmoto g04s moto g04Dealcomplete familyall phonesEar Budsmoto buds+moto
------------------------------
razrMotorola  razr 50  Motorola  razr 50 ultra motorola razr  40 ultra complete familymotorola
------------------------------
