In [None]:
!pip install -q langchain-community yt-dlp
!pip install --upgrade yt-dlp langchain-community
!pip install langchain-groq


In [2]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import json
from dotenv import load_dotenv
import os

load_dotenv()

llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    temperature=0.7
)

prompt = ChatPromptTemplate(messages=["Write a poem about {word}, 100 words maximum"])
chain = prompt | llm 
response = chain.invoke({"word": "moon"})
print(response.content)

Silver crescent in the night,
The moon glows with gentle light.
Her phases mark the passage of time,
From new to full, a constant rhyme.

With gentle beams, she illuminates,
The darkness, and our soul creates,
A sense of peace, a sense of rest,
Under the moon's soft, lunar nest.


In [None]:
!pip install unstructured beautifulsoup4

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_sub_urls(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")

    all_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(base_url, href)

        # Optional: filter out non-HTTP(S) and non-visible links like mailto: or JavaScript
        if full_url.startswith("http"):
            all_links.add(full_url)

    return all_links

In [5]:
base_url = "https://docs.python.org/3/tutorial/index.html"
len(get_sub_urls(base_url))

156

In [6]:
base_url = "https://www.tensorflow.org/tutorials"
len(get_sub_urls(base_url))

167

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote

def get_sub_urls_with_metadata(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")

    results = set()
    cleaned_results = []

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        full_url = urljoin(base_url, href)

        if full_url.startswith("http") and full_url not in results:
            results.add(full_url)

            # Extract anchor text
            anchor_text = a_tag.get_text(strip=True)

            # Extract last part of path or fragment
            parsed_url = urlparse(full_url)
            last_part = unquote(parsed_url.fragment or parsed_url.path.rstrip("/").split("/")[-1])

            # Combine anchor text + last part of URL for better metadata
            full_text = anchor_text
            if last_part and last_part.lower() not in anchor_text.lower():
                full_text += f" ({last_part})"

            cleaned_results.append({
                "url": full_url,
                "text": full_text
            })

    return cleaned_results


In [8]:
base_url = "https://docs.python.org/3/tutorial/index.html"
get_sub_urls_with_metadata(base_url)[10:15]

[{'url': 'https://docs.python.org/3/library/index.html#library-index',
  'text': 'The Python Standard Library (library-index)'},
 {'url': 'https://docs.python.org/3/reference/index.html#reference-index',
  'text': 'The Python Language Reference (reference-index)'},
 {'url': 'https://docs.python.org/3/extending/index.html#extending-index',
  'text': 'Extending and Embedding the Python Interpreter (extending-index)'},
 {'url': 'https://docs.python.org/3/c-api/index.html#c-api-index',
  'text': 'Python/C API Reference Manual (c-api-index)'},
 {'url': 'https://docs.python.org/3/glossary.html#glossary',
  'text': 'Glossary'}]

In [9]:
base_url = "https://www.tensorflow.org/tutorials"
sublinks_tf = get_sub_urls_with_metadata(base_url)
sublinks_tf[10:15]

[{'url': 'https://js.tensorflow.org/api/latest/',
  'text': 'TensorFlow.js (latest)'},
 {'url': 'https://www.tensorflow.org/lite/api_docs',
  'text': 'TensorFlow Lite (api_docs)'},
 {'url': 'https://www.tensorflow.org/tfx/api_docs', 'text': 'TFX (api_docs)'},
 {'url': 'https://www.tensorflow.org/resources/models-datasets',
  'text': 'Ecosystem (models-datasets)'},
 {'url': 'https://www.tensorflow.org/js',
  'text': 'TensorFlow.jsDevelop web ML applications in JavaScript'}]

In [None]:
! pip install sentence-transformers

In [11]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2') 

link_texts = [item["text"] for item in sublinks_tf]
link_embeddings = model.encode(link_texts, convert_to_tensor=True)
query = "How do I detect objects in TensorFlow?"
query_embedding = model.encode(query, convert_to_tensor=True)

cos_scores = util.pytorch_cos_sim(query_embedding, link_embeddings)[0]

top_indices = cos_scores.topk(k=3).indices

for idx in top_indices:
    print(f"{sublinks_tf[idx]['text']}: {sublinks_tf[idx]['url']}")


  from .autonotebook import tqdm as notebook_tqdm


Using TensorFlow Datasets (overview): https://www.tensorflow.org/datasets/overview
Object detection with TF Hub (tf2_object_detection): https://www.tensorflow.org/hub/tutorials/tf2_object_detection
Stack Overflow (tensorflow): https://stackoverflow.com/questions/tagged/tensorflow


In [18]:
from langchain.document_loaders import WebBaseLoader
urls = set()
for idx in top_indices[:2]:
    urls.add(sublinks_tf[idx]['url'])
loader = WebBaseLoader(list(urls))
documents = loader.load()
documents



In [26]:
len(documents[0].page_content.split(" "))

5928

In [25]:
len(documents[1].page_content.split(" "))

10857

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=50
)
chunks = text_splitter.split_documents(documents)

In [35]:
len(chunks)

77

In [38]:
for chunk in chunks[:20]:
    print(len(chunk.page_content),">",len(chunk.page_content.split(" ")))

1912 > 1106
1958 > 869
1790 > 516
1829 > 222
1997 > 105
1995 > 112
1997 > 107
1996 > 94
1997 > 73
1988 > 63
1985 > 71
1981 > 70
1985 > 70
1978 > 74
1987 > 74
1962 > 76
844 > 28
1994 > 207
1896 > 222
1802 > 216


In [43]:
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Obtaining dependency information for faiss-cpu from https://files.pythonhosted.org/packages/ed/83/8aefc4d07624a868e046cc23ede8a59bebda57f09f72aee2150ef0855a82/faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl.metadata
  Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [44]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings()

faiss_index = FAISS.from_documents(documents=chunks,embedding=embedding_model)

  embedding_model = HuggingFaceEmbeddings()


In [None]:
query = 'How do I detect objects in TensorFlow?'

results = faiss_index.similarity_search(query, k=3)

for res in results:
    print(res.page_content)


In [46]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=faiss_index.as_retriever())
chain



In [48]:
query = "How do I detect objects in TensorFlow?"
result = chain({"question": query})
result

{'question': 'How do I detect objects in TensorFlow?',
 'answer': 'To detect objects in TensorFlow, you can use the TensorFlow Hub Object Detection Colab, which provides pre-trained object detection models. You can select a model architecture and load pre-trained model weights. The TensorFlow Object Detection API can be used to visualize the results, including drawing boxes and labels on the image.\n\n',
 'sources': ''}