In [115]:
import re, os
import chromadb
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
from langchain.text_splitter import MarkdownTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from scraper import get_featured_article

load_dotenv()

True

In [78]:
article_text = get_featured_article()
print(f"\n\nArticle:\n", article_text)

Featured Article URL: https://www.deeplearning.ai/the-batch/issue-281/
Successfully fetched the featured article page!
Formatted Content for LLM:


Article:
 ## Introduction
Dear friends,
Is AI progressing rapidly? Yes! But while the progress of underlying AI technology has indeed sped up over the past 2 years, the fastest acceleration is in applications.
Consider this: GPT-4 was released March 2023. Since then, models have become much faster, cheaper, sometimes smaller, more multimodal, and better at reasoning, and many more open weight versions are available — so progress has been fantastic! (Claims that AI is “hitting a wall” seem extremely ill-informed.) But more significantly, many applications that  already were theoretically possible using the March 2023 version of GPT-4 — in areas such as customer service, question answering, and process automation — now have significant early momentum.
I’m confident 2025 will see even faster and more exciting advances than 2024 in both AI tech

## Get Article Chunks Function

In [79]:
def get_article_chunks(article_text):
    markdown_splitter = MarkdownTextSplitter(chunk_size = 2000, chunk_overlap = 200)
    docs = markdown_splitter.split_text(article_text)
    return docs


docs = get_article_chunks(article_text)
print(f"Length of docs: {len(docs)}")
for i, doc in enumerate(docs):
        print(f"Chunk #{i + 1}\n{doc}\n\n")

Length of docs: 15
Chunk #1
## Introduction
Dear friends,
Is AI progressing rapidly? Yes! But while the progress of underlying AI technology has indeed sped up over the past 2 years, the fastest acceleration is in applications.
Consider this: GPT-4 was released March 2023. Since then, models have become much faster, cheaper, sometimes smaller, more multimodal, and better at reasoning, and many more open weight versions are available — so progress has been fantastic! (Claims that AI is “hitting a wall” seem extremely ill-informed.) But more significantly, many applications that  already were theoretically possible using the March 2023 version of GPT-4 — in areas such as customer service, question answering, and process automation — now have significant early momentum.
I’m confident 2025 will see even faster and more exciting advances than 2024 in both AI technology and applications. Looking back, the one thing that could have stopped AI was bad, anti-competitive regulation that would ha

## Create Metadata Function (Article Link, Chunk Heading, Chunk Index)

In [147]:
def create_metadata(docs):
    headings = []
    chunk_index = []
    metadatas = []
    ids = []
    for index, doc in enumerate(docs):
        if doc.startswith('##'):
            pattern = re.compile(r'^##\s+(.*)$', re.MULTILINE)
            match = pattern.search(doc)
            heading = match.group(1)
            headings.append(match.group(1))
            metadatas.append({"Heading": heading, "source": index})
        else:
            words = doc.split()
            first_four = words[:4]
            heading = ' '.join(first_four)
            headings.append(heading)
            metadatas.append({"Heading": heading, "source": index})

        chunk_index.append(f"Chunk #{index}")
    # metadatas["Headings"] = headings
    # metadatas["Indexes"] = chunk_index
        ids.append(f"id{index}")
    return metadatas, ids

metadatas, ids = create_metadata(docs)
# print(f"Headings: \n{metadatas["Headings"]}\n\nChunk Indexes: \n{metadatas["Indexes"]}")
print(metadatas)
print(ids)

[{'Heading': 'Introduction', 'source': 0}, {'Heading': 'A Blizzard of Progress', 'source': 1}, {'Heading': 'Agents Ascendant', 'source': 2}, {'Heading': '- Throughout the year,', 'source': 3}, {'Heading': 'Prices Tumble', 'source': 4}, {'Heading': '- Makers of closed', 'source': 5}, {'Heading': 'Behind the news: Prominent', 'source': 6}, {'Heading': 'Generative Video Takes Off', 'source': 7}, {'Heading': '- Meta introduced Movie', 'source': 8}, {'Heading': 'Smaller Is Beautiful', 'source': 9}, {'Heading': '- The tide started', 'source': 10}, {'Heading': '- In 2006, Rich', 'source': 11}, {'Heading': 'Alternatives to Acquisitions', 'source': 12}, {'Heading': '- In October, Amazon', 'source': 13}, {'Heading': 'Where things stand: Giving', 'source': 14}]
['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id13', 'id14']


## ChromaDB Stuff

In [80]:
chroma_client = chromadb.Client()

In [81]:
collection = chroma_client.create_collection(name = "article_collection")

In [96]:
collection.add(ids = ids, metadatas = metadatas, documents = docs)

In [97]:
collection.peek()

{'ids': ['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9'],
 'embeddings': array([[-0.05990074, -0.09561614,  0.0620347 , ..., -0.1074878 ,
         -0.02350325,  0.04333426],
        [-0.02935752, -0.05063661, -0.01405945, ..., -0.04106434,
         -0.07007962,  0.01929996],
        [-0.05704041, -0.08026227, -0.04355863, ...,  0.03351628,
          0.00378328, -0.0132593 ],
        ...,
        [-0.04937019, -0.07073288, -0.0320917 , ..., -0.02720085,
          0.01391544,  0.0292466 ],
        [-0.08720752, -0.0737434 , -0.00401154, ..., -0.04853952,
          0.06667031,  0.03110035],
        [ 0.03579304, -0.05337286,  0.07313617, ..., -0.06324023,
         -0.00179799,  0.05418792]], shape=(10, 384)),
 'documents': ['## Introduction\nDear friends,\nIs AI progressing rapidly? Yes! But while the progress of underlying AI technology has indeed sped up over the past 2 years, the fastest acceleration is in applications.\nConsider this: GPT-4 was released March 202

In [98]:
collection.count()

15

In [100]:
response = collection.query(
    query_texts=["What is the new buzzword in AI?"],
    n_results=3,
    # where={"metadata_field": "is_equal_to_this"},
    # where_document={"$contains":"search_string"}
)
response

{'ids': [['id2', 'id1', 'id0']],
 'embeddings': None,
 'documents': [['## Agents Ascendant\nThe AI community laid the foundation for systems that can act by prompting large language models iteratively, leading to much higher performance across a range of applications.\nWhat happened: AI gained a new buzzword — agentic — as researchers, tool vendors, and model builders equipped large language models (LLMs) to make choices and take actions to achieve goals. These developments set the stage for an upswell of agentic activity in the coming year and beyond.\nDriving the story: Several tools emerged to help developers build agentic workflows.\n- Microsoft primed the pump for agentic development tools in late 2023 with Autogen, an open source conversational framework that orchestrates collaboration among multiple agents. (Learn how to take advantage of it in our short course “ AI Agentic Design Patterns with Autogen .”) In late 2024, part of the Autogen team split off to build AG2 based on a 

In [103]:
for doc in response['documents'][0]:
    print(f"{doc}\n\n")

## Agents Ascendant
The AI community laid the foundation for systems that can act by prompting large language models iteratively, leading to much higher performance across a range of applications.
What happened: AI gained a new buzzword — agentic — as researchers, tool vendors, and model builders equipped large language models (LLMs) to make choices and take actions to achieve goals. These developments set the stage for an upswell of agentic activity in the coming year and beyond.
Driving the story: Several tools emerged to help developers build agentic workflows.
- Microsoft primed the pump for agentic development tools in late 2023 with Autogen, an open source conversational framework that orchestrates collaboration among multiple agents. (Learn how to take advantage of it in our short course “ AI Agentic Design Patterns with Autogen .”) In late 2024, part of the Autogen team split off to build AG2 based on a fork of the code base.
- In October 2023, CrewAI released its open source P

## ChromaDB Stuff using LangChain

In [148]:
embeddings = OpenAIEmbeddings(model = "text-embedding-3-small")

vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings
)

In [127]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = "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"

In [149]:
vector_store.add_texts(texts=docs, ids=ids, metadatas=metadatas)

['id0',
 'id1',
 'id2',
 'id3',
 'id4',
 'id5',
 'id6',
 'id7',
 'id8',
 'id9',
 'id10',
 'id11',
 'id12',
 'id13',
 'id14']

In [152]:
vector_store.get(ids[:5])

{'ids': ['id0', 'id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'documents': ['## Introduction\nDear friends,\nIs AI progressing rapidly? Yes! But while the progress of underlying AI technology has indeed sped up over the past 2 years, the fastest acceleration is in applications.\nConsider this: GPT-4 was released March 2023. Since then, models have become much faster, cheaper, sometimes smaller, more multimodal, and better at reasoning, and many more open weight versions are available — so progress has been fantastic! (Claims that AI is “hitting a wall” seem extremely ill-informed.) But more significantly, many applications that\xa0 already were theoretically possible using the March 2023 version of GPT-4 — in areas such as customer service, question answering, and process automation — now have significant early momentum.\nI’m confident 2025 will see even faster and more exciting advances than 2024 in both AI technology and applications. Looking back, the one thing that could have 

In [136]:
results = vector_store.similarity_search(
    "What is the new buzzword in AI?",
    k = 2
)

for res in results:
    print(f"* {res.page_content}")

* ## Agents Ascendant
The AI community laid the foundation for systems that can act by prompting large language models iteratively, leading to much higher performance across a range of applications.
What happened: AI gained a new buzzword — agentic — as researchers, tool vendors, and model builders equipped large language models (LLMs) to make choices and take actions to achieve goals. These developments set the stage for an upswell of agentic activity in the coming year and beyond.
Driving the story: Several tools emerged to help developers build agentic workflows.
- Microsoft primed the pump for agentic development tools in late 2023 with Autogen, an open source conversational framework that orchestrates collaboration among multiple agents. (Learn how to take advantage of it in our short course “ AI Agentic Design Patterns with Autogen .”) In late 2024, part of the Autogen team split off to build AG2 based on a fork of the code base.
- In October 2023, CrewAI released its open source

In [137]:
results = vector_store.similarity_search_with_score(
    "What tools were introduced to create agentic AI applications?",
    k = 2
)

for res, score in results:
    print(f"{res.page_content}\n{score:3f}\n\n")

## Agents Ascendant
The AI community laid the foundation for systems that can act by prompting large language models iteratively, leading to much higher performance across a range of applications.
What happened: AI gained a new buzzword — agentic — as researchers, tool vendors, and model builders equipped large language models (LLMs) to make choices and take actions to achieve goals. These developments set the stage for an upswell of agentic activity in the coming year and beyond.
Driving the story: Several tools emerged to help developers build agentic workflows.
- Microsoft primed the pump for agentic development tools in late 2023 with Autogen, an open source conversational framework that orchestrates collaboration among multiple agents. (Learn how to take advantage of it in our short course “ AI Agentic Design Patterns with Autogen .”) In late 2024, part of the Autogen team split off to build AG2 based on a fork of the code base.
- In October 2023, CrewAI released its open source P

## Chains

In [166]:
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_openai import OpenAI, ChatOpenAI

llm = ChatOpenAI(model = "gpt-4o-mini", temperature=0.5)

chain = RetrievalQAWithSourcesChain.from_llm(llm = llm, retriever=vector_store.as_retriever())

query = "Summarize the article"

response = chain.invoke({"question": query}, return_only_outputs = True)

In [167]:
response["answer"]

'The article discusses significant advancements in AI throughout 2024, highlighting improvements in agentic systems that enhance reasoning and tool usage. Smaller, more capable AI models have emerged, and applications in customer service and process automation are gaining momentum. The author expresses optimism for further advancements in 2025, despite concerns about regulatory challenges. Key developments include new frameworks for multi-agent collaboration from Microsoft and CrewAI, and partnerships between major tech companies and AI startups to access cutting-edge technology and talent. Overall, the article emphasizes the rapid evolution of AI and encourages ongoing learning in this field.\n\n'

In [168]:
print(response['answer'].format())

The article discusses significant advancements in AI throughout 2024, highlighting improvements in agentic systems that enhance reasoning and tool usage. Smaller, more capable AI models have emerged, and applications in customer service and process automation are gaining momentum. The author expresses optimism for further advancements in 2025, despite concerns about regulatory challenges. Key developments include new frameworks for multi-agent collaboration from Microsoft and CrewAI, and partnerships between major tech companies and AI startups to access cutting-edge technology and talent. Overall, the article emphasizes the rapid evolution of AI and encourages ongoing learning in this field.




{'answer': 'The article discusses significant advancements in AI throughout 2024, highlighting improvements in agentic systems that enhance reasoning and tool usage. Smaller, more capable AI models have emerged, and applications in customer service and process automation are gaining momentum. The author expresses optimism for further advancements in 2025, despite concerns about regulatory challenges. Key developments include new frameworks for multi-agent collaboration from Microsoft and CrewAI, and partnerships between major tech companies and AI startups to access cutting-edge technology and talent. Overall, the article emphasizes the rapid evolution of AI and encourages ongoing learning in this field.\n\n',
 'sources': '1, 0, 2, 12'}

## Creating a Generic Scraper

### Get Articles' Links Function

In [209]:
import requests
from bs4 import BeautifulSoup
import time

base_url = "https://www.deeplearning.ai/the-batch/"

def get_website_html(url):
    response = requests.get(url)
    return response.text

base_url_html = get_website_html(base_url)
print(base_url_html)

<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>The Batch | DeepLearning.AI | AI News &amp; Insights</title><meta name="description" content="Weekly AI news for engineers, executives, and enthusiasts." data-sentry-element="meta" data-sentry-source-file="seo.tsx"/><link rel="canonical" href="https://www.deeplearning.ai/the-batch/"/><meta property="og:type" content="website" data-sentry-element="meta" data-sentry-source-file="seo.tsx"/><meta property="og:title" content="The Batch | DeepLearning.AI | AI News &amp; Insights" data-sentry-element="meta" data-sentry-source-file="seo.tsx"/><meta property="og:description" content="Weekly AI news for engineers, executives, and enthusiasts." data-sentry-element="meta" data-sentry-source-file="seo.tsx"/><meta property="og:site_name" content="The Batch | DeepLearning.AI | AI News &amp; Insights" data-sentry-element="meta" data-sentry-source-file="seo.tsx"/><meta property="og:url

In [210]:
def get_articles_links(base_url_html):
    soup = BeautifulSoup(base_url_html, "html.parser")
    article_links = []
    
    for a_tag in soup.find_all("a", href = True):
        href = a_tag['href']
        if href.startswith("/the-batch/issue"):
            full_url = f"https://www.deeplearning.ai{href}"
            if full_url not in article_links:
                article_links.append(full_url)
                time.sleep(0.5)
    return article_links

links = get_articles_links(base_url_html)

for i, link in enumerate(links):
    print(f"{i} -- {link}")

0 -- https://www.deeplearning.ai/the-batch/issue-281/
1 -- https://www.deeplearning.ai/the-batch/issue-260/
2 -- https://www.deeplearning.ai/the-batch/issue-259/
3 -- https://www.deeplearning.ai/the-batch/issue-280/
4 -- https://www.deeplearning.ai/the-batch/issue-279/
5 -- https://www.deeplearning.ai/the-batch/issue-278/
6 -- https://www.deeplearning.ai/the-batch/issue-277/
7 -- https://www.deeplearning.ai/the-batch/issue-276/
8 -- https://www.deeplearning.ai/the-batch/issue-275/
9 -- https://www.deeplearning.ai/the-batch/issue-274/
10 -- https://www.deeplearning.ai/the-batch/issue-273/
11 -- https://www.deeplearning.ai/the-batch/issue-272/
12 -- https://www.deeplearning.ai/the-batch/issue-271/
13 -- https://www.deeplearning.ai/the-batch/issue-270/
14 -- https://www.deeplearning.ai/the-batch/issue-269/
15 -- https://www.deeplearning.ai/the-batch/issue-268/
16 -- https://www.deeplearning.ai/the-batch/issue-267/
17 -- https://www.deeplearning.ai/the-batch/issue-266/


### Get Article Text Function

In [212]:
# def get_article_text(url: str) -> str:
#     """Function to fetch all articles from the home page"""
#     response = requests.get(url)
#     if response.status_code != 200:
#         print(f"Failed to fetch response from {url}. Exited with error: {response.status_code}")
#         return ""

#     article_soup = BeautifulSoup(response.text, "html.parser")

#     main_div = article_soup.find("div", class_ = "prose--styled justify-self-center post_postContent__wGZtc")
#     if not main_div:
#         print(f"Could not find main content container in {url}")
#         return ""
    
#     article_text = main_div.get_text(separator="\n", strip=True)
#     return article_text


# os.makedirs("articles", exist_ok=True)

# for i, link in enumerate(links):
#     article_text = get_article_text(link)
#     if not article_text:
#         print(f"Article text not found for url: {link}")
#         continue
#     slug = re.sub(r'https?://[^/]+/', '', link)  # remove scheme and domain
#     slug = slug.strip("/").replace("/", "_")     # turn '/the-batch/issue-281/' -> 'the-batch_issue-281'
#     file_name = f"article_{i}_{slug}.txt"
    
#     file_path = os.path.join("articles", file_name)
    
#     with open(file_path, "w", encoding="utf-8") as f:
#         f.write(article_text)
    
#     print(f"Saved: {file_path}")


Saved: articles/article_0_the-batch_issue-281.txt
Saved: articles/article_1_the-batch_issue-260.txt
Saved: articles/article_2_the-batch_issue-259.txt
Saved: articles/article_3_the-batch_issue-280.txt
Saved: articles/article_4_the-batch_issue-279.txt
Saved: articles/article_5_the-batch_issue-278.txt
Saved: articles/article_6_the-batch_issue-277.txt
Saved: articles/article_7_the-batch_issue-276.txt
Saved: articles/article_8_the-batch_issue-275.txt
Saved: articles/article_9_the-batch_issue-274.txt
Saved: articles/article_10_the-batch_issue-273.txt
Saved: articles/article_11_the-batch_issue-272.txt
Saved: articles/article_12_the-batch_issue-271.txt
Saved: articles/article_13_the-batch_issue-270.txt
Saved: articles/article_14_the-batch_issue-269.txt
Saved: articles/article_15_the-batch_issue-268.txt
Saved: articles/article_16_the-batch_issue-267.txt
Saved: articles/article_17_the-batch_issue-266.txt


In [224]:
print(len(links))
    

18


In [237]:
def get_article_titles(base_url_html):
    soup = BeautifulSoup(base_url_html, "html.parser")
    articles_titles = []
    article_cards = soup.select("div.p-6")
    for card in article_cards:
        h2_tag = card.find("h2")
        if not h2_tag:
            continue
        # print(card)
        link_tag = card.find("a", href=True)
        print(link_tag)
        if not link_tag:
            print("Article out of scope")
            continue

        title = h2_tag.get_text(strip=True)
        articles_titles.append(title)
    return articles_titles

titles = get_article_titles(base_url_html)
print(f"Total titles: {len(titles)}")
for title in titles:
    print(f"{title}\n")
    

<a href="/the-batch/tag/dec-25-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-white text-slate-500">Dec 25, 2024</div></a>
<a href="/the-batch/tag/dec-18-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500">Dec 18, 2024</div></a>
<a href="/the-batch/tag/dec-11-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500">Dec 11, 2024</div></a>
<a href="/the-batch/tag/dec-04-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500">Dec 04, 2024</div></a>
<a href="/the-batch/tag/nov-27-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500">Nov 27, 2024</div></a>
<a href="/the-batch/tag/nov-20-2024/"><div class="inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 re

str

In [219]:
#     slug = re.sub(r'https?://[^/]+/', '', link)  # remove scheme and domain
#     slug = slug.strip("/").replace("/", "_")     # turn '/the-batch/issue-281/' -> 'the-batch_issue-281'
#     file_name = f"article_{i}_{slug}.txt"
    
#     file_path = os.path.join("articles", file_name)
    
#     with open(file_path, "w", encoding="utf-8") as f:
#         f.write(article_text)
    
#     print(f"Saved: {file_path}")




# Div class name for Article Titles in Grid View:
# text-xl lg:text-2xl font-semibold tracking-tight leading-tight text-slate-800 font-primary mb-2

SyntaxError: invalid syntax (128902662.py, line 15)

In [271]:
def get_articles_links(base_url_html):
    base_url = "https://www.deeplearning.ai"
    soup = BeautifulSoup(base_url_html, "html.parser")
    article_links = []
    article_cards = soup.select("div.p-6")
    for card in article_cards:
        a_tag = card.find_all("a", href=True)
        if len(a_tag) < 2:
            print("Invalid article link (It is probably a featured article)...continuing")
            continue
        article_links.append(base_url + a_tag[1]["href"])
    return article_links
       

articles = get_articles_links(base_url_html)

Invalid article link (It is probably a featured article)...continuing


In [266]:
def get_featured_article_link(base_url_html):
    base_url = "https://www.deeplearning.ai"
    soup = BeautifulSoup(base_url_html, "html.parser")
    featured_article_div = soup.find("div", class_ = "col-span-1 lg:col-span-2")
    link = featured_article_div.find_all("a", href=True)
    if len(link) < 2:
        print("There was a problem fetching featured article's link")
        return ""
    return base_url + link[1]['href']
    


featured_article_link = get_featured_article_link(base_url_html)
print(featured_article_link)

https://www.deeplearning.ai/the-batch/issue-281/


In [272]:
articles


['https://www.deeplearning.ai/the-batch/issue-280/',
 'https://www.deeplearning.ai/the-batch/issue-279/',
 'https://www.deeplearning.ai/the-batch/issue-278/',
 'https://www.deeplearning.ai/the-batch/issue-277/',
 'https://www.deeplearning.ai/the-batch/issue-276/',
 'https://www.deeplearning.ai/the-batch/issue-275/',
 'https://www.deeplearning.ai/the-batch/issue-274/',
 'https://www.deeplearning.ai/the-batch/issue-273/',
 'https://www.deeplearning.ai/the-batch/issue-272/',
 'https://www.deeplearning.ai/the-batch/issue-271/',
 'https://www.deeplearning.ai/the-batch/issue-270/',
 'https://www.deeplearning.ai/the-batch/issue-269/',
 'https://www.deeplearning.ai/the-batch/issue-268/',
 'https://www.deeplearning.ai/the-batch/issue-267/',
 'https://www.deeplearning.ai/the-batch/issue-266/']

In [None]:
# STUFF

# get_featured_article_link return the link of the featured article only
# get_article_links returns the links of all articles on the main page except for the featured article.
# get_formatted_article_text saves the text of all articles (including the featured article) in the separate .txt files.

# Next STUFF
# Check wether get_article_titles are extracting all the titles and are in the order of the links extracted. Order is important for creating the metadatas later.
# Refactoring
# Create metadatas properly.
# Chunk all articles properly
# Store all articles in proper format in vector DB
# RAG chain