In [3]:
import asyncio
from crawl4ai import *

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url = "https://www.deeplearning.ai/the-batch/issue-282/",
        )
        print(result.markdown)


await main()

[INIT].... → Crawl4AI 0.4.246
[FETCH]... ↓ https://www.deeplearning.ai/the-batch/issue-282/... | Status: True | Time: 3.74s
[SCRAPE].. ◆ Processed https://www.deeplearning.ai/the-batch/issue-282/... | Time: 107ms
[COMPLETE] ● https://www.deeplearning.ai/the-batch/issue-282/... | Status: True | Total: 3.94s
✨ New course! Enroll in [Reasoning with o1](https://www.deeplearning.ai/the-batch/issue-282/<https:/bit.ly/3ZXjdrf>)
[![](data:image/svg+xml,%3csvg%20xmlns=%27http://www.w3.org/2000/svg%27%20version=%271.1%27%20width=%27300%27%20height=%2792%27/%3e)![The Batch](https://www.deeplearning.ai/_next/image/?url=%2F_next%2Fstatic%2Fmedia%2Fdlai-batch-logo.a60dbb9f.png&w=640&q=75)](https://www.deeplearning.ai/the-batch/issue-282/</the-batch/>)
  * [Explore Courses](https://www.deeplearning.ai/the-batch/issue-282/</courses/>)
  * [AI Newsletter](https://www.deeplearning.ai/the-batch/issue-282/</the-batch/>)
    * [The Batch](https://www.deeplearning.ai/the-batch/issue-282/</the-batch/>)
    *

# Using LLMs to Extract content

In [40]:
import os, json
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import WebCrawler

url = "https://www.deeplearning.ai/the-batch/"

class ArticleContent(BaseModel):
    article_title: str
    article_heading: str
    article_link: str

strategy = LLMExtractionStrategy(
    provider='openai/gpt-4o',
    api_token=os.environ.get('OPENAI_API_KEY'),
    schema=ArticleContent.model_json_schema(),
    instruction="Extract the first article title"
)


async def main2():
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Run the crawler on a URL
        results = await crawler.arun(url=url, extraction_strategy=strategy)
        print(results.extracted_content)
        print("\n\nDone")


In [41]:
await main2()

[INIT].... → Crawl4AI 0.4.246
[FETCH]... ↓ https://www.deeplearning.ai/the-batch/... | Status: True | Time: 0.06s
[COMPLETE] ● https://www.deeplearning.ai/the-batch/... | Status: True | Total: 0.08s



Done


In [52]:
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
import os, json

class OpenAIModelFee(BaseModel):
    title: str = Field(..., description="Title of the weekly issue")
    # input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    # output_fee: str = Field(
    #     ..., description="Fee for output token for the OpenAI model."
    # )

async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Skip if API token is missing (for providers that require it)
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.deeplearning.ai/the-batch/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="""Extract all the articles titles."
                "{article_title: 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph'}.""",
                **extra_args
            ),
            cach_mode = CacheMode.ENABLED
        )
        print(json.loads(result.extracted_content)[:5])

# Usage:
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
# await extract_structured_data_using_llm("ollama/llama3.2")
await extract_structured_data_using_llm("openai/gpt-4o-mini", os.getenv("OPENAI_API_KEY"))


--- Extracting Structured Data with openai/gpt-4o-mini ---
[INIT].... → Crawl4AI 0.4.246
[FETCH]... ↓ https://www.deeplearning.ai/the-batch/... | Status: True | Time: 0.03s
[COMPLETE] ● https://www.deeplearning.ai/the-batch/... | Status: True | Total: 0.04s


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# Extracting the Article Dates

In [34]:
from bs4 import BeautifulSoup
from data_ingestion.utils.utils import get_website_html
THE_BATCH_URL = "https://www.deeplearning.ai/the-batch/"

base_url_html = get_website_html(THE_BATCH_URL)


def get_articles_publication_dates(base_url_html):
    soup = BeautifulSoup(base_url_html, "html.parser")
    article_dates = []
    featured_article_date = soup.find("div", class_ = "inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-white text-slate-500")
    article_dates.append(featured_article_date.text)
    dates_div = soup.find_all("div", class_ = "inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500")
    for div in dates_div:
        article_dates.append(div.text)
    return article_dates

dates = get_articles_publication_dates(base_url_html)


In [36]:
print(len(dates))
print(dates)

16
['Jan 01, 2025', 'Dec 25, 2024', 'Dec 18, 2024', 'Dec 11, 2024', 'Dec 04, 2024', 'Nov 27, 2024', 'Nov 20, 2024', 'Nov 13, 2024', 'Nov 06, 2024', 'Oct 30, 2024', 'Oct 23, 2024', 'Oct 16, 2024', 'Oct 09, 2024', 'Oct 02, 2024', 'Sep 25, 2024', 'Sep 18, 2024']


# **Preparing JSON for SwiftUI**

{  
    "article_link" "",  
    "article_title": "",  
    "article_publication_date": ""  
    "article_image_url: ""  
}  

In [1]:
from data_ingestion.vector_db import get_vector_store


vector_store = get_vector_store()

In [4]:
vector_db = vector_store.get()
print(vector_db.keys())

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])


In [5]:
vector_db['metadatas']

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'I’d like to focus',
  'source': 1},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'With these tactics, scrappy',
  'source': 2},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'AI Giants G

In [43]:
metadatas = vector_db['metadatas']
sorted_metadatas = []
for metadata in metadatas:
    if metadata['source'] == 0:
        sorted_metadatas.append(metadata)

sorted_metadatas = sorted(sorted_metadatas, key = lambda x:x['article_link'])
sorted_metadatas.reverse()
sorted_metadatas

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/',
  'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/',
  'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/',
  'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-279/',
  'article_title': 'Amazon Nova’s Competitive Price/Performance, OpenAI o1 Pro’s High Price/Performance, Google’s Game Worlds on Tap, Fact

In [21]:
from openai import OpenAI
client = OpenAI()

def generate_image(article_title):
    response = client.images.generate(
        model="dall-e-2",
        prompt=f"{article_title}",
        size="512x512",
        quality="standard",
        n=1,
    )
    return response.data[0].url

In [53]:
final_metadatas = []

for metadata in zip(sorted_metadatas, dates):
    new_metadata = metadata[0]
    new_metadata["article_publication_date"] = metadata[1]
    final_metadatas.append(new_metadata)


final_metadatas = [{k: v for k, v in metadata.items() if k != "source" and k != "chunk_heading"} for metadata in final_metadatas]
    

In [57]:
def generate_image(article_title):
    return "https://www.SampleImageURL.com"

print(len(final_metadatas))

for met in final_metadatas:
    image_url = generate_image(met['article_title'])
    met["article_image_url"] = image_url
    print(met, end = "\n\n")

16
{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/', 'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding', 'article_publication_date': 'Jan 01, 2025', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/', 'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph', 'article_publication_date': 'Dec 25, 2024', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/', 'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas', 'article_publication_date': 'Dec 18, 2024', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-279/'