# Extracting the Article Dates

In [34]:
from bs4 import BeautifulSoup
from data_ingestion.utils.utils import get_website_html
# Landing page of The Batch; every scraping cell in this notebook parses this page.
THE_BATCH_URL = "https://www.deeplearning.ai/the-batch/"

# get_website_html is a project helper; its result is handed to BeautifulSoup as
# markup by the cells below, so it is kept in one variable and re-parsed as needed.
base_url_html = get_website_html(THE_BATCH_URL)


def get_articles_publication_dates(base_url_html):
    """Collect the publication-date labels shown on The Batch landing page.

    The featured article's date badge carries a ``bg-white`` class while the
    remaining badges carry ``bg-slate-100``, so the two are looked up
    separately and the featured date is placed first.

    Args:
        base_url_html: Markup of the landing page; parsed with "html.parser".

    Returns:
        list[str]: Text of each date badge, featured article first, followed by
        the other badges in document order. If no featured badge is found, an
        empty string occupies the first slot so that positions stay aligned
        with the other per-article lists this notebook zips the dates with.
    """
    featured_date_class = "inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-white text-slate-500"
    card_date_class = "inline-flex rounded-md py-1 px-3 text-[13px] font-medium mb-3 relative z-10 bg-slate-100 text-slate-500"

    soup = BeautifulSoup(base_url_html, "html.parser")

    # soup.find returns None when nothing matches; reading .text from None
    # would raise AttributeError, so fall back to an empty placeholder.
    featured_article_date = soup.find("div", class_=featured_date_class)
    article_dates = [featured_article_date.text if featured_article_date is not None else ""]

    article_dates.extend(div.text for div in soup.find_all("div", class_=card_date_class))
    return article_dates

# Scrape the date labels from the landing-page markup held in base_url_html.
dates = get_articles_publication_dates(base_url_html)


In [36]:
# Quick check: how many dates were collected, and what the labels look like.
print(len(dates))
print(dates)

16
['Jan 01, 2025', 'Dec 25, 2024', 'Dec 18, 2024', 'Dec 11, 2024', 'Dec 04, 2024', 'Nov 27, 2024', 'Nov 20, 2024', 'Nov 13, 2024', 'Nov 06, 2024', 'Oct 30, 2024', 'Oct 23, 2024', 'Oct 16, 2024', 'Oct 09, 2024', 'Oct 02, 2024', 'Sep 25, 2024', 'Sep 18, 2024']


# **Preparing JSON for SwiftUI**

{  
    "article_link": "",  
    "article_title": "",  
    "article_publication_date": "",  
    "article_image_url": ""  
}  

In [1]:
from data_ingestion.vector_db import get_vector_store


# Handle to the project's vector store; the following cells read article metadata from it.
vector_store = get_vector_store()

In [4]:
# Call get() with no arguments and list which fields the returned mapping exposes.
vector_db = vector_store.get()
print(vector_db.keys())

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])


In [5]:
# Metadata records; in the output below several records share one article_link and differ in 'source'.
vector_db['metadatas']

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'I’d like to focus',
  'source': 1},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'With these tactics, scrappy',
  'source': 2},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-272/',
  'article_title': 'AI Giants Go Nuclear, A Tech Bromance Turns Turbulent, Mistral Sharpens the Edge, Cheaper Video Generation',
  'chunk_heading': 'AI Giants G

In [43]:
def _issue_sort_key(metadata):
    """Build a sort key that orders records by numeric issue number.

    Comparing the links as plain strings would rank ".../issue-99/" above
    ".../issue-282/". Comparing the trailing number as an int avoids that.
    Links that do not end in "-<digits>/" get -1, so they come last when the
    list is ordered newest-first; the link itself breaks ties.
    """
    link = metadata['article_link']
    tail = link.rstrip('/').rsplit('-', 1)[-1]
    issue_number = int(tail) if tail.isdecimal() else -1
    return issue_number, link


metadatas = vector_db['metadatas']

# Keep only records whose 'source' is 0 — presumably the first chunk of each
# article, giving one record per article (TODO confirm against the ingestion code).
first_chunk_metadatas = [metadata for metadata in metadatas if metadata['source'] == 0]

# Newest issue first.
sorted_metadatas = sorted(first_chunk_metadatas, key=_issue_sort_key, reverse=True)
sorted_metadatas

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/',
  'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/',
  'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/',
  'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas',
  'chunk_heading': 'Introduction',
  'source': 0},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-279/',
  'article_title': 'Amazon Nova’s Competitive Price/Performance, OpenAI o1 Pro’s High Price/Performance, Google’s Game Worlds on Tap, Fact

In [53]:
# Keys dropped from the per-article records.
_DROPPED_KEYS = {"source", "chunk_heading"}

# Pair each record with the date scraped at the same position. A new dict is
# built per article, so the dicts held in sorted_metadatas are left unmodified
# and re-running this cell always starts from the same inputs.
# NOTE(review): the pairing is purely positional and zip() stops at the shorter
# input — confirm both lists are ordered newest-first and cover the same issues.
final_metadatas = [
    {
        **{key: value for key, value in metadata.items() if key not in _DROPPED_KEYS},
        "article_publication_date": publication_date,
    }
    for metadata, publication_date in zip(sorted_metadatas, dates)
]
    

In [57]:
def generate_image(article_title):
    """Return the image URL to use for an article.

    Placeholder implementation: ``article_title`` is ignored and the same
    sample URL is returned for every article.
    """
    placeholder_url = "https://www.SampleImageURL.com"
    return placeholder_url

# Number of article records about to receive an image URL.
print(len(final_metadatas))

# Adds "article_image_url" to every record in place (the dicts inside
# final_metadatas are modified), then echoes the updated record.
for met in final_metadatas:
    image_url = generate_image(met['article_title'])
    met["article_image_url"] = image_url
    print(met, end = "\n\n")

16
{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/', 'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding', 'article_publication_date': 'Jan 01, 2025', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/', 'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph', 'article_publication_date': 'Dec 25, 2024', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/', 'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas', 'article_publication_date': 'Dec 18, 2024', 'article_image_url': 'https://www.SampleImageURL.com'}

{'article_link': 'https://www.deeplearning.ai/the-batch/issue-279/'

In [79]:
soup = BeautifulSoup(base_url_html, "html.parser")
# First <div> in the page whose class attribute is this exact string; the
# recorded output shows it wraps a <noscript> image.
div_class = soup.find("div", class_ = "aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200")
# Bare expression so the notebook displays the matched tag.
div_class

<div class="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200"><span style="box-sizing:border-box;display:block;overflow:hidden;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;position:absolute;top:0;left:0;bottom:0;right:0"><noscript>

In [75]:
from bs4 import BeautifulSoup
import re

# Hand-written sample of one image container, used to try the extraction on a
# small input. Unlike the tag found in base_url_html (see the div_class output),
# the <img> here sits directly inside the <span>, without a <noscript> wrapper.
html = '''<div class="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200">
    <span style="box-sizing: border-box; display: block; overflow: hidden; width: initial; height: initial; background: none; opacity: 1; border: 0px; margin: 0px; padding: 0px; position: absolute; inset: 0px;">
        <img alt="Deer training class with sleigh diagrams on a chalkboard." 
             src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" 
             srcset="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=640&amp;q=75 640w, /_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=750&amp;q=75 750w, ..."/>
    </span>
</div>'''

# Parse the sample and grab its first (here: only) <img> tag.
soup = BeautifulSoup(html, 'html.parser')
img = soup.find('img')
print(img)

# Extract the base URL from src or srcset
def extract_original_url(url_str):
    """Recover the source image URL embedded in a Next.js image link.

    Args:
        url_str: A link such as "/_next/image/?url=<percent-encoded>&w=640&q=75".

    Returns:
        The percent-decoded value that follows ``url=``, or None when
        ``url_str`` contains no ``url=``.
    """
    from urllib.parse import unquote

    # Capture up to the next '&' (this also stops at an escaped '&amp;') or up
    # to the end of the string, so a link whose last parameter is `url` matches too.
    match = re.search(r'url=([^&]*)', url_str)
    if match is None:
        return None
    # URL decode the matched value
    return unquote(match.group(1))

# Try getting from src first
# img.get('src') gives None when the attribute is missing, and the re.search
# call inside extract_original_url would then raise TypeError; this cell relies
# on the <img> having a src.
src = img.get('src')
original_url = extract_original_url(src)

print("Original image URL:", original_url)

<img alt="Deer training class with sleigh diagrams on a chalkboard." src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" srcset="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=640&amp;q=75 640w, /_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=750&amp;q=75 750w, ..."/>
Original image URL: https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--39--1.jpg


In [76]:
# Show the tag found in base_url_html again, for comparison with the hand-written sample.
div_class

<div class="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200"><span style="box-sizing:border-box;display:block;overflow:hidden;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;position:absolute;top:0;left:0;bottom:0;right:0"><noscript><img alt="Deer training class with sleigh diagrams on a chalkboard." data-nimg="fill" decoding="async" loading="lazy" sizes="100vw" src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" srcset="/_next/image/?url=https%3A

In [77]:
# Check what kind of object soup.find returned.
type(div_class)

bs4.element.Tag

In [80]:
# Indented rendering makes the nesting (div > span > noscript > img) easier to see.
print(div_class.prettify())

<div class="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200">
 <span style="box-sizing:border-box;display:block;overflow:hidden;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;position:absolute;top:0;left:0;bottom:0;right:0">
  
  <noscript>
   <img alt="Deer training class with sleigh diagrams on a chalkboard." data-nimg="fill" decoding="async" loading="lazy" sizes="100vw" src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" srcset="/_next/image/?

In [81]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(base_url_html, "html.parser")
# First <div> in the page whose class attribute is this exact string.
html = soup.find("div", class_ = "aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200")

print(html)
# NOTE(review): this searches the whole document rather than the div stored in
# `html`, so it yields the first <img> anywhere on the page — confirm that is intended.
img = soup.find('img')
print(img)

# Extract the base URL from src or srcset
def extract_original_url(url_str):
    """Return the decoded ``url`` value carried by a Next.js image link.

    Args:
        url_str: Link text, e.g. "/_next/image/?url=<percent-encoded>&w=640&q=75".

    Returns:
        The percent-decoded text following ``url=``, or None if ``url=`` does
        not occur in ``url_str``.
    """
    from urllib.parse import unquote

    # Look for the URL parameter in the Next.js image URL. The value runs until
    # the next '&' or the end of the string, so `url` may also be the last parameter.
    match = re.search(r'url=([^&]*)', url_str)
    if match is None:
        return None
    # URL decode the matched value
    return unquote(match.group(1))

# Try getting from src first
# Both steps assume success: `img` must not be None and must carry a src,
# otherwise .get or the regex search inside extract_original_url raises.
src = img.get('src')
original_url = extract_original_url(src)

print("Original image URL:", original_url)

<div class="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200"><span style="box-sizing:border-box;display:block;overflow:hidden;width:initial;height:initial;background:none;opacity:1;border:0;margin:0;padding:0;position:absolute;top:0;left:0;bottom:0;right:0"><noscript><img alt="Deer training class with sleigh diagrams on a chalkboard." data-nimg="fill" decoding="async" loading="lazy" sizes="100vw" src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" srcset="/_next/image/?url=https%3A

In [83]:
from bs4 import BeautifulSoup
import re
from urllib.parse import unquote

def extract_original_url(url_str):
    """Return the decoded ``url`` value carried by a Next.js image link.

    Args:
        url_str: Link text, e.g. "/_next/image/?url=<percent-encoded>&w=640&q=75".

    Returns:
        The percent-decoded text following ``url=``, or None if ``url=`` does
        not occur in ``url_str``.
    """
    # The value ends at the next '&' or at the end of the string, so the
    # parameter is found even when nothing follows it.
    match = re.search(r'url=([^&]*)', url_str)
    if match is None:
        return None
    return unquote(match.group(1))

# Parse the HTML
soup = BeautifulSoup(base_url_html, "html.parser")

# Start from "not found" so every path below leaves original_url defined,
# including an <img> that has neither src nor srcset.
original_url = None

# Look for an image inside the first <noscript> tag of the document.
first_noscript = soup.find('noscript')
# soup.find returns None when the page has no <noscript>; guard before chaining .find.
noscript_img = first_noscript.find('img') if first_noscript is not None else None
print(noscript_img)

if noscript_img:
    # Prefer src; fall back to the first candidate listed in srcset.
    src = noscript_img.get('src')
    if src:
        original_url = extract_original_url(src)
    else:
        srcset = noscript_img.get('srcset')
        if srcset:
            # Take the first URL from srcset
            first_srcset_url = srcset.split(',')[0].split(' ')[0]
            original_url = extract_original_url(first_srcset_url)

print("Original image URL:", original_url)

None
Original image URL: None


In [84]:
from bs4 import BeautifulSoup
import re
from urllib.parse import unquote

# Diagnostic cell: walk div > span > noscript > img one level at a time and
# print whether each level was found. Every lookup is guarded, so a miss at one
# level yields None for all later levels instead of raising.
soup = BeautifulSoup(base_url_html, "html.parser")

# Let's check what we're getting step by step
print("1. Finding div:")
div = soup.find("div", class_="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200")
print(div is not None)  # Should print True

print("\n2. Finding span inside div:")
span = div.find("span") if div else None
print(span is not None)  # Should print True

print("\n3. Finding noscript inside span:")
noscript = span.find("noscript") if span else None
print(noscript is not None)  # Should print True

print("\n4. Full noscript content:")
print(noscript)

print("\n5. Finding img inside noscript:")
noscript_img = noscript.find("img") if noscript else None
print(noscript_img)

# Only inspect the attributes when an <img> was actually found.
if noscript_img:
    src = noscript_img.get('src')
    srcset = noscript_img.get('srcset')
    print("\n6. src attribute:", src)
    print("\n7. srcset attribute:", srcset)

1. Finding div:
True

2. Finding span inside div:
True

3. Finding noscript inside span:
True

4. Full noscript content:
<noscript><img alt="Deer training class with sleigh diagrams on a chalkboard." data-nimg="fill" decoding="async" loading="lazy" sizes="100vw" src="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=3840&amp;q=75" srcset="/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=640&amp;q=75 640w, /_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=750&amp;q=75 750w, /_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=828&amp;q=75 828w, /_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%2Fimages%2F2024%2F12%2Funnamed--39--1.jpg&amp;w=1080&amp;q=75 1080w, /_next/image/?url=https%3A%2F%2Fdl-stagin

In [85]:
def extract_image_url(html):
    """Return the decoded source URL of the first Next.js image found in ``html``.

    Every <img> is visited in document order. For the first one whose
    ``srcset`` mentions "/_next/image/" and whose first srcset entry carries a
    ``url=`` parameter, that parameter is percent-decoded and returned.

    Note: this takes whichever image comes first in the document; on the
    landing page the recorded result is the site logo, not an article image.

    Args:
        html: Markup to scan; parsed with "html.parser".

    Returns:
        The decoded URL string, or None when no image qualifies.
    """
    soup = BeautifulSoup(html, "html.parser")

    for img in soup.find_all('img'):
        srcset = img.get('srcset')
        if not srcset or '/_next/image/' not in srcset:
            continue
        # Take the first URL from srcset ("<url> <width>w, <url> <width>w, ...").
        first_srcset_url = srcset.split(',')[0].split(' ')[0]
        # The value ends at the next '&' or at the end of the string, so the
        # parameter is found even when it is the last one.
        match = re.search(r'url=([^&]*)', first_srcset_url)
        if match:
            return unquote(match.group(1))

    return None

# Run the whole-page scan; the recorded output shows it picks up the site logo.
original_url = extract_image_url(base_url_html)
print("Original image URL:", original_url)

Original image URL: /_next/static/media/the-batch-logo.0b7c10a2.png


### This is the final implementation of extracting the images from the articles

In [86]:
from bs4 import BeautifulSoup
import re
from urllib.parse import unquote

# Walk div > span > noscript > img one level at a time; each step becomes None
# when the previous level was not found, so noscript_img is either a tag or None.
soup = BeautifulSoup(base_url_html, "html.parser")
div = soup.find("div", class_="aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200")
span = div.find("span") if div else None
noscript = span.find("noscript") if span else None
noscript_img = noscript.find("img") if noscript else None

def extract_original_url(url_str):
    """Return the decoded ``url`` value carried by a Next.js image link.

    Args:
        url_str: Link text, e.g. "/_next/image/?url=<percent-encoded>&w=640&q=75".

    Returns:
        The percent-decoded text following ``url=``, or None if ``url=`` does
        not occur in ``url_str``.
    """
    # Handles both '&' and '&amp;' (each begins with '&'), and additionally a
    # link that ends right after the url value.
    match = re.search(r'url=([^&]*)', url_str)
    if match is None:
        return None
    return unquote(match.group(1))

# Report the decoded image URL when the <img> was located, otherwise say so.
if noscript_img:
    # NOTE(review): src is passed on unchecked; a missing src attribute (None)
    # would make the regex search in extract_original_url raise TypeError.
    src = noscript_img.get('src')
    original_url = extract_original_url(src)
    print("Original image URL:", original_url)
else:
    print("Could not find the image URL")

Original image URL: https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--39--1.jpg


#### Extracting the featured article's image and the images of the other articles

In [None]:
def extract_featured_article_image_url(html):
    """Return the source image URL of the featured article, or "" if not found.

    The image is taken from the first <img> nested in a <noscript> under a
    <div> with class ``aspect-w-16``; this notebook treats that match as the
    featured article's image. Its ``src`` is a Next.js image link whose
    ``url`` parameter holds the percent-encoded source URL.

    Args:
        html: Markup of the landing page; parsed with "html.parser".

    Returns:
        str: The decoded image URL. An empty string is returned (and a message
        printed) when the <img>, its ``src``, or the ``url=`` parameter is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    noscript_img = soup.select_one("div.aspect-w-16 noscript img")

    src = noscript_img.get('src') if noscript_img else None
    # Extract everything between 'url=' and the next '&' / '&amp;' (or the end
    # of the string). A src without 'url=' gives no match and is reported as
    # not found instead of raising AttributeError on None.
    match = re.search(r'url=([^&]*)', src) if src else None
    if match is None:
        print("Could not find the featured image URL")
        return ""

    original_url = unquote(match.group(1))
    print("Featured image URL:", original_url)
    return original_url
    
def extract_original_url(url_str):
    """Return the decoded ``url`` value carried by a Next.js image link.

    Args:
        url_str: Link text, e.g. "/_next/image/?url=<percent-encoded>&w=640&q=75".

    Returns:
        The percent-decoded text following ``url=``, or None if ``url=`` does
        not occur in ``url_str``.
    """
    # The value runs until the next '&' (which also covers '&amp;') or the end
    # of the string, so a link ending with the url value is handled too.
    match = re.search(r'url=([^&]*)', url_str)
    if match is None:
        return None
    return unquote(match.group(1))

def extract_images_urls(html):
    """Collect the article image URLs found in ``html``, featured article first.

    The first element is whatever extract_featured_article_image_url returns
    (possibly ""). After it come the decoded URLs of every article card whose
    div > span > noscript > img chain is complete and whose ``src`` yields a
    URL; cards that miss any link of that chain are skipped.

    Args:
        html: Markup of the landing page; parsed with "html.parser".

    Returns:
        list[str]: Image URLs in the order described above.
    """
    card_class = "aspect-w-16 aspect-h-9 rounded-t-lg overflow-hidden bg-slate-200"

    soup = BeautifulSoup(html, "html.parser")
    image_urls = [extract_featured_article_image_url(html)]

    for card in soup.find_all("div", class_=card_class):
        span = card.find("span")
        if not span:
            continue
        noscript = span.find("noscript")
        if not noscript:
            continue
        img = noscript.find("img")
        if not (img and img.get('src')):
            continue
        url = extract_original_url(img.get('src'))
        if url:
            # The <img> also carries an alt text (img.get('alt', '')) should a
            # description be wanted alongside the URL later.
            image_urls.append(url)
    return image_urls

In [97]:
# Collect every article image URL from the landing page (featured article first) and display the list.
image_urls = extract_images_urls(base_url_html)
image_urls

Featured image URL: https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--35--1.png


['https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--35--1.png',
 'https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--39--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--32--1.png',
 'https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--38--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--25--1.png',
 'https://dl-staging-website.ghost.io/content/images/2024/11/unnamed--35--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/11/Captura-de-pantalla-2024-11-20-a-la-s--2.49.16-p.-m.-1.png',
 'https://dl-staging-website.ghost.io/content/images/2024/11/unnamed--33--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/11/unnamed--32--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/10/unnamed--27--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/10/unnamed--23--1.jpg',
 'https://dl-staging-website.ghost.io/content/images/2024/

In [98]:
# Count the URLs; this list is paired position-by-position with final_metadatas further down.
len(image_urls)

16

In [91]:
# Records before the real image URLs are attached (the output still shows the sample URL).
final_metadatas

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/',
  'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding',
  'article_publication_date': 'Jan 01, 2025',
  'article_image_url': 'https://www.SampleImageURL.com'},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/',
  'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph',
  'article_publication_date': 'Dec 25, 2024',
  'article_image_url': 'https://www.SampleImageURL.com'},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/',
  'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas',
  'article_publication_date': 'Dec 18, 2024',
  'article_image_url': 'https://www.SampleImageURL.com'},
 {'article_link': 'https://www.deeplearning.ai/th

In [103]:
# Overwrite each record's image URL with the scraped one at the same position.
# The dicts are updated in place, so new_met holds the very same objects as
# final_metadatas; zip() stops at the shorter of the two lists.
new_met = []
for article_metadata, scraped_image_url in zip(final_metadatas, image_urls):
    article_metadata['article_image_url'] = scraped_image_url
    new_met.append(article_metadata)

In [104]:
# Final per-article records, now carrying the scraped image URLs.
new_met

[{'article_link': 'https://www.deeplearning.ai/the-batch/issue-282/',
  'article_title': 'Happy New Year! Hopes For 2025 With Mustafa Suleyman, Audrey Tang, Albert Gu, Hanno Basse, Joseph Gonzalez, David Ding',
  'article_publication_date': 'Jan 01, 2025',
  'article_image_url': 'https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--35--1.png'},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-281/',
  'article_title': 'Top AI Stories of 2024! Agents Rise, Prices Fall, Models Shrink, Video Takes Off, Acquisitions Morph',
  'article_publication_date': 'Dec 25, 2024',
  'article_image_url': 'https://dl-staging-website.ghost.io/content/images/2024/12/unnamed--39--1.jpg'},
 {'article_link': 'https://www.deeplearning.ai/the-batch/issue-280/',
  'article_title': 'Phi-4 Breaks Size Barrier, HunyuanVideo Narrows Open Source Gap, Gemini 2.0 Flash Accelerates Multimodal Modeling, LLMs Propose Research Ideas',
  'article_publication_date': 'Dec 18, 2024',
  'article_im