In [1]:
from substack_api import Post, Newsletter, User
from datetime import datetime, timezone
from pathlib import Path
import shutil

In [2]:
directory = Path('post_content')

for item in directory.iterdir():
    if item.is_file() or item.is_symlink():
        item.unlink()
    elif item.is_dir():
        shutil.rmtree(item)

In [3]:
username = "ivankching"
date_str = "2025-10-19T00:00:00.000Z"

In [4]:
user = User(username)
user

User(username=ivankching)

In [5]:
user.get_subscriptions()

[{'publication_id': 1174659,
  'publication_name': 'Ahead of AI',
  'domain': 'magazine.sebastianraschka.com',
  'membership_state': 'free_signup'},
 {'publication_id': 4083837,
  'publication_name': 'The AI Engineer',
  'domain': 'www.aimlengineer.io',
  'membership_state': 'free_signup'},
 {'publication_id': 1560026,
  'publication_name': 'Andrej’s Substack',
  'domain': 'karpathy.substack.com',
  'membership_state': 'free_signup'},
 {'publication_id': 289327,
  'publication_name': 'Deep Learning Weekly',
  'domain': 'www.deeplearningweekly.com',
  'membership_state': 'free_signup'},
 {'publication_id': 48041,
  'publication_name': 'Gwern.net Newsletter',
  'domain': 'gwern.substack.com',
  'membership_state': 'free_signup'},
 {'publication_id': 18908,
  'publication_name': 'Monomythical',
  'domain': 'nayafia.substack.com',
  'membership_state': 'free_signup'},
 {'publication_id': 1178062,
  'publication_name': 'NLP News',
  'domain': 'newsletter.ruder.io',
  'membership_state': 'fr

In [6]:
newsletters = {}
for d in user.get_subscriptions():
    newsletters[d["publication_name"]] = Newsletter(f"https://{d["domain"]}")

In [7]:
newsletter = newsletters["Sustainability by numbers"]

In [8]:
posts = newsletter.get_posts(sorting='new', limit=7)

In [9]:
posts

[Post(url=https://www.sustainabilitybynumbers.com/p/iea-current-policies-scenario),
 Post(url=https://www.sustainabilitybynumbers.com/p/global-carbon-emissions-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/amazon-deforestation-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/artificial-intelligence-could-dramatically),
 Post(url=https://www.sustainabilitybynumbers.com/p/usa-electricity-growth),
 Post(url=https://www.sustainabilitybynumbers.com/p/eliminating-contrails),
 Post(url=https://www.sustainabilitybynumbers.com/p/food-projections-2025)]

In [10]:
post = posts[-1]
post.get_metadata()["post_date"]

'2025-09-22T05:45:59.562Z'

In [11]:
post.get_metadata()

{'audience': 'everyone',
 'audience_before_archived': None,
 'canonical_url': 'https://www.sustainabilitybynumbers.com/p/food-projections-2025',
 'default_comment_sort': None,
 'editor_v2': False,
 'exempt_from_archive_paywall': False,
 'free_unlock_required': False,
 'id': 174220389,
 'podcast_art_url': None,
 'podcast_duration': None,
 'podcast_preview_upload_id': None,
 'podcast_upload_id': None,
 'podcast_url': None,
 'post_date': '2025-09-22T05:45:59.562Z',
 'updated_at': '2025-09-22T05:46:20.278Z',
 'publication_id': 1199196,
 'search_engine_description': None,
 'search_engine_title': None,
 'section_id': None,
 'should_send_free_preview': False,
 'show_guest_bios': True,
 'slug': 'food-projections-2025',
 'social_title': 'The world is on track for record harvests this year',
 'subtitle': 'What do the latest projections expect for global food production and yields of different crops?',
 'teaser_post_eligible': True,
 'title': 'The world is on track for record harvests this year',

In [12]:
from substack import download_substack_posts

In [13]:
download_substack_posts(username, date_str)

INFO:root:Added newsletter: Ahead of AI
INFO:root:Added newsletter: The AI Engineer
INFO:root:Added newsletter: Andrej’s Substack
INFO:root:Added newsletter: Deep Learning Weekly
INFO:root:Added newsletter: Gwern.net Newsletter
INFO:root:Added newsletter: Monomythical
INFO:root:Added newsletter: NLP News
INFO:root:Added newsletter: Noahpinion
INFO:root:Added newsletter: The Substack Post
INFO:root:Added newsletter: Sustainability by numbers
INFO:root:Added post beyond-standard-llms from newsletter Ahead of AI
INFO:root:{'status': 'success', 'message': 'Posts downloaded successfully'}
INFO:root:{'status': 'success', 'message': 'Posts downloaded successfully'}
INFO:root:{'status': 'success', 'message': 'Posts downloaded successfully'}
INFO:root:Added post deep-learning-weekly-issue-431 from newsletter Deep Learning Weekly
INFO:root:Added post deep-learning-weekly-issue-430 from newsletter Deep Learning Weekly
INFO:root:Added post deep-learning-weekly-issue-429 from newsletter Deep Learni

{'status': 'success', 'message': 'Posts downloaded successfully'}

In [14]:
newsletter = newsletters["Ahead of AI"]
posts = newsletter.get_posts(sorting='new', limit=30)

In [15]:
post = posts[22]

In [16]:
post_date = datetime.strptime(post.get_metadata()["post_date"], "%Y-%m-%dT%H:%M:%S.%f%z")

In [17]:
post_date

datetime.datetime(2024, 5, 12, 11, 2, 46, 703000, tzinfo=datetime.timezone.utc)

In [18]:
dt = datetime.strptime(date_str,  "%Y-%m-%dT%H:%M:%S.%f%z")
dt

datetime.datetime(2025, 10, 19, 0, 0, tzinfo=datetime.timezone.utc)

In [19]:
from html_reader import batch_convert, clean_and_convert, clean_whitespaces_markdown
from bs4 import BeautifulSoup

In [20]:
metadata = post.get_metadata()
html = metadata["body_html"]
soup = BeautifulSoup(html, 'html.parser')

In [21]:
header = soup.new_tag('header')
h1 = soup.new_tag('h1')
h1.string = metadata["title"]
h2 = soup.new_tag('p')
h2.string = metadata["subtitle"]
header.append(h1)
header.append(h2)
soup.insert(0, header)

[<header><h1>How Good Are the Latest Open LLMs? And Is DPO Better Than PPO?</h1><p>Discussing the Latest Model Releases and AI Research in April 2024</p></header>]

In [22]:
soup

<header><h1>How Good Are the Latest Open LLMs? And Is DPO Better Than PPO?</h1><p>Discussing the Latest Model Releases and AI Research in April 2024</p></header><p>April 2024, what a month! My birthday, a <a href="https://www.amazon.com/Machine-Learning-AI-Essential-Questions/dp/1718503768">new book release</a>, spring is finally here, and four major open LLM releases: Mixtral, Meta AI's Llama 3, Microsoft's Phi-3, and Apple's OpenELM.</p><p>This article reviews and discusses all four major transformer-based LLM model releases that have been happening in the last few weeks, followed by new research on reinforcement learning with human feedback methods for instruction finetuning using PPO and DPO algorithms.</p><p>1. How Good are Mixtral, Llama 3, and Phi-3?</p><p>2. OpenELM: An Efficient Language Model Family with Open-source Training and Inference Framework</p><p>3. Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study</p><p>4. Other Interesting Research Papers In April</p><

In [23]:
name = "Ahead of AI"
Path(f'post_content/{name}').mkdir(parents=True, exist_ok=True)
                
with open(f"post_content/{name}/{metadata["slug"]}.html", 'w', encoding='utf-8') as f:
    f.write(str(soup))

In [24]:
filepath = Path(f"post_content/{name}/{metadata["slug"]}.html")

In [25]:
clean_and_convert(filepath)

'Discussing the Latest Model Releases and AI Research in April 2024\n\nApril 2024, what a month! My birthday, a new book release, spring is finally here, and four major open LLM releases: Mixtral, Meta AI\'s Llama 3, Microsoft\'s Phi-3, and Apple\'s OpenELM.\n\nThis article reviews and discusses all four major transformer-based LLM model releases that have been happening in the last few weeks, followed by new research on reinforcement learning with human feedback methods for instruction finetuning using PPO and DPO algorithms.\n\n1. How Good are Mixtral, Llama 3, and Phi-3?\n\n2. OpenELM: An Efficient Language Model Family with Open-source Training and Inference Framework\n\n3. Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study\n\n4. Other Interesting Research Papers In April\n\nFirst, let\'s start with the most prominent topic: the new major LLM releases this month. This section will briefly cover Mixtral, Llama 3, and Phi-3, which have been accompanied by short blog post

In [26]:
batch_convert('post_content')

INFO:html_reader:Found 31 HTML files
INFO:html_reader:Successfully converted 31/31 files


In [27]:
newsletter = newsletters["Sustainability by numbers"]
posts = newsletter.get_posts(sorting='new', limit=30)

In [28]:
posts

[Post(url=https://www.sustainabilitybynumbers.com/p/iea-current-policies-scenario),
 Post(url=https://www.sustainabilitybynumbers.com/p/global-carbon-emissions-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/amazon-deforestation-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/artificial-intelligence-could-dramatically),
 Post(url=https://www.sustainabilitybynumbers.com/p/usa-electricity-growth),
 Post(url=https://www.sustainabilitybynumbers.com/p/eliminating-contrails),
 Post(url=https://www.sustainabilitybynumbers.com/p/food-projections-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/clearing-the-air-published-uk),
 Post(url=https://www.sustainabilitybynumbers.com/p/the-electrotech-revolution),
 Post(url=https://www.sustainabilitybynumbers.com/p/solar-wind-speed-rollout),
 Post(url=https://www.sustainabilitybynumbers.com/p/ai-footprint-august-2025),
 Post(url=https://www.sustainabilitybynumbers.com/p/electric-car-battery-degradation),
 Post(url=https:/

In [29]:
post = posts[0]
metadata = post.get_metadata()

In [30]:
metadata['body_html']

'<p>“Yes” was what many media outlets reported last week.</p><p>Take a look at the headline clippings below (from the <a href="https://www.ft.com/content/8696254d-1873-434a-96be-88aa690f9b75">Financial Times</a>, <a href="https://www.wsj.com/articles/ieas-revived-policy-outlook-sees-no-peak-in-oil-gas-demand-this-decade-50939014">Wall Street Journal</a>, <a href="https://www.offshore-technology.com/news/iea-oil-and-gas-consumption-prediction/">Offshore Technology</a>, and <a href="https://www.telegraphindia.com/business/iea-projects-oil-demand-to-rise-till-2050-with-india-leading-global-consumption-growth-prnt/cid/2132616">Telegraph India</a>). There were <a href="https://www.bloomberg.com/news/articles/2025-11-12/iea-reinstates-bullish-oil-demand-growth-scenario-in-key-report">many more</a>.</p><p>These refer to the International Energy Agency’s newly published “<a href="https://www.iea.org/reports/world-energy-outlook-2025">World Energy Outlook 2025</a>”. In this report, the IEA proj

In [31]:
metadata.keys()

dict_keys(['audience', 'audience_before_archived', 'canonical_url', 'default_comment_sort', 'editor_v2', 'exempt_from_archive_paywall', 'free_unlock_required', 'id', 'podcast_art_url', 'podcast_duration', 'podcast_preview_upload_id', 'podcast_upload_id', 'podcast_url', 'post_date', 'updated_at', 'publication_id', 'search_engine_description', 'search_engine_title', 'section_id', 'should_send_free_preview', 'show_guest_bios', 'slug', 'social_title', 'subtitle', 'teaser_post_eligible', 'title', 'type', 'video_upload_id', 'write_comment_permissions', 'meter_type', 'live_stream_id', 'is_published', 'restacks', 'reactions', 'top_exclusions', 'pins', 'section_pins', 'has_shareable_clips', 'previous_post_slug', 'next_post_slug', 'cover_image', 'cover_image_is_square', 'cover_image_is_explicit', 'videoUpload', 'podcastFields', 'podcastUpload', 'podcastPreviewUpload', 'voiceover_upload_id', 'voiceoverUpload', 'has_voiceover', 'description', 'body_html', 'truncated_body_text', 'wordcount', 'postT

In [32]:
metadata['title']

'Will oil and gas consumption keep rising through 2050?'

In [33]:
metadata['type']

'newsletter'

In [34]:
header = f"""<header>
<h1>{metadata['title']}</h1>
<p>{metadata['subtitle']}</p
</header>
"""

In [35]:
html_out = header + metadata['body_html']

In [36]:
html_out

'<header>\n<h1>Will oil and gas consumption keep rising through 2050?</h1>\n<p>Unpacking some of the assumptions behind the IEA\'s policy scenarios.</p\n</header>\n<p>“Yes” was what many media outlets reported last week.</p><p>Take a look at the headline clippings below (from the <a href="https://www.ft.com/content/8696254d-1873-434a-96be-88aa690f9b75">Financial Times</a>, <a href="https://www.wsj.com/articles/ieas-revived-policy-outlook-sees-no-peak-in-oil-gas-demand-this-decade-50939014">Wall Street Journal</a>, <a href="https://www.offshore-technology.com/news/iea-oil-and-gas-consumption-prediction/">Offshore Technology</a>, and <a href="https://www.telegraphindia.com/business/iea-projects-oil-demand-to-rise-till-2050-with-india-leading-global-consumption-growth-prnt/cid/2132616">Telegraph India</a>). There were <a href="https://www.bloomberg.com/news/articles/2025-11-12/iea-reinstates-bullish-oil-demand-growth-scenario-in-key-report">many more</a>.</p><p>These refer to the Internat