In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the Substack page to scrape
URL = "https://fintechradar.substack.com"

def scrape_substack(url):
    # Send a GET request to the page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract all post links using the correct data attribute
    posts = soup.find_all("a", {"data-testid": "post-preview-title"})

    if not posts:
        print("No posts found on the page.")
        return

    print("Recent posts:")
    for post in posts:
        title = post.get_text(strip=True)
        link = post["href"]
        print(f"- {title}: {link}")

# Run the scraper
scrape_substack(URL)


No posts found on the page.


In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# Set up Selenium with Chrome
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode (no browser UI)
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def scrape_substack():
    url = "https://fintechradar.substack.com"
    driver = setup_driver()

    # Open the Substack page
    driver.get(url)
    time.sleep(5)  # Wait for the page to fully load

    # Find all post links using their anchor tag structure
    posts = driver.find_elements(By.XPATH, '//a[@data-testid="post-preview-title"]')

    if not posts:
        print("No posts found on the page.")
    else:
        print("Recent posts:")
        for post in posts:
            print(post)
            title = post.text
            link = post.get_attribute("href")
            print(f"- {title}: {link}")

    driver.quit()

# Run the scraper
scrape_substack()


Recent posts:
<selenium.webdriver.remote.webelement.WebElement (session="df6bf1641bb1c9bcb27d89b02f309fac", element="f.9F7226D87AFFED9A162424820A9C2272.d.07D44067A4E3430B8CB002FC72ED3FB8.e.13")>
- : https://fintechradar.substack.com/p/welcome-to-fintech-radar
<selenium.webdriver.remote.webelement.WebElement (session="df6bf1641bb1c9bcb27d89b02f309fac", element="f.9F7226D87AFFED9A162424820A9C2272.d.07D44067A4E3430B8CB002FC72ED3FB8.e.14")>
- : https://fintechradar.substack.com/p/issue-132-banked-acquires-waave-plaid
<selenium.webdriver.remote.webelement.WebElement (session="df6bf1641bb1c9bcb27d89b02f309fac", element="f.9F7226D87AFFED9A162424820A9C2272.d.07D44067A4E3430B8CB002FC72ED3FB8.e.15")>
- : https://fintechradar.substack.com/p/issue-131hsbcs-embedded-finance-bet
<selenium.webdriver.remote.webelement.WebElement (session="df6bf1641bb1c9bcb27d89b02f309fac", element="f.9F7226D87AFFED9A162424820A9C2272.d.07D44067A4E3430B8CB002FC72ED3FB8.e.16")>
- : https://fintechradar.substack.com/p/iss

In [13]:
import requests
from bs4 import BeautifulSoup
import csv
import json

def scrape_fintech_radar():
    url = "https://fintechradar.substack.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    posts = []
    post_elements = soup.find_all('div', class_='post-preview')
    
    if not post_elements:
        print("No post elements found. Dumping page content for debugging:")
        print(soup.prettify())
        return posts

    for post in post_elements:
        try:
            title_element = post.find('h3', class_='post-preview-title')
            date_element = post.find('time')
            snippet_element = post.find('p', class_='post-preview-snippet')
            link_element = post.find('a', class_='post-preview-title')

            if title_element and date_element and snippet_element and link_element:
                title = title_element.text.strip()
                date = date_element['datetime']
                snippet = snippet_element.text.strip()
                link = link_element['href']

                posts.append({
                    'title': title,
                    'date': date,
                    'snippet': snippet,
                    'link': url + link if not link.startswith('http') else link
                })
            else:
                print(f"Incomplete post data found: {post}")
        except Exception as e:
            print(f"Error processing post: {e}")

    return posts

def save_to_csv(posts, filename='fintech_radar_posts.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'date', 'snippet', 'link'])
        writer.writeheader()
        for post in posts:
            writer.writerow(post)

def save_to_json(posts, filename='fintech_radar_posts.json'):
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(posts, file, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    posts = scrape_fintech_radar()
    if posts:
        save_to_csv(posts)
        save_to_json(posts)
        print(f"Scraped {len(posts)} posts and saved to CSV and JSON files")
    else:
        print("No posts were scraped. Please check the debugging output above.")

No post elements found. Dumping page content for debugging:
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="24usqpep0ejc5w6hod3dulxwciwp0djs6c6ufp96av3t4whuxovj72wfkdjxu82yacb7430qjm8adbd5ezlt4592dq4zrvadcn9j9n-0btgdzpiojfzno16-fnsnu7xd" name="norton-safeweb-site-verification"/>
  <link href="https://substackcdn.com" rel="preconnect"/>
  <link as="style" href="https://substackcdn.com/bundle/theme/welcome.9c94859edd9cbfd54c1a.css" rel="preload"/>
  <link as="style" href="https://substackcdn.com/bundle/theme/color_links.33eb7c3bd37d78bc2dc3.css" rel="preload"/>
  <link href="https://substackcdn.com/bundle/assets/entry-c64f5271.css" rel="stylesheet" type="text/css"/>
  <link href="https://substackcdn.com/bundle/assets/responsive_img-51b02764.css" rel="stylesheet" type="text/css"/>
  <link href="https://substackcdn.com/bundle/assets/FlexBox-75e78ab2.css" rel="stylesheet" type="text/css"/>
  <link href="https://substackcdn.com/bundle/assets/IntroPopup-a01

In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://fintechradar.substack.com"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the title
title = soup.find('title').text.strip() if soup.find('title') else 'No title found'

# Extract the description
description = soup.find('meta', attrs={'name': 'description'})
description = description['content'] if description else 'No description found'

# Try to find post elements (this might not work if the structure has changed)
posts = soup.find_all('div', class_='post-preview')

# If no posts are found, try to extract any text content
if not posts:
    content = soup.get_text(separator='\n', strip=True)
    paragraphs = [p for p in content.split('\n') if p and len(p) > 50]  # Filter out short lines
    
    # Create a DataFrame with the available information
    df = pd.DataFrame({
        'Title': [title],
        'Description': [description],
        'Content': ['\n\n'.join(paragraphs[:5])]  # Include the first 5 longer paragraphs
    })
else:
    # If posts are found, extract their information (this is the original approach)
    titles = []
    dates = []
    excerpts = []
    for post in posts:
        titles.append(post.find('h3', class_='post-title').text.strip())
        dates.append(post.find('time')['datetime'])
        excerpts.append(post.find('p', class_='post-preview-excerpt').text.strip())
    
    df = pd.DataFrame({
        'Title': titles,
        'Date': dates,
        'Excerpt': excerpts
    })

# Display the DataFrame
print(df)

# Optionally, save to CSV
df.to_csv('fintech_radar_content.csv', index=False)

                                  Title  \
0  Fintech Radar | Alan Tsen | Substack   

                                         Description  \
0  Fintech Radar is a weekly missive about all th...   

                                             Content  
0  Fintech Radar is a weekly missive about all th...  


In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://fintechradar.substack.com/p/issue-132-banked-acquires-waave-plaid"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the title
title = soup.find('h1', class_='post-title').text.strip() if soup.find('h1', class_='post-title') else 'No title found'

# Extract the subtitle
subtitle = soup.find('h3', class_='subtitle').text.strip() if soup.find('h3', class_='subtitle') else 'No subtitle found'

# Extract the date
date = soup.find('time')['datetime'] if soup.find('time') else 'No date found'

# Extract the main content
content_div = soup.find('div', class_='body markup')
content = content_div.get_text(separator='\n', strip=True) if content_div else 'No content found'

# Create a DataFrame
df = pd.DataFrame({
    'Title': [title],
    'Subtitle': [subtitle],
    'Date': [date],
    'Content': [content]
})

# Display the DataFrame
print(df)

# Optionally, save to CSV
# df.to_csv('fintech_radar_post.csv', index=False)

                                               Title  \
0  Issue #132: Banked Acquires Waave, Plaid Pushe...   

                                            Subtitle           Date  \
0  Your weekly roundup of the biggest stories and...  No date found   

                                             Content  
0  👋 Welcome back to another edition of Fintech R...  


In [36]:
print(content)

👋 Welcome back to another edition of Fintech Radar!
If you’re new,
here is a breakdown
of what you can expect from each issue.
If you missed our recent editions, you can catch up
here
. Some previous issues you might want to check out if you’re new include
“A Deep Dive Into The Cash App's Growth Machine”
,
“The Future Of Payment Initiation”
,
and
“Current: Doing It Differently”
.
Your ad could be HERE!
Fintech Radar is a must-read for fintech founders, operators, and investors. If that’s your target audience, placing an ad right
HERE
is a cost-effective way to reach them.
If this sounds like a good fit for your brand, head to our
sponsorship page
for more details and to secure your advertising slot.
Prices start at $100 per issue!
If you have any questions, reply to this email and ask away!
Find Out More
Banked Bolsters Australian Pay-by-Bank Foothold With Waave Acquisition
,
Pymnts
🏃‍♂️ The Rundown:
Banked has acquired Australian firm Waave, strengthening its presence in the Aussie ma

In [52]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata

url = "https://fintechradar.substack.com/p/issue-132-banked-acquires-waave-plaid"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the title
title = soup.find('h1', class_='post-title').text.strip() if soup.find('h1', class_='post-title') else 'No title found'

# Extract the subtitle
subtitle = soup.find('h3', class_='subtitle').text.strip() if soup.find('h3', class_='subtitle') else 'No subtitle found'

# Extract the date
date = soup.find('time')['datetime'] if soup.find('time') else 'No date found'

# Extract the main content
content_div = soup.find('div', class_='body markup')
content = content_div.get_text(separator='\n', strip=True) if content_div else 'No content found'

def extract_stories(content):
    story_pattern = r'([\w\s\'\":]+)\s*,\s*([^\n]+)\n\n🏃‍️ The Rundown:\n*(.*?)\n\nTakeaway:\n*(.*?)(?=\n\n[\w\s\'\":]+,|\Z)'
    
    stories = []
    for match in re.finditer(story_pattern, content, re.DOTALL):
        story = {
            'Title': match.group(1).strip(),
            'Source': match.group(2).strip(),
            'Rundown': match.group(3).strip(),
            'Takeaway': match.group(4).strip()
        }
        stories.append(story)

    return stories
# Extract stories from content without emojis
stories = extract_stories(content)
print(stories)
# Display extracted stories
for story in stories:
    print(story)

[]


In [54]:
print(content)

👋 Welcome back to another edition of Fintech Radar!
If you’re new,
here is a breakdown
of what you can expect from each issue.
If you missed our recent editions, you can catch up
here
. Some previous issues you might want to check out if you’re new include
“A Deep Dive Into The Cash App's Growth Machine”
,
“The Future Of Payment Initiation”
,
and
“Current: Doing It Differently”
.
Your ad could be HERE!
Fintech Radar is a must-read for fintech founders, operators, and investors. If that’s your target audience, placing an ad right
HERE
is a cost-effective way to reach them.
If this sounds like a good fit for your brand, head to our
sponsorship page
for more details and to secure your advertising slot.
Prices start at $100 per issue!
If you have any questions, reply to this email and ask away!
Find Out More
Banked Bolsters Australian Pay-by-Bank Foothold With Waave Acquisition
,
Pymnts
🏃‍♂️ The Rundown:
Banked has acquired Australian firm Waave, strengthening its presence in the Aussie ma

In [50]:
import re

def extract_stories(content):
    # Refined pattern to match each story with its components
    story_pattern = r'([\w\s\'\":]+)\s*,\s*([^\n]+)\n\n🏃‍️ The Rundown:\s*(.*?)\n\nTakeaway:\s*(.*?)(?=\n\n[\w\s\'\":]+,|\Z)'

    stories = []
    for match in re.finditer(story_pattern, content, re.DOTALL):
        story = {
            'Title': match.group(1).strip(),
            'Source': match.group(2).strip(),
            'Rundown': match.group(3).strip(),
            'Takeaway': match.group(4).strip()
        }
        stories.append(story)

    return stories

# Example usage with your content
content = """
Banked Bolsters Australian Pay-by-Bank Foothold With Waave Acquisition, Pymnts

🏃‍️ The Rundown: Banked has acquired Australian firm Waave, strengthening its presence in the Aussie market. Waave, which leverages open banking to offer a pay-by-bank option, had recently raised $4.7m in seed funding.

Takeaway: It’s still very early days for A2A/pay-by-bank in Australia—really early. According to the latest Worldpay Global Payments report, A2A e-commerce transactions make up just 4% of wallet share. In many ways, this looks like an early land grab by Banked, who must clearly see a lot of potential in the Australian market for A2A payments.

I get it. With the recently introduced laws enabling action initiation, or “write access,” to the Consumer Data Right (CDR)—after nearly three years in Parliament—and the potential shift away from the card schemes towards A2A payments, Waave and others like it are positioning themselves to be early movers.
"""
stories = extract_stories(content)

# Display extracted stories
for story in stories:
    print(story)


{'Title': 'Bank Foothold With Waave Acquisition', 'Source': 'Pymnts', 'Rundown': 'Banked has acquired Australian firm Waave, strengthening its presence in the Aussie market. Waave, which leverages open banking to offer a pay-by-bank option, had recently raised $4.7m in seed funding.', 'Takeaway': 'It’s still very early days for A2A/pay-by-bank in Australia—really early. According to the latest Worldpay Global Payments report, A2A e-commerce transactions make up just 4% of wallet share. In many ways, this looks like an early land grab by Banked, who must clearly see a lot of potential in the Australian market for A2A payments.\n\nI get it. With the recently introduced laws enabling action initiation, or “write access,” to the Consumer Data Right (CDR)—after nearly three years in Parliament—and the potential shift away from the card schemes towards A2A payments, Waave and others like it are positioning themselves to be early movers.'}
