Importing Necessary Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import os
import dateutil.parser

In [2]:
# --- Download configuration -------------------------------------------------

# RSS feed the episodes are read from, and the folder the MP3 files go into
rss_feed_url = "https://feeds.megaphone.fm/SIXMSB5088139739"
output_dir = "./podcasts_download"

# Regex patterns matched case-insensitively against episode titles; an episode
# is kept when its title matches at least one of them
keywords = [
    r"\bcareer\b",
    r"\bgrowth\b",
]

# Stop collecting once this many episodes have been selected
max_podcast = 3

# Set to False to take episodes regardless of their title
use_keywords = True

In [3]:
# Function to parse and format publication date
def parse_date(date):
    """Reformat a date string as ``DD-Mon-YY``.

    Args:
        date: Date text in any format ``dateutil.parser.parse`` understands.

    Returns:
        The parsed date rendered with ``"%d-%b-%y"``
        (day, abbreviated month name, two-digit year).
    """
    parsed = dateutil.parser.parse(date)
    return parsed.strftime("%d-%b-%y")

In [4]:
# Function to filter episodes based on keywords
def keyword_filter(title, keywords):
    """Tell whether ``title`` matches at least one of the given patterns.

    Args:
        title: Episode title to test.
        keywords: Iterable of regular-expression patterns; each is searched
            for anywhere in ``title``, ignoring case.

    Returns:
        True if any pattern is found in the title, otherwise False
        (including when ``keywords`` is empty).
    """
    return any(re.search(pattern, title, re.IGNORECASE) for pattern in keywords)

In [5]:
# extract metadata of episodes from RSS feed content
def get_episodes_metadata(feed_content, keywords=True, max_episodes=None, filter_by_keywords=None):
    """Collect ``(url, title, release_date)`` tuples for the episodes to download.

    Items are visited in feed order and collection stops as soon as the
    episode limit is reached.

    Args:
        feed_content: Raw RSS document passed to BeautifulSoup's ``"xml"`` parser.
        keywords: Iterable of regex patterns forwarded to ``keyword_filter``.
            The legacy default ``True`` (like ``None`` or any bool) carries no
            patterns, so title filtering is skipped instead of failing with a
            ``TypeError`` while trying to iterate over it.
        max_episodes: Maximum number of episodes to return. Defaults to the
            module-level ``max_podcast`` setting.
        filter_by_keywords: Whether to apply the title filter. Defaults to the
            module-level ``use_keywords`` setting.

    Returns:
        A list of ``(url, title, release_date)`` tuples, where ``release_date``
        is the ``pubDate`` text formatted by ``parse_date``.
    """
    # Fall back to the notebook-level configuration when no override is given.
    limit = max_podcast if max_episodes is None else max_episodes
    apply_filter = use_keywords if filter_by_keywords is None else filter_by_keywords

    # A bool/None "keywords" value cannot be iterated as a pattern list.
    if keywords is None or isinstance(keywords, bool):
        apply_filter = False

    soup = BeautifulSoup(feed_content, "xml")

    episode_metadata = []
    for item in soup.find_all("item"):
        # ">=" (not "==") so a limit of zero or less yields no episodes.
        if len(episode_metadata) >= limit:
            break

        title_tag = item.find("title")
        if title_tag is None:
            continue
        title = title_tag.text

        if apply_filter and not keyword_filter(title, keywords):
            continue

        # Skip items that cannot be downloaded or dated rather than aborting
        # the whole feed on one incomplete entry.
        enclosure = item.find("enclosure")
        pub_date = item.find("pubDate")
        url = enclosure.get("url") if enclosure is not None else None
        if not url or pub_date is None:
            continue

        episode_metadata.append((url, title, parse_date(pub_date.text)))

    return episode_metadata

In [6]:
# download MP3 file from URL
def get_mp3_file(url, timeout=30):
    """Download the resource at ``url`` and return the HTTP response.

    Args:
        url: Address of the MP3 file; redirects are followed.
        timeout: Seconds to wait for the server to connect/send data before
            giving up, so a stalled server cannot hang the notebook.

    Returns:
        The ``requests`` response on success, or ``None`` when the request
        fails (connection error, timeout, or an HTTP error status). The
        failure reason is printed.
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        # Treat 4xx/5xx as failures so an error page is never saved as audio;
        # HTTPError is a RequestException and is handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
    return response

In [7]:
#  save MP3 file to disk
def save_mp3_file(file, file_path):
    """Write the body of a downloaded response to ``file_path``.

    Args:
        file: Object exposing the downloaded bytes as ``.content``.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        AttributeError: If ``file`` has no ``content`` attribute (for example
            ``None`` from a failed download).
    """
    # Read the payload before opening the destination so that a missing or
    # broken response does not leave an empty file behind.
    payload = file.content
    with open(file_path, "wb") as f:
        f.write(payload)

In [8]:
#  simplify podcast title for filename
def simplify_title(title):
    """Turn an episode title into a filename-friendly string.

    Every character other than ASCII letters, digits and spaces is removed,
    and the result is cut to at most 100 characters.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', title)
    return cleaned[:100]

In [9]:
if __name__ == "__main__":

    # Fetch the RSS feed. The timeout keeps a stalled server from hanging the
    # notebook, and an HTTP error status stops the run instead of handing an
    # error page to the parser.
    feed_response = requests.get(rss_feed_url, timeout=30)
    feed_response.raise_for_status()
    feed_content = feed_response.content
    episodes_metadata = get_episodes_metadata(feed_content, keywords)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Download and save each episode
    for url, title, release_date in episodes_metadata:
        simple_title = simplify_title(title)
        file_path = os.path.join(output_dir, f"{release_date}_{simple_title}.mp3")

        file = get_mp3_file(url)
        if file is None:
            # get_mp3_file already printed the reason; move on to the next
            # episode instead of crashing on the missing response.
            print(f"Skipping {title!r}: download failed")
            continue

        save_mp3_file(file, file_path)
        print(f"{file_path} saved")
        