## Web Scrape Episodes from Scientific American’s Science Quickly Podcast

This notebook is designed to scrape episode information from the Science Quickly Podcast, available at Scientific American’s website: https://www.scientificamerican.com/podcast/science-quickly/

The notebook retrieves:

- Episode title
- Release date
- Category
- Summary
- Authors

The data is saved in a CSV file.

### 1. Set up

In [41]:
## importing library packages
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

In [42]:
# Firefox options: run without opening a visible browser window.
# The `options.headless = True` property setter was deprecated and later
# removed in Selenium 4; on versions without it the assignment just sets an
# unused Python attribute. Passing the command-line flag works either way.
options = Options()
options.add_argument("-headless")  # Optional, remove this line to watch the browser

# Create a Service object to specify the geckodriver path.
# Set the GECKODRIVER_PATH environment variable to point at your own
# geckodriver binary; the original hardcoded location is kept as the default.
service = Service(os.environ.get("GECKODRIVER_PATH", "/Users/julieliu/geckodriver"))

### 2. Scrape the episodes

This step takes around 4 minutes to run.

In [None]:
# Scrape settings
LAST_PAGE = 50            # listing pages 1..LAST_PAGE (inclusive) are visited
ELEMENT_WAIT_SECONDS = 5  # how long Selenium keeps polling for an element lookup
PAGE_DELAY_SECONDS = 2    # pause between pages to avoid overwhelming the server


def _extract_episode(episode):
    """Read one episode card and return its fields as a dict.

    `episode` is a Selenium element for a single episode article. Any
    exception raised by a failed sub-element lookup propagates to the caller.
    """
    # Extract the title
    title = episode.find_element(By.CSS_SELECTOR, 'h2.articleTitle-mtY5p').text

    # Extract the summary
    summary = episode.find_element(By.CSS_SELECTOR, 'div.dek-KweYs p').text

    # Extract the authors
    authors = episode.find_element(By.CSS_SELECTOR, 'p.authors-NCGt1').text

    # Extract the category (e.g., "Spacecraft"); take only the first line
    category = episode.find_element(By.CSS_SELECTOR, 'div.kicker-EEaW-').text.split('\n')[0]

    # Extract the date (e.g., "September 13, 2024")
    date = episode.find_element(By.CSS_SELECTOR, 'span.kickerMeta-0zV3t').text

    return {
        "Title": title,
        "Date": date,
        "Category": category,
        "Summary": summary,
        "Authors": authors
    }


# Generate the list of URLs for pages 1 to LAST_PAGE
url_list = [f"https://www.scientificamerican.com/podcast/science-quickly/?page={i}" for i in range(1, LAST_PAGE + 1)]

# create a list to store the extracted information (one dict per episode)
data = []

# Initialize the WebDriver
driver = webdriver.Firefox(service=service, options=options)

try:
    # The implicit wait is a session-wide setting that applies to every
    # element lookup, so it is configured once instead of once per page.
    driver.implicitly_wait(ELEMENT_WAIT_SECONDS)

    # Loop through each URL
    for url in url_list:
        try:
            # Visit the page
            driver.get(url)
            print(f"Scraping URL: {url}")

            # Find all articles representing episodes on the page
            episodes = driver.find_elements(By.CSS_SELECTOR, 'article.article-pFLe7')

            # Extract each episode; a failure skips that episode only
            for episode in episodes:
                try:
                    data.append(_extract_episode(episode))
                except Exception as e:
                    print(f"Error extracting episode: {e}")

            # Pause for a short time to avoid overwhelming the server
            time.sleep(PAGE_DELAY_SECONDS)

        except Exception as e:
            print(f"Error scraping page {url}: {e}")
finally:
    # Close the browser even if the run is interrupted (e.g. KeyboardInterrupt),
    # so no Firefox process is left behind.
    driver.quit()

### 3. Store the results

In [None]:
# Convert the list of dictionaries to a pandas DataFrame.
# The columns are given explicitly so the frame (and the CSV header) has the
# same layout even when the scrape returned no episodes.
df = pd.DataFrame(data, columns=["Title", "Date", "Category", "Summary", "Authors"])

# Save the DataFrame to a CSV file
df.to_csv("podcast_episodes.csv", index=False)

# Report how many rows were written, then preview the first few with the
# notebook's rich display instead of dumping the whole frame as text.
print(f"Saved {len(df)} episodes to podcast_episodes.csv")
df.head()