In [6]:
# STEP 1 — Install & Import Libraries (code cell)
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [7]:
# STEP 2 — Scrape IMAGES from BooksToScrape (code cell)

img_url = "https://books.toscrape.com/"
response = requests.get(img_url)
print("Image page status:", response.status_code)

# Parse the HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Find all <img> tags
images = soup.find_all("img")

image_urls = []

# Extract and clean each image URL
for img in images:
    src = img.get("src")
    if not src:
        continue
    full_url = img_url + src.replace("../", "")
    image_urls.append(full_url)

# Remove duplicates
image_urls = list(set(image_urls))

print("Number of image URLs found:", len(image_urls))
image_urls[:5]


Image page status: 200
Number of image URLs found: 20


['https://books.toscrape.com/media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg',
 'https://books.toscrape.com/media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg',
 'https://books.toscrape.com/media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg',
 'https://books.toscrape.com/media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg',
 'https://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg']

In [8]:
# STEP 3 — Save Image URLs into a CSV

img_df = pd.DataFrame({"image_url": image_urls})
img_df.to_csv("image_urls_books_toscrape.csv", index=False)

img_df.head()


Unnamed: 0,image_url
0,https://books.toscrape.com/media/cache/0b/bc/0...
1,https://books.toscrape.com/media/cache/be/f4/b...
2,https://books.toscrape.com/media/cache/27/a5/2...
3,https://books.toscrape.com/media/cache/68/33/6...
4,https://books.toscrape.com/media/cache/5b/88/5...


In [9]:
# STEP 4 — Scrape AUDIO FILES from SoundHelix

url_audio = "https://www.soundhelix.com/audio-examples"
response_audio = requests.get(url_audio)
soup_audio = BeautifulSoup(response_audio.text, "html.parser")

audio_links = []

for a in soup_audio.find_all("a", href=True):
    href = a["href"]
    if href.endswith(".mp3"):
        audio_links.append(href)

# Convert to full URLs
base = "https://www.soundhelix.com"
full_audio_links = [base + href for href in audio_links]

full_audio_links[:10]  # first 10


['https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-2.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-3.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-4.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-5.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-6.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-7.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-8.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-9.mp3',
 'https://www.soundhelix.com/examples/mp3/SoundHelix-Song-10.mp3']

In [10]:
# STEP 5 — Save Audio URLs into CSV
df_audio = pd.DataFrame({"audio_url": full_audio_links})
df_audio.to_csv("audio_links.csv", index=False)

df_audio.head()


Unnamed: 0,audio_url
0,https://www.soundhelix.com/examples/mp3/SoundH...
1,https://www.soundhelix.com/examples/mp3/SoundH...
2,https://www.soundhelix.com/examples/mp3/SoundH...
3,https://www.soundhelix.com/examples/mp3/SoundH...
4,https://www.soundhelix.com/examples/mp3/SoundH...
