In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

main_url = 'https://www.scrapethissite.com/pages/frames/'
headers = {'User-Agent': 'Mozilla/5.0'}

main_response = requests.get(main_url, headers=headers)
main_response.raise_for_status()
main_soup = BeautifulSoup(main_response.content, 'html.parser')

iframe_tags = main_soup.find_all('iframe')
print(f"Found {len(iframe_tags)} iframe(s)")

all_data = []

for index, iframe in enumerate(iframe_tags):
    iframe_src = iframe.get('src')
    if not iframe_src:
        continue

    iframe_url = urljoin(main_url, iframe_src)
    print(f"[{index+1}] Scraping iframe: {iframe_url}")

    iframe_response = requests.get(iframe_url, headers=headers)
    iframe_response.raise_for_status()
    iframe_soup = BeautifulSoup(iframe_response.content, 'html.parser')

    turtle_links = iframe_soup.select('div.turtle-family-card a')

    for link in turtle_links:
        family_url = urljoin(iframe_url, link.get('href'))
        print(f"  → Following link to: {family_url}")

        family_response = requests.get(family_url, headers=headers)
        family_response.raise_for_status()
        family_soup = BeautifulSoup(family_response.content, 'html.parser')

        family_name = family_soup.find('h3', class_='family-name')
        description = family_soup.find('strong', class_='common-name')
        lead = family_soup.find('p', class_='lead')

        family_name = family_name.text.strip() if family_name else ''
        description = description.text.strip() if description else ''

        year_discovered = ''
        discovered_by = ''
        if lead:
            lead_text = lead.text.strip()

            # Extract numbers (year)
            nums = re.findall(r'\d+', lead_text)
            if nums:
                year_discovered = nums[0]

            # Extract everything AFTER 'by' (exclude 'by')
            match = re.search(r'\bby\b\s*(.*)', lead_text, re.IGNORECASE)
            if match:
                discovered_by = match.group(1).strip()

        all_data.append({
            'Family Name': family_name,
            'Description': description,
            'Year Discovered': year_discovered,
            'Discovered By': discovered_by
        })

df = pd.DataFrame(all_data)
df


Found 1 iframe(s)
[1] Scraping iframe: https://www.scrapethissite.com/pages/frames/?frame=i
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Carettochelyidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Cheloniidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Chelydridae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Dermatemydidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Dermochelyidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Emydidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Geoemydidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Kinosternidae
  → Following link to: https://www.scrapethissite.com/pages/frames/?frame=i&family=Platysternidae
  → Following link to: https://www.scrap

Unnamed: 0,Family Name,Description,Year Discovered,Discovered By
0,Carettochelyidae,Pig-nosed turtle,1887,Boulenger.
1,Cheloniidae,Sea turtles,1811,Oppel.
2,Chelydridae,Snapping turtles,1831,Gray.
3,Dermatemydidae,Central American river turtle,1870,Gray.
4,Dermochelyidae,Leatherback sea turtle,1843,Fitzinger.
5,Emydidae,Pond or water turtles,1815,Rafinesque.
6,Geoemydidae,"Asian river, leaf, roofed or Asian box turtles",1868,Theobald.
7,Kinosternidae,Mud or musk turtles,1857,Agassiz.
8,Platysternidae,Big-headed turtle,1869,Gray.
9,Testudinidae,Tortoises,1788,Batsch.
