In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re 
import time

# 1. Define the base URL of the website
base_url = "https://www.scrapethissite.com"
# Frame page that lists every family; only used for the start-up log line below
current_page_url = "https://www.scrapethissite.com/pages/frames/?frame=i"

# Seconds to wait for a response before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT_SECONDS = 10
# Pause between consecutive requests so the server is not overwhelmed
REQUEST_DELAY_SECONDS = 1
# Placeholder stored when a field cannot be extracted from a page
NOT_FOUND = "Not found"

# "discovered ... by <name>": the name is captured as a run of ASCII letters and whitespace
DISCOVERER_PATTERN = re.compile(r'discovered.*?\bby\s+([A-Za-z\s]+)', re.IGNORECASE)
# "in <four-digit year>"; the word boundaries reject "within 1999" and "in 18870"
YEAR_PATTERN = re.compile(r'\bin\s+(\d{4})\b')


def _parse_discovery(page_text):
    """Extract the discoverer and discovery year from a family page's text.

    Args:
        page_text: Visible text of the page, e.g. from ``soup.get_text()``.

    Returns:
        A ``(discoverer, discovery_year)`` tuple of strings. Either element is
        ``NOT_FOUND`` when the corresponding pattern does not match.
    """
    # Collapse every whitespace run to a single space so that line breaks in
    # the HTML source can neither split the sentence (``.`` does not match a
    # newline) nor end up inside the captured name.
    text = " ".join(page_text.split())

    discoverer = NOT_FOUND
    discovery_year = NOT_FOUND

    discoverer_match = DISCOVERER_PATTERN.search(text)
    if discoverer_match:
        discoverer = discoverer_match.group(1).strip()

    # Look for the year starting at the discovery sentence, so an unrelated
    # "in NNNN" earlier on the page is not picked up by mistake. Fall back to
    # the whole page only when that finds nothing.
    year_match = None
    if discoverer_match:
        year_match = YEAR_PATTERN.search(text, discoverer_match.start())
    if year_match is None:
        year_match = YEAR_PATTERN.search(text)
    if year_match:
        discovery_year = year_match.group(1)

    return discoverer, discovery_year


# List to store all scraped turtle family data
all_turtle_data = []

print(f"Starting to scrape from: {current_page_url}")

# Define all turtle families to scrape
turtle_families = [
    "Carettochelyidae", "Cheloniidae", "Chelydridae", "Dermatemydidae",
    "Dermochelyidae", "Emydidae", "Geoemydidae", "Kinosternidae",
    "Platysternidae", "Testudinidae", "Trionychidae", "Chelidae",
    "Pelomedusidae", "Podocnemididae"
]

# Loop through each turtle family to scrape individual family pages
for request_index, family_name in enumerate(turtle_families):
    # Throttle before every request except the first, so the delay is also
    # honoured after a failed request (the error path skips the rest of the loop).
    if request_index > 0:
        time.sleep(REQUEST_DELAY_SECONDS)

    print(f"Scraping family: {family_name}")

    # Construct URL for individual family page
    family_url = f"{base_url}/pages/frames/?frame=i&family={family_name}"

    try:
        response = requests.get(family_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses
        print(f"Error fetching family page {family_url}: {e}")
        continue  # Skip to next family on error

    soup = BeautifulSoup(response.text, 'html.parser')
    discoverer, discovery_year = _parse_discovery(soup.get_text())

    # One record per successfully fetched family page
    all_turtle_data.append({
        'Family_Name': family_name,
        'Discoverer': discoverer,
        'Discovery_Year': discovery_year,
        'URL': family_url
    })

print("Finished scraping all turtle families.")

# Create DataFrame and display results
df = pd.DataFrame(all_turtle_data)
df

Starting to scrape from: https://www.scrapethissite.com/pages/frames/?frame=i
Scraping family: Carettochelyidae
Scraping family: Cheloniidae
Scraping family: Chelydridae
Scraping family: Dermatemydidae
Scraping family: Dermochelyidae
Scraping family: Emydidae
Scraping family: Geoemydidae
Scraping family: Kinosternidae
Scraping family: Platysternidae
Scraping family: Testudinidae
Scraping family: Trionychidae
Scraping family: Chelidae
Scraping family: Pelomedusidae
Scraping family: Podocnemididae
Finished scraping all turtle families.


Unnamed: 0,Family_Name,Discoverer,Discovery_Year,URL
0,Carettochelyidae,Boulenger,1887,https://www.scrapethissite.com/pages/frames/?f...
1,Cheloniidae,Oppel,1811,https://www.scrapethissite.com/pages/frames/?f...
2,Chelydridae,Gray,1831,https://www.scrapethissite.com/pages/frames/?f...
3,Dermatemydidae,Gray,1870,https://www.scrapethissite.com/pages/frames/?f...
4,Dermochelyidae,Fitzinger,1843,https://www.scrapethissite.com/pages/frames/?f...
5,Emydidae,Rafinesque,1815,https://www.scrapethissite.com/pages/frames/?f...
6,Geoemydidae,Theobald,1868,https://www.scrapethissite.com/pages/frames/?f...
7,Kinosternidae,Agassiz,1857,https://www.scrapethissite.com/pages/frames/?f...
8,Platysternidae,Gray,1869,https://www.scrapethissite.com/pages/frames/?f...
9,Testudinidae,Batsch,1788,https://www.scrapethissite.com/pages/frames/?f...
