# Web Scraping

#### 1. Importing Libraries

In [16]:
import pandas as pd # Used to build the final DataFrame from the scraped data
import requests # This is used to make HTTP requests to web pages (like GET)
from bs4 import BeautifulSoup # Parses the downloaded HTML so tags can be searched
import re # Regular expressions, used to pull the year and discoverer out of text

#### 2. Setting up URLs

In [17]:
# Root address of the site; relative links found on its pages are appended to it.
base_url = "https://www.scrapethissite.com"

# Page that lists every turtle family (one card per family).
family_list_url = f"{base_url}/pages/frames/?frame=i"

#### 3. Fetch the Family List Page

In [18]:
# Get the page content.
# The timeout (in seconds) keeps a stalled connection from hanging the notebook
# forever, and raise_for_status() stops the cell on an HTTP error response
# instead of silently parsing an error page and reporting zero families.
response = requests.get(family_list_url, timeout=10)
response.raise_for_status()

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all turtle family cards
family_cards = soup.find_all("div", class_="turtle-family-card")

# Print how many turtle families found
print(f"Found {len(family_cards)} turtle families.")

Found 14 turtle families.


#### 4. Prepare the Data Storage

In [19]:
# Column-oriented storage: each key is a column name and each value is the
# list of entries for that column (one entry per scraped turtle family).
# The comprehension creates a separate empty list for every column.
turtle_data = {
    column: []
    for column in ("turtle_name", "common_name", "discovered_year", "discovered_by")
}

#### 5. Scrape Each Turtle Detail Page

In [20]:
# Scrape every family card (and the detail page it links to) into turtle_data.
#
# For each card this records four values: the family name from the card itself,
# and the common name, discovery year and discoverer parsed from the detail
# page. Fields that cannot be found keep their fallback strings.

# Seconds to wait on a detail-page request before giving up, so that a stalled
# connection cannot hang the notebook indefinitely.
DETAIL_TIMEOUT_SECONDS = 10

# Compiled once here instead of being re-specified for every paragraph.
# Matches a standalone four-digit year from 1500 through 2030.
YEAR_PATTERN = re.compile(r"\b(1[5-9][0-9]{2}|20[0-2][0-9]|2030)\b")
# Matches the capitalised word that follows "by" (letters and hyphens only).
DISCOVERER_PATTERN = re.compile(r"by\s+([A-Z][a-zA-Z\-]+)")

# Start from empty columns so that re-running this cell does not append a
# second copy of every row.
for column_values in turtle_data.values():
    column_values.clear()

# Loop through each card
for card in family_cards:
    # Try to find the <h3> tag containing the family name
    family_name_tag = card.find("h3", class_="family-name")
    if family_name_tag is not None:
        # Extract and clean the text
        family_name = family_name_tag.text.strip()
    else:
        # Fallback if the tag is missing
        family_name = "Unknown Family"

    # Defaults, used when the card has no link or the detail page lacks a field
    common_name = "Not Available"
    year_found = "Unknown"
    discovered_by = "Unknown"

    # Detail page link
    link_tag = card.find("a", href=True)
    if link_tag:
        detail_url = base_url + link_tag["href"]
        detail_response = requests.get(detail_url, timeout=DETAIL_TIMEOUT_SECONDS)
        # Fail loudly on an HTTP error instead of parsing an error page and
        # recording fallback values as if they were real data.
        detail_response.raise_for_status()
        detail_soup = BeautifulSoup(detail_response.content, "html.parser")

        # Every matching paragraph is inspected; when several paragraphs
        # contain a match, the value from the last one is kept.
        for para in detail_soup.find_all("p", class_="lead"):
            text = para.get_text()

            # Common name
            strong_tag = para.find("strong", class_="common-name")
            if strong_tag:
                common_name = strong_tag.text.strip()

            # Year pattern
            year_match = YEAR_PATTERN.search(text)
            if year_match:
                year_found = year_match.group(0)

            # Discovered by pattern
            by_match = DISCOVERER_PATTERN.search(text)
            if by_match:
                discovered_by = by_match.group(1)

    # Store the info. All four columns are appended together, only after the
    # row has been fully gathered, so an exception raised while fetching or
    # parsing can never leave the column lists with different lengths.
    turtle_data["turtle_name"].append(family_name)
    turtle_data["common_name"].append(common_name)
    turtle_data["discovered_year"].append(year_found)
    turtle_data["discovered_by"].append(discovered_by)

In [21]:
# Turn the column lists into a table: one row per turtle family, with the
# dictionary keys as column names.
df = pd.DataFrame.from_dict(turtle_data)
df

Unnamed: 0,turtle_name,common_name,discovered_year,discovered_by
0,Carettochelyidae,Pig-nosed turtle,1887,Boulenger
1,Cheloniidae,Sea turtles,1811,Oppel
2,Chelydridae,Snapping turtles,1831,Gray
3,Dermatemydidae,Central American river turtle,1870,Gray
4,Dermochelyidae,Leatherback sea turtle,1843,Fitzinger
5,Emydidae,Pond or water turtles,1815,Rafinesque
6,Geoemydidae,"Asian river, leaf, roofed or Asian box turtles",1868,Theobald
7,Kinosternidae,Mud or musk turtles,1857,Agassiz
8,Platysternidae,Big-headed turtle,1869,Gray
9,Testudinidae,Tortoises,1788,Batsch
