In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from urllib.parse import urljoin

In [11]:
page_url = 'https://www.scrapethissite.com/pages/frames/?frame=i'
base_url = 'https://www.scrapethissite.com'

In [12]:

response = requests.get(page_url)
if response.status_code == 200:
    print("Access granted to the page.")
    soup = BeautifulSoup(response.content, 'html.parser')
else:
    print(f"Failed to access the page. Status code: {response.status_code}")
    exit()

turtles = soup.find_all('div', class_='col-md-4 turtle-family-card')

Access granted to the page.


In [16]:
# Initialize empty dictionary to store turtle data
turtles_data = {
    "Name": [],
    "Known_As": [],
    "Discovery_Year": [],
    "Discovered_By": []
}

# Add a counter to track progress
total_turtles = len(turtles)
print(f"Found {total_turtles} turtle families to process")

# Process each turtle family
for i, turtle in enumerate(turtles):
    # Display progress
    print(f"Processing turtle {i+1}/{total_turtles}...")
    
    # STEP 1: Extract the turtle family name
    name_tag = turtle.find('h3', class_='family-name')
    if name_tag:
        name = name_tag.text.strip()
        turtles_data['Name'].append(name)
    else:
        turtles_data['Name'].append("Unknown")
        print(f"  Warning: Could not find name for turtle #{i+1}")

    # STEP 2: Find and follow the detail link
    link_tag = turtle.find('a', href=True)
    if link_tag:
        # Create the full URL and request the detail page
        detail_url = base_url + link_tag['href']
        
        try:
            # Add basic error handling for network requests
            detail_response = requests.get(detail_url, timeout=10)
            detail_response.raise_for_status()  # Will raise an exception for 4XX/5XX responses
            
            # Parse the detail page HTML
            detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
            
            # Extract the lead paragraph with detailed information
            info = detail_soup.find('p', class_='lead')

            if info:
                # STEP 3: Extract the common name
                known_tag = info.find('strong', class_='common-name')
                known_as = known_tag.text.strip() if known_tag else "Unknown"
                turtles_data['Known_As'].append(known_as)

                # STEP 4: Extract the discovery year using regex
                # Look for 4-digit years between 1700-2099
                year_match = re.search(r'\b(1[7-9]\d{2}|20\d{2})\b', info.text)
                year = year_match.group(0) if year_match else "Unknown"
                turtles_data['Discovery_Year'].append(year)

                # STEP 5: Extract the discoverer name using regex
                # Look for text that follows "by " and starts with a capital letter
                discoverer_match = re.search(r'by ([A-Z][a-zA-Z\s\.\-]*)', info.text)
                discoverer = discoverer_match.group(1).strip() if discoverer_match else "Unknown"
                turtles_data['Discovered_By'].append(discoverer)
            else:
                # Handle missing information
                turtles_data['Known_As'].append("Unknown")
                turtles_data['Discovery_Year'].append("Unknown")
                turtles_data['Discovered_By'].append("Unknown")
                print(f"  Warning: No detailed information found for {name}")
        
        except requests.exceptions.RequestException as e:
            # Handle any request errors (timeout, connection errors, etc.)
            turtles_data['Known_As'].append("Error")
            turtles_data['Discovery_Year'].append("Error")
            turtles_data['Discovered_By'].append("Error")
            print(f"  Error accessing {detail_url}: {e}")
    else:
        # Handle missing link
        turtles_data['Known_As'].append("No Link")
        turtles_data['Discovery_Year'].append("No Link")
        turtles_data['Discovered_By'].append("No Link")
        print(f"  Warning: No detail link found for turtle #{i+1}")

print("\nScraping completed!")
print(f"Collected data for {len(turtles_data['Name'])} turtle families")

Found 14 turtle families to process
Processing turtle 1/14...
Processing turtle 2/14...
Processing turtle 2/14...
Processing turtle 3/14...
Processing turtle 3/14...
Processing turtle 4/14...
Processing turtle 4/14...
Processing turtle 5/14...
Processing turtle 5/14...
Processing turtle 6/14...
Processing turtle 6/14...
Processing turtle 7/14...
Processing turtle 7/14...
Processing turtle 8/14...
Processing turtle 8/14...
Processing turtle 9/14...
Processing turtle 9/14...
Processing turtle 10/14...
Processing turtle 10/14...
Processing turtle 11/14...
Processing turtle 11/14...
Processing turtle 12/14...
Processing turtle 12/14...
Processing turtle 13/14...
Processing turtle 13/14...
Processing turtle 14/14...
Processing turtle 14/14...

Scraping completed!
Collected data for 14 turtle families

Scraping completed!
Collected data for 14 turtle families


In [None]:
# Create a DataFrame from the collected data
df = pd.DataFrame(turtles_data)

# Add a count column to make it easier to reference rows
df['ID'] = range(1, len(df) + 1)

# Reorder columns to put ID first
df = df[['ID', 'Name', 'Known_As', 'Discovery_Year', 'Discovered_By']]

# Display summary statistics
print(f"Total rows: {len(df)}")
print(f"Missing values: {df.isna().sum().sum()}")

# Replace empty strings or None values with "Unknown"
df = df.fillna("Unknown")

# Set display options to show all rows
pd.set_option('display.max_rows', None)

# Display the full DataFrame
print("\nTurtle Families Data:")
df

Unnamed: 0,Name,Known_As,Discovery_Year,Discovered_By
0,Carettochelyidae,Pig-nosed turtle,1887,Boulenger.
1,Cheloniidae,Sea turtles,1811,Oppel.
2,Chelydridae,Snapping turtles,1831,Gray.
3,Dermatemydidae,Central American river turtle,1870,Gray.
4,Dermochelyidae,Leatherback sea turtle,1843,Fitzinger.
5,Emydidae,Pond or water turtles,1815,Rafinesque.
6,Geoemydidae,"Asian river, leaf, roofed or Asian box turtles",1868,Theobald.
7,Kinosternidae,Mud or musk turtles,1857,Agassiz.
8,Platysternidae,Big-headed turtle,1869,Gray.
9,Testudinidae,Tortoises,1788,Batsch.


In [None]:
# Import datetime for timestamping files
from datetime import datetime

# Get current date and time for filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f'turtles_data_{timestamp}.csv'

# Save to CSV with index=False to exclude row numbers
df.to_csv(csv_filename, index=False)
print(f"Data saved to {csv_filename}")

# OPTIONAL: Save to Excel format with formatting (if xlsxwriter is installed)
try:
    excel_filename = f'turtles_data_{timestamp}.xlsx'
    
    # Create a writer for Excel
    with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
        # Write the dataframe to Excel
        df.to_excel(writer, sheet_name='Turtle Families', index=False)
        
        # Get the xlsxwriter workbook and worksheet objects
        workbook = writer.book
        worksheet = writer.sheets['Turtle Families']
        
        # Add a header format
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'bg_color': '#D8E4BC',
            'border': 1
        })
        
        # Apply the header format to the header row
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)
            
        # Auto-adjust columns' width
        for i, col in enumerate(df.columns):
            column_width = max(df[col].astype(str).map(len).max(), len(col)) + 2
            worksheet.set_column(i, i, column_width)
    
    print(f"Data also saved to Excel: {excel_filename}")
except Exception as e:
    print(f"Could not save to Excel format: {e}")
    print("Tip: Install xlsxwriter with 'pip install xlsxwriter' to enable Excel export")

Data saved to turtles_data.csv


In [None]:
# Basic data visualization
import matplotlib.pyplot as plt

# Set a larger figure size
plt.figure(figsize=(10, 6))

# Count discoveries by century (assuming Discovery_Year is a string like '1812')
df['Century'] = df['Discovery_Year'].apply(
    lambda x: '18th' if str(x).startswith('17') else
              '19th' if str(x).startswith('18') else
              '20th' if str(x).startswith('19') else
              '21st' if str(x).startswith('20') else 'Unknown'
)

# Create a bar chart of discoveries by century
century_counts = df['Century'].value_counts().sort_index()
plt.bar(century_counts.index, century_counts.values, color='skyblue')

# Add labels and title
plt.xlabel('Century')
plt.ylabel('Number of Discoveries')
plt.title('Turtle Family Discoveries by Century')

# Add value labels on top of each bar
for i, v in enumerate(century_counts.values):
    plt.text(i, v + 0.1, str(v), ha='center')

plt.tight_layout()
plt.show()

# Display percentage of discoveries by century
century_percentage = (century_counts / century_counts.sum() * 100).round(1)
print("\nPercentage of Discoveries by Century:")
for century, percentage in century_percentage.items():
    print(f"{century}: {percentage}%")

In [None]:
# Reusable function for future scraping projects
def scrape_turtle_data(url, save_to_csv=True):
    """
    A reusable function to scrape turtle family data from scrapethissite.com
    
    Parameters:
    -----------
    url : str
        The URL of the page containing turtle family cards
    save_to_csv : bool, default=True
        Whether to save the results to a CSV file
        
    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the scraped turtle data
    """
    # Initialize session and get the page
    session = requests.Session()
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None
    
    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all turtle cards
    turtle_cards = soup.find_all('div', class_='col-md-4 turtle-family-card')
    
    if not turtle_cards:
        print("No turtle cards found on the page")
        return None
    
    print(f"Found {len(turtle_cards)} turtle families")
    
    # Initialize data dictionary
    data = {
        "Name": [],
        "Known_As": [],
        "Discovery_Year": [],
        "Discovered_By": []
    }
    
    # Process each turtle card
    for i, card in enumerate(turtle_cards):
        print(f"Processing turtle {i+1}/{len(turtle_cards)}...")
        
        # Extract name
        name_tag = card.find('h3', class_='family-name')
        name = name_tag.text.strip() if name_tag else "Unknown"
        data['Name'].append(name)
        
        # Find detail link
        link_tag = card.find('a', href=True)
        if link_tag:
            # Get detail page
            detail_url = 'https://www.scrapethissite.com' + link_tag['href']
            try:
                detail_response = session.get(detail_url, timeout=10)
                detail_response.raise_for_status()
                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                
                # Extract information
                info = detail_soup.find('p', class_='lead')
                if info:
                    # Get common name
                    known_tag = info.find('strong', class_='common-name')
                    known_as = known_tag.text.strip() if known_tag else "Unknown"
                    
                    # Get year
                    year_match = re.search(r'\b(1[7-9]\d{2}|20\d{2})\b', info.text)
                    year = year_match.group(0) if year_match else "Unknown"
                    
                    # Get discoverer
                    discoverer_match = re.search(r'by ([A-Z][a-zA-Z\s\.\-]*)', info.text)
                    discoverer = discoverer_match.group(1).strip() if discoverer_match else "Unknown"
                    
                    # Add to data
                    data['Known_As'].append(known_as)
                    data['Discovery_Year'].append(year)
                    data['Discovered_By'].append(discoverer)
                else:
                    data['Known_As'].append("Unknown")
                    data['Discovery_Year'].append("Unknown")
                    data['Discovered_By'].append("Unknown")
            except requests.exceptions.RequestException:
                data['Known_As'].append("Error")
                data['Discovery_Year'].append("Error")
                data['Discovered_By'].append("Error")
        else:
            data['Known_As'].append("No Link")
            data['Discovery_Year'].append("No Link")
            data['Discovered_By'].append("No Link")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Add ID column
    df['ID'] = range(1, len(df) + 1)
    df = df[['ID', 'Name', 'Known_As', 'Discovery_Year', 'Discovered_By']]
    
    # Save to CSV if requested
    if save_to_csv:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'turtle_data_{timestamp}.csv'
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    
    return df

# Example usage:
# new_df = scrape_turtle_data('https://www.scrapethissite.com/pages/frames/?frame=i')

# Simple Web Scraping for iFrames

This notebook demonstrates how to scrape content from a webpage that contains frames or iframes. We'll follow these steps:

1. Load the parent page
2. Find all frame/iframe elements
3. Load the content of each frame
4. Extract data from each frame
5. Handle errors and avoid duplicates
6. Organize the data into a DataFrame

## What We're Scraping

The example site "scrapethissite.com" has a frames demo page that we'll use to practice extracting content from multiple frames.

# Issues and Solutions

When web scraping, you'll often encounter various challenges:

## Common Issues Fixed in This Code:

1. **404 Errors**: Some frames had invalid URLs that returned 404 errors
   - *Solution*: Added proper error handling with try/except blocks

2. **Duplicate Processing**: The same frame was being processed multiple times
   - *Solution*: Added a set to track processed URLs

3. **Missing Data**: Some expected elements were not found in the frames
   - *Solution*: Added fallbacks to look for alternative elements and proper default values

4. **Inconsistent Data Structure**: The DataFrame had mixed column types
   - *Solution*: Standardized the data structure and filled missing values

## Best Practices for Web Scraping:

- Add delays between requests to avoid overloading the server
- Handle errors gracefully
- Check if content exists before trying to extract it
- Avoid scraping too aggressively (which could get your IP blocked)
- Respect robots.txt and website terms of service