### Scrape GPT o1 instructions
- The goal is to scrape multiple items on multiple pages to extract image and text information about Smithsonian Collection items. 
- The base URL is: https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22
- all items on the page are contained in: <ul class="search-results masonry-grid reset-list masonry-processed">
- within that are <li> that contain <a> tag with href and class="inner" that contains the link that leads to that item's component_information (e.g. <a href="/object/archives/components/sova-acma-06-016-ref18" class="inner">) 

- once we click through to that href, on the component_information page, we want to extract information and create a dictionary of dictionaries:
- Key = h1
- then, within h2 are a series of <dl> containing <dt>, which will be the key of this nested dictionary, and <dd> which will be the value. There may be multiple <dd> per <dt>

- once this is done, we will return to base URL and move on to next <li> and repeat. 
- once all <li> are scraped, we will move on to the next page. 
- do this by inserting 'page=1&' into the base URL between 'collection-images?' and 'edan_q'. The page parameter is zero-based, so 'page=1' takes you to the second page, i.e. "https://www.si.edu/search/collection-images?page=1&edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22", then page 3 is: https://www.si.edu/search/collection-images?page=2&edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22 and so on. Then we repeat the process on the subsequent pages, building a database of dictionaries of dictionaries.
- write this code, and include printed information to confirm it's working along the way.
- please cap this first code at the first 3 pages, as I want to confirm it works before going further. 
- Good luck!

In [11]:
# import requests
# from bs4 import BeautifulSoup
# import time
# import json
# import random

# # Base URL components
# BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"

# # Headers to mimic a browser visit
# HEADERS = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
# }

# # Function to construct URL with page number
# def construct_url(page_number):
#     if page_number > 1:
#         # Insert page parameter after collection-images? for subsequent pages
#         page_param = f"page={page_number - 1}&"
#         url_parts = BASE_URL.split("?")
#         return f"{url_parts[0]}?{page_param}{url_parts[1]}"
#     return BASE_URL

# # Function to extract item links from a page
# def get_item_links(soup):
#     item_links = []
#     for li in soup.find_all("li", attrs={"ogmt-id": True}):
#         a_tag = li.find("a", class_="inner")
#         if a_tag and 'href' in a_tag.attrs:
#             item_links.append(a_tag['href'])
#     return item_links

# # Function to extract information from an item's component_information page
# def extract_item_info(item_url):
#     full_url = f"https://www.si.edu{item_url}"
#     response = requests.get(full_url, headers=HEADERS)
#     if response.status_code != 200:
#         print(f"Failed to retrieve {full_url}")
#         return None

#     # Try to decode the content, handle potential gzip errors
#     try:
#         content = response.content
#     except requests.exceptions.ContentDecodingError:
#             print(f"    Decompression error for {full_url}, skipping...")
#             return None
#     except Exception as e:
#         print(f"    Unexpected error for {full_url}: {str(e)}")
#         return None 
#     print(f"Retrieved {full_url}")
#     soup = BeautifulSoup(response.content, 'html.parser')
    
#     # Extract the h1 tag
#     h1_tag = soup.find("h1")
#     if not h1_tag:
#         print(f"No h1 tag found in {full_url}")
#         return None
#     h1_text = h1_tag.get_text(strip=True)
    
#     # Initialize the nested dictionary
#     item_dict = {}
#     item_dict[h1_text] = {}
    
#     # Find all h2 tags
#     h2_tags = soup.find_all("h2")
#     for h2 in h2_tags:
#         # Within each h2, find the following <dl>
#         dl = h2.find_next_sibling("dl")
#         if dl:
#             dt_tags = dl.find_all("dt")
#             dd_tags = dl.find_all("dd")
#             for dt, dd in zip(dt_tags, dd_tags):
#                 key = dt.get_text(strip=True)
#                 value = dd.get_text(strip=True)
#                 # Handle multiple <dd> per <dt>
#                 if key in item_dict[h1_text]:
#                     if isinstance(item_dict[h1_text][key], list):
#                         item_dict[h1_text][key].append(value)
#                     else:
#                         item_dict[h1_text][key] = [item_dict[h1_text][key], value]
#                 else:
#                     item_dict[h1_text][key] = value
#     return item_dict

# # Main scraping function
# def scrape_smithsonian_collection(total_pages=1):
#     all_items = {}
#     for page in range(1, total_pages + 1):
#         url = construct_url(page)
#         print(f"Scraping Page {page}: {url}")
#         response = requests.get(url, headers=HEADERS)
#         if response.status_code != 200:
#             print(f"Failed to retrieve page {page}")
#             continue
        
#         soup = BeautifulSoup(response.content, 'html.parser')
#         item_links = get_item_links(soup)
#         print(f"Found {len(item_links)} items on Page {page}")
        
#         for idx, item_link in enumerate(item_links, 1):
#             print(f"  Scraping Item {idx}: {item_link}")
#             item_info = extract_item_info(item_link)
#             if item_info:
#                 all_items.update(item_info)
#             else:
#                 print(f"    Failed to extract info for {item_link}")
#             # Add a random pause between 0.5 and 2.2 seconds
#             sleep_time = random.uniform(.5, 2.2)
#             print(f"    Sleeping for {sleep_time:.2f} seconds...")
#             time.sleep(sleep_time)  # Randomized pause
#         print(f"Completed scraping Page {page}\n")
    
#     return all_items

# # Execute the scraping
# if __name__ == "__main__":
#     scraped_data = scrape_smithsonian_collection(total_pages=1)
#     print("Scraping Completed. Total items scraped:", len(scraped_data))
#     # Optionally, save the data to a file
    
#     with open('smithsonian_collection.json', 'w') as f:
#         json.dump(scraped_data, f, indent=4)

Scraping Page 1: https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22
Found 54 items on Page 1
  Scraping Item 1: /object/archives/components/sova-acma-06-016-ref18
Retrieved https://www.si.edu/object/archives/components/sova-acma-06-016-ref18
    Sleeping for 1.20 seconds...
  Scraping Item 2: /object/archives/components/sova-eepa-1985-014-ref1532
Retrieved https://www.si.edu/object/archives/components/sova-eepa-1985-014-ref1532
    Sleeping for 1.09 seconds...
  Scraping Item 3: /object/archives/components/sova-eepa-1985-014-ref1254
Retrieved https://www.si.edu/object/archives/components/sova-eepa-1985-014-ref1254
    Sleeping for 0.83 seconds...
  Scraping Item 4: /object/archives/components/sova-eepa-1985-014-ref1873
Retrieved https://www.si.edu/object/archives/components/sova-eepa-1985-014-ref1873
    Sleeping for 1.53 seconds...
  Scraping Item 5: /object/archives/components/sova-eepa-1985-014-ref1285
Retrieved https://www.si.edu/object/archive

# Original Scrape without "Timeout" Adjustments

In [9]:
import requests
from bs4 import BeautifulSoup
import time
import json
import random

# Base URL components
# Search-results URL for Smithsonian collection images filtered to the topic
# "Costume" (the filter value is URL-encoded). construct_url() inserts a
# "page=" parameter right after the "?" for pages beyond the first.
BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"

# Pool of browser-like User-Agent strings to pick from.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)",
    # Add more user agents as needed
]

# Headers passed to every requests.get() call in this cell.
# NOTE: random.choice() runs once, when this assignment executes, so the same
# User-Agent is used for the whole run rather than rotating per request.
HEADERS = {
    "User-Agent": random.choice(USER_AGENTS)
}

# Function to construct URL with page number
def construct_url(page_number):
    """Return the search-results URL for a 1-based page number.

    Page 1 is BASE_URL unchanged. For later pages the site's zero-based
    ``page=`` parameter is placed immediately after the ``?``, so page 2
    becomes ``page=1``, page 3 becomes ``page=2``, and so on.
    """
    needs_page_param = page_number > 1
    if not needs_page_param:
        return BASE_URL

    zero_based_page = page_number - 1
    pieces = BASE_URL.split("?")
    return "{}?page={}&{}".format(pieces[0], zero_based_page, pieces[1])

# Function to extract item links from a page
def get_item_links(soup):
    """Collect the item hrefs listed on a parsed search-results page.

    Looks at every <li> that carries an ``ogmt-id`` attribute, takes its
    first <a class="inner"> and keeps the anchor's ``href``. List items
    without such an anchor, or whose anchor has no ``href``, are skipped.

    Returns the hrefs in document order (site-relative paths such as
    ``/object/archives/components/...``).
    """
    anchors = (
        entry.find("a", class_="inner")
        for entry in soup.find_all("li", attrs={"ogmt-id": True})
    )
    return [
        anchor['href']
        for anchor in anchors
        if anchor and 'href' in anchor.attrs
    ]

# Updated function to extract information from an item's component_information page
def _merge_dl_values(record, key, values):
    """Fold the <dd> texts collected for one <dt> label into ``record``.

    Does nothing when ``key`` is empty or no values were collected.
    A label seen for the first time is stored as a plain string when it has
    exactly one value and as a list otherwise. A label that is already
    present is turned into (or kept as) a list and the new values are
    appended to it.
    """
    if not key or not values:
        return

    if key not in record:
        record[key] = values[0] if len(values) == 1 else list(values)
        return

    existing = record[key]
    if isinstance(existing, list):
        existing.extend(values)
    else:
        # Promote the earlier scalar to a list so nothing is lost.
        record[key] = [existing, *values]


def extract_item_info(item_url):
    """Fetch one item page and return its metadata keyed by the page title.

    Args:
        item_url: Site-relative path of the item page (as found in the
            search results); it is appended to ``https://www.si.edu``.

    Returns:
        ``{h1_text: fields}`` where ``fields`` maps each <dt> label on the
        page to its <dd> text (a string for a single value, a list for
        several), plus ``"Museum"`` (text of the first <a> after the <h1>)
        and ``"Image_URL"`` when those are found. Returns ``None`` when the
        request fails, the status is not 200, or the page has no <h1>.
    """
    full_url = f"https://www.si.edu{item_url}"
    try:
        response = requests.get(full_url, headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request exception for {full_url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {full_url} with status code {response.status_code}")
        return None

    # The body was already downloaded and decoded inside requests.get()
    # (stream is not enabled), so decoding problems surface as a
    # RequestException above and reading .content here cannot fail.
    print(f"Retrieved {full_url}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # The page title is the key of the outer dictionary.
    h1_tag = soup.find("h1")
    if not h1_tag:
        print(f"No h1 tag found in {full_url}")
        return None
    h1_text = h1_tag.get_text(strip=True)

    fields = {}
    item_dict = {h1_text: fields}

    # The "Museum" value is taken from the first <a> that follows the <h1>.
    next_a_tag = h1_tag.find_next("a")
    if next_a_tag and next_a_tag.get_text(strip=True):
        fields["Museum"] = next_a_tag.get_text(strip=True)
    else:
        print(f"No Museum information found in {full_url}")

    # Walk the direct children of every <dl>: each <dt> starts a new label
    # and the <dd> elements that follow it are that label's values.
    for dl in soup.find_all("dl"):
        current_key = None
        values = []
        for child in dl.children:
            if child.name == "dt":
                # A new label begins; store what was gathered for the last one.
                _merge_dl_values(fields, current_key, values)
                current_key = child.get_text(strip=True)
                values = []
            elif child.name == "dd":
                values.append(child.get_text(strip=True))
        # Store the values of the final label in this <dl>.
        _merge_dl_values(fields, current_key, values)

    # Image URL: <span class="media-inner"> -> <a class="modal-trigger image">,
    # read from the anchor's data-source attribute.
    media_inner_span = soup.find("span", class_="media-inner")
    if media_inner_span:
        a_tag = media_inner_span.find("a", class_="modal-trigger image")
        if a_tag and a_tag.has_attr('data-source'):
            image_url = a_tag['data-source']
            fields["Image_URL"] = image_url
            print(f"    Image URL extracted: {image_url}")
        else:
            print(f"    No image URL found in {full_url}")
    else:
        print(f"    No media-inner span found in {full_url}")

    return item_dict

# Main scraping function
def scrape_smithsonian_collection(total_pages=3, max_items=None):
    """Scrape item metadata from the first ``total_pages`` result pages.

    Args:
        total_pages: Number of search-result pages to visit, starting at
            page 1.
        max_items: Optional cap on the number of successfully scraped
            items; ``None`` (the default) means no cap. Useful for a quick
            trial run.

    Returns:
        A dict mapping each item's <h1> title to its field dictionary, as
        produced by ``extract_item_info``. Because the title is the key, two
        items with the same title collapse into one entry (the later one
        wins); a warning is printed when that happens.
    """
    all_items = {}
    total_scraped = 0  # successful extractions so far, compared against max_items

    for page in range(1, total_pages + 1):
        url = construct_url(page)
        print(f"Scraping Page {page}: {url}")
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Request exception for page {page}: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page} with status code {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        item_links = get_item_links(soup)
        print(f"Found {len(item_links)} items on Page {page}")

        for idx, item_link in enumerate(item_links, 1):
            if max_items is not None and total_scraped >= max_items:
                print(f"Reached the maximum of {max_items} items. Stopping the scraper.")
                return all_items

            print(f"  Scraping Item {idx}: {item_link}")
            item_info = extract_item_info(item_link)
            if item_info:
                for title in item_info:
                    if title in all_items:
                        # dict.update() below replaces the earlier entry.
                        print(f"    Warning: duplicate title {title!r}; replacing the earlier entry")
                all_items.update(item_info)
                total_scraped += 1  # count only successful extractions
            else:
                print(f"    Failed to extract info for {item_link}")

            # Pause 1-3 seconds between item requests to avoid hammering the site.
            sleep_time = random.uniform(1, 3)
            print(f"    Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
        print(f"Completed scraping Page {page}\n")

    print("Reached the end of the pages.")
    return all_items

# Execute the scraping
if __name__ == "__main__":
    # Scrape the first three result pages.
    scraped_data = scrape_smithsonian_collection(total_pages=3)
    # The result is keyed by item title, so this counts distinct titles.
    print("Scraping Completed. Total items scraped:", len(scraped_data))
    # Optionally, save the data to a file
    
    # Mode 'w' replaces any existing smithsonian_collection.json.
    with open('smithsonian_collection.json', 'w') as f:
        json.dump(scraped_data, f, indent=4)

Scraping Page 1: https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22
Found 54 items on Page 1
  Scraping Item 1: /object/archives/components/sova-acma-06-016-ref18


KeyboardInterrupt: 

# With "Timeout" Adjustments

In [11]:
import requests
from bs4 import BeautifulSoup
import time
import json
import random
from urllib.parse import urlparse
import logging

# Set up logging
# INFO level with a message-only format, so log lines carry no level or
# timestamp prefix.
logging.basicConfig(level=logging.INFO, format='%(message)s')

# Base URL components
# Search-results URL for Smithsonian collection images filtered to the topic
# "Costume" (the filter value is URL-encoded). construct_url() inserts a
# "page=" parameter right after the "?" for pages beyond the first.
BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"

# Pool of browser-like User-Agent strings; one is picked at random when the
# scraping session is created.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)",
    # Add more user agents as needed
]

# Function to check robots.txt
def can_scrape(url, user_agent='*'):
    """Report whether the site's robots.txt permits fetching ``url``.

    Args:
        url: The URL that is about to be scraped; robots.txt is looked up
            on the same scheme and host.
        user_agent: User-Agent to evaluate the rules for (``'*'`` matches
            the generic rules).

    Returns:
        ``True`` when fetching is allowed, ``False`` otherwise. The status of
        the robots.txt request is interpreted the same way
        ``urllib.robotparser.RobotFileParser.read`` does:

        * 200 - the rules are parsed and consulted;
        * 401 / 403 - everything is treated as disallowed;
        * any other 4xx (e.g. 404, no robots.txt) - everything is allowed;
        * anything else, or a network failure - ``False`` (stay on the safe
          side when the rules cannot be determined).
    """
    from urllib.robotparser import RobotFileParser

    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    try:
        # A timeout keeps an unresponsive server from hanging the scraper.
        response = requests.get(robots_url, timeout=10)
    except requests.RequestException as e:
        logging.warning(f"Could not fetch {robots_url}: {e}")
        return False  # Rules unknown: do not scrape

    status = response.status_code
    if status == 200:
        rp = RobotFileParser()
        rp.parse(response.text.splitlines())
        return rp.can_fetch(user_agent, url)
    if status in (401, 403):
        return False
    if 400 <= status < 500:
        # No robots.txt published: nothing is disallowed.
        return True
    return False  # Server error or unexpected status: do not scrape

# Function to construct URL with page number
def construct_url(page_number):
    """Build the search-results URL for the given 1-based page number.

    The first page is BASE_URL as-is. Every later page gets the site's
    zero-based ``page=`` parameter (page 2 -> ``page=1``) inserted directly
    after the ``?`` of BASE_URL.
    """
    if page_number > 1:
        zero_based_page = page_number - 1
        segments = BASE_URL.split("?")
        endpoint = segments[0]
        query = segments[1]
        return f"{endpoint}?page={zero_based_page}&{query}"
    return BASE_URL

# Function to extract item links from a page
def get_item_links(soup):
    """Return the href of each item listed on a parsed search-results page.

    Only <li> elements that have an ``ogmt-id`` attribute are considered.
    For each one the first <a class="inner"> is looked up; entries without
    that anchor, or whose anchor lacks an ``href``, contribute nothing.
    The hrefs come back in document order.
    """
    hrefs = []
    for entry in soup.find_all("li", attrs={"ogmt-id": True}):
        anchor = entry.find("a", class_="inner")
        if not anchor:
            continue
        if 'href' not in anchor.attrs:
            continue
        hrefs.append(anchor['href'])
    return hrefs

# Function to extract information from an item's component_information page
def _extend_dl_values(record, key, values):
    """Append the <dd> texts collected for one <dt> label to ``record[key]``.

    Does nothing when ``key`` is empty or no values were collected. Values
    are always stored as a list. If ``record`` already holds a non-list value
    under ``key`` (for example the scalar "Museum" entry), that value is kept
    as the first list element instead of raising AttributeError on
    ``.extend``.
    """
    if not key or not values:
        return

    if key not in record:
        record[key] = list(values)
        return

    existing = record[key]
    if isinstance(existing, list):
        existing.extend(values)
    else:
        record[key] = [existing, *values]


def extract_item_info(session, item_url):
    """Fetch one item page through ``session`` and return its metadata.

    Args:
        session: A ``requests.Session`` (or compatible object) used for the
            GET request.
        item_url: Site-relative path of the item page; it is appended to
            ``https://www.si.edu``.

    Returns:
        ``{h1_text: fields}`` where ``fields`` maps each <dt> label on the
        page to the list of its <dd> texts, plus ``"Museum"`` (text of the
        first <a> after the <h1>) and ``"Image_URL"`` when those are found.
        Returns ``None`` when the request fails, the server answers with an
        error status, or the page has no <h1>.
    """
    full_url = f"https://www.si.edu{item_url}"
    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request exception for {full_url}: {e}")
        return None

    content = response.content
    logging.info(f"Retrieved {full_url}")
    soup = BeautifulSoup(content, 'html.parser')

    # The page title is the key of the outer dictionary.
    h1_tag = soup.find("h1")
    if not h1_tag:
        logging.warning(f"No h1 tag found in {full_url}")
        return None
    h1_text = h1_tag.get_text(strip=True)

    fields = {}
    item_dict = {h1_text: fields}

    # The "Museum" value is taken from the first <a> that follows the <h1>.
    next_a_tag = h1_tag.find_next("a")
    if next_a_tag and next_a_tag.get_text(strip=True):
        museum_name = next_a_tag.get_text(strip=True)
        fields["Museum"] = museum_name
    else:
        logging.warning(f"No Museum information found in {full_url}")

    # Walk the direct children of every <dl>: each <dt> starts a new label
    # and the <dd> elements that follow it are that label's values.
    for dl in soup.find_all("dl"):
        current_key = None
        values = []
        for child in dl.children:
            if child.name == "dt":
                # A new label begins; store what was gathered for the last one.
                _extend_dl_values(fields, current_key, values)
                current_key = child.get_text(strip=True)
                values = []
            elif child.name == "dd":
                dd_text = child.get_text(strip=True)
                values.append(dd_text)
        # Store the values of the final label in this <dl>.
        _extend_dl_values(fields, current_key, values)

    # Image URL: <span class="media-inner"> -> <a class="modal-trigger image">,
    # read from the anchor's data-source attribute.
    media_inner_span = soup.find("span", class_="media-inner")
    if media_inner_span:
        a_tag = media_inner_span.find("a", class_="modal-trigger image")
        if a_tag and a_tag.has_attr('data-source'):
            image_url = a_tag['data-source']
            fields["Image_URL"] = image_url
            logging.info(f"    Image URL extracted: {image_url}")
        else:
            logging.warning(f"    No image URL found in {full_url}")
    else:
        logging.warning(f"    No media-inner span found in {full_url}")

    return item_dict

# Main scraping function
def scrape_smithsonian_collection(total_pages=3):
    """Scrape item metadata from the first ``total_pages`` result pages.

    One ``requests.Session`` with a randomly chosen User-Agent is used for
    all requests and is closed when scraping ends.

    Args:
        total_pages: Number of search-result pages to visit, starting at
            page 1.

    Returns:
        A dict mapping each item's <h1> title to its field dictionary, as
        produced by ``extract_item_info``. Because the title is the key, two
        items with the same title collapse into one entry (the later one
        wins); a warning is logged when that happens.
    """
    all_items = {}

    # Using the session as a context manager releases its pooled connections
    # even if scraping is interrupted part-way through.
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": random.choice(USER_AGENTS)
        })

        # Check if scraping is allowed (currently disabled)
        # if not can_scrape(BASE_URL, session.headers["User-Agent"]):
        #     logging.warning("Scraping is not allowed by robots.txt. Exiting.")
        #     return all_items

        for page in range(1, total_pages + 1):
            url = construct_url(page)
            logging.info(f"Scraping Page {page}: {url}")
            try:
                response = session.get(url, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logging.warning(f"Request exception for page {page}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            item_links = get_item_links(soup)
            logging.info(f"Found {len(item_links)} items on Page {page}")

            for idx, item_link in enumerate(item_links, 1):
                logging.info(f"  Scraping Item {idx}: {item_link}")
                item_info = extract_item_info(session, item_link)
                if item_info:
                    for title in item_info:
                        if title in all_items:
                            # dict.update() below replaces the earlier entry.
                            logging.warning(f"    Duplicate title {title!r}; replacing the earlier entry")
                    all_items.update(item_info)
                else:
                    logging.warning(f"    Failed to extract info for {item_link}")

                # Pause 1-3 seconds between item requests to avoid hammering the site.
                sleep_time = random.uniform(1, 3)
                logging.info(f"    Sleeping for {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)

            logging.info(f"Completed scraping Page {page}\n")

    logging.info("Scraping completed.")
    return all_items

# Execute the scraping
if __name__ == "__main__":
    # Scrape the first three result pages.
    scraped_data = scrape_smithsonian_collection(total_pages=3)
    # The result is keyed by item title, so this counts distinct titles.
    logging.info(f"Scraping Completed. Total items scraped: {len(scraped_data)}")
    # Optionally, save the data to a file
    # Mode 'w' replaces any existing smithsonian_collection.json.
    with open('smithsonian_collection.json', 'w') as f:
        json.dump(scraped_data, f, indent=4)

Scraping Page 1: https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22
Found 54 items on Page 1
  Scraping Item 1: /object/archives/components/sova-acma-06-016-ref18
Retrieved https://www.si.edu/object/archives/components/sova-acma-06-016-ref18
    Image URL extracted: https://www.si.edu/object/viewer/ebl-1572528636032-1572528636124-0
    Sleeping for 1.44 seconds...
  Scraping Item 2: /object/archives/components/sova-eepa-1985-014-ref1532
Retrieved https://www.si.edu/object/archives/components/sova-eepa-1985-014-ref1532
    Image URL extracted: https://www.si.edu/object/viewer/ebl-1536871081657-1536871087408-0
    Sleeping for 1.85 seconds...
  Scraping Item 3: /object/archives/components/sova-eepa-1985-014-ref1254
Retrieved https://www.si.edu/object/archives/components/sova-eepa-1985-014-ref1254
    Image URL extracted: https://www.si.edu/object/viewer/ebl-1536871081657-1536871087365-3
    Sleeping for 2.16 seconds...
  Scraping Item 4: /object/arch