In [2]:
!pip install transformers torch requests bs4 pandas

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.3 bs4-0.0.2 soupsieve-2.6


In [3]:


import requests
from bs4 import BeautifulSoup
import time
import json
import random
from urllib.parse import urlparse
import logging
import pandas as pd
from transformers import pipeline
from PIL import Image
import numpy as np
import os
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')

# Base URL components
# BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"
BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B%5D=media_usage:%22CC0%22&edan_fq%5B%5D=topic:%22Costume%22+OR+topic%3A%22Dress+accessories%22+OR+topic%3A%22Hats%22+OR+topic%3A%22Headgear%22+OR+topic%3A%22Neckties%22+OR+topic%3A%22Neckwear%22"


USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)",
    # Add more user agents as needed
]

# Function to construct URL with page number
def construct_url(page_number):
    if page_number > 1:
        # Insert page parameter after collection-images? for subsequent pages
        page_param = f"page={page_number - 1}&"
        url_parts = BASE_URL.split("?")
        return f"{url_parts[0]}?{page_param}{url_parts[1]}"
    return BASE_URL

# Function to extract item links from a page
def get_item_links(soup):
    item_links = []
    for li in soup.find_all("li", attrs={"ogmt-id": True}):
        a_tag = li.find("a", class_="inner")
        if a_tag and 'href' in a_tag.attrs:
            item_links.append(a_tag['href'])
    return item_links

from urllib.parse import urlparse, parse_qs, urlencode

def extract_item_info(session, item_url, item_number=None):
    full_url = f"https://www.si.edu{item_url}"
    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request exception for item {item_number}: {full_url}: {e}")
        return None

    content = response.content
    logging.info(f"Retrieving item {item_number}: {full_url}")

    soup = BeautifulSoup(content, 'html.parser')

    # Extract the h1 tag
    h1_tag = soup.find("h1")
    if not h1_tag:
        logging.warning(f"No h1 tag found in {full_url}")
        return None
    h1_text = h1_tag.get_text(strip=True)

    # Initialize the item dictionary
    item_dict = {'Title': h1_text}

    # Extract the "Museum" information from the next <a> tag after <h1>
    next_a_tag = h1_tag.find_next("a")
    if next_a_tag and next_a_tag.get_text(strip=True):
        museum_name = next_a_tag.get_text(strip=True)
        item_dict["Museum"] = museum_name
    else:
        logging.warning(f"No Museum information found in {full_url}")

    # Iterate through all <dl> tags directly
    for dl in soup.find_all("dl"):
        current_key = None
        values = []
        for child in dl.children:
            if child.name == "dt":
                if current_key and values:
                    # Assign the collected values to the previous key
                    item_dict[current_key] = values
                # Start a new key
                current_key = child.get_text(strip=True)
                values = []
            elif child.name == "dd":
                dd_text = child.get_text(strip=True)
                values.append(dd_text)
        # After the loop, assign the last collected values
        if current_key and values:
            item_dict[current_key] = values

    # Try to extract the 'Screen Image' link
    screen_image_link = soup.find('a', text='Screen Image')
    if screen_image_link and screen_image_link.has_attr('href'):
        image_url = screen_image_link['href']
        if not image_url.startswith('http'):
            image_url = f"https://ids.si.edu{image_url}"
        item_dict["Image_URL"] = image_url
        logging.info(f"    Screen Image URL extracted: {image_url}")
    else:
        # Fallback method: extract from <img id='edan-image'> tag
        img_tag = soup.find('img', id='edan-image')
        if img_tag and img_tag.has_attr('src'):
            img_src = img_tag['src']
            # Parse the URL
            parsed_url = urlparse(img_src)
            query_params = parse_qs(parsed_url.query)
            # Append '.jpg' to the 'id' parameter
            if 'id' in query_params:
                id_value = query_params['id'][0]
                if not id_value.endswith('.jpg'):
                    id_value += '.jpg'
                query_params['id'] = [id_value]
                # Reconstruct the query string
                new_query = urlencode(query_params, doseq=True)
                # Reconstruct the full URL
                image_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{new_query}"
                item_dict["Image_URL"] = image_url
                logging.info(f"    Image URL extracted from img tag: {image_url}")
            else:
                logging.warning(f"    No 'id' parameter in img src for {full_url}")
        else:
            logging.warning(f"    No valid Image URL found in {full_url}")

    return item_dict

def scrape_smithsonian_collection(
    total_pages=3,
    checkpoint_dir='checkpoints',
    output_dir='archive/cropped_images',
    resume=False
):
    # Ensure necessary directories exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize list to collect all objects
    all_objects = []
    
    # Handle checkpointing for resuming
    if resume:
        checkpoints = sorted([
            f for f in os.listdir(checkpoint_dir) 
            if f.endswith('.pkl') and f.startswith('checkpoint_')
        ])
        if checkpoints:
            last_checkpoint = checkpoints[-1]
            start_page = int(last_checkpoint.split('_')[1].split('.')[0]) + 1
            # Load last checkpoint data
            all_objects = pd.read_pickle(os.path.join(checkpoint_dir, last_checkpoint)).to_dict('records')
            logging.info(f"Resuming from page {start_page} with {len(all_objects)} objects loaded from checkpoint.")
        else:
            start_page = 1
            logging.warning(f"No checkpoints found in '{checkpoint_dir}'. Starting from page {start_page}.")
    else:
        start_page = 1
        logging.info("Starting scraping from page 1 without resuming.")
    
    # Initialize the requests session
    session = requests.Session()
    session.headers.update({"User-Agent": random.choice(USER_AGENTS)})
    
    # Initialize total_items to keep track of the cumulative item index
    total_items = len(all_objects) if resume else 0
    
    # Iterate through each page
    for page in range(start_page, total_pages + 1):
        logging.info(f"Scraping Page {page}")
        
        try:
            # Construct and send the GET request
            url = construct_url(page)
            response = session.get(url, timeout=10)
            response.raise_for_status()
            
            # Parse the page content
            soup = BeautifulSoup(response.content, 'html.parser')
            item_links = get_item_links(soup)
            
            page_items = []
            for idx, item_link in enumerate(item_links, 1):
                item_info = extract_item_info(session, item_link, item_number=idx)
                if item_info:
                    page_items.append(item_info)
                time.sleep(random.uniform(1, 3))  # Respectful scraping
            
            # Convert the items to a DataFrame
            page_df = pd.DataFrame(page_items)
            
            # Set unique index for page_df
            page_df.index = range(total_items, total_items + len(page_df))
            
            # Process images and extract objects
            objects_df = process_images(page_df, output_dir)
            all_objects.extend(objects_df.to_dict('records'))
            
            # Update the total_items counter
            total_items += len(page_df)
            
            # Save checkpoint after processing the page
            checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_{page}.pkl')
            pd.DataFrame(all_objects).to_pickle(checkpoint_path)
            logging.info(f"Saved checkpoint for page {page}")
        
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 404:
                logging.warning(f"Page {page} does not exist (404). Stopping scraper.")
                break  # Exit the loop if the page doesn't exist
            else:
                logging.error(f"HTTP error occurred on page {page}: {http_err}")
                continue  # Skip to the next page
        except Exception as e:
            logging.error(f"Unexpected error on page {page}: {e}")
            continue  # Skip to the next page
    
    # Create the final DataFrame from all collected objects
    final_df = pd.DataFrame(all_objects)
    
    # Save the final DataFrame
    final_df.to_csv('final_data.csv', index=False)
    final_df.to_pickle('final_data.pkl')
    
    logging.info(f"Scraping Completed. Total objects scraped: {len(final_df)}")
    
    return final_df

from transformers import pipeline

token_classifier = pipeline(
    "ner", model="dslim/bert-base-NER", aggregation_strategy="simple"
)

def extract_entities(df, entity_score_threshold=0.8):
    """
    Extracts named entities from the DataFrame's text columns and adds them as new columns.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing scraped data.
    - entity_score_threshold (float): The minimum confidence score for entities to be considered.

    Returns:
    - df (pandas.DataFrame): The updated DataFrame with new entity columns.
    """
    import pandas as pd
    
    # Ensure the DataFrame index is unique
    if not df.index.is_unique:
        df = df.reset_index(drop=True)
    
    # Initialize a dictionary to hold the new columns
    new_columns = {}
    
    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        # Dictionary to hold entities for this row
        row_entities = {}
        
        # Iterate over each column in the row
        for column in df.columns:
            # Skip columns that should not be processed for NER
            if column in ['Title', 'Museum', 'Image_URL', 'cropped_objects']:
                continue
            
            value = row[column]
            
            # Initialize list to hold text snippets to process
            texts_to_process = []
            
            # Determine how to handle the value based on its type
            if isinstance(value, list):
                # If it's a list, check if all elements are null
                if all(pd.isnull(v) for v in value):
                    continue  # Skip if all elements are null
                # Filter out non-string elements
                texts_to_process = [str(v) for v in value if isinstance(v, str)]
            elif isinstance(value, str):
                if pd.isnull(value):
                    continue  # Skip if the string is null
                texts_to_process = [value]
            else:
                # For any other type (e.g., NaN, None), skip processing
                continue
            
            # If there are no texts to process, skip to the next column
            if not texts_to_process:
                continue
            
            # Process each text snippet
            for text in texts_to_process:
                # Run the NER model on the text
                try:
                    results = token_classifier(text)
                except Exception as e:
                    logging.warning(f"NER model failed on text '{text}': {e}")
                    continue  # Skip this text snippet if NER fails
                
                # Filter entities based on the score threshold
                filtered_results = [entity for entity in results if entity['score'] >= entity_score_threshold]
                
                # Iterate over filtered entities and organize them by entity group
                for entity in filtered_results:
                    entity_group = entity['entity_group']
                    word = entity['word'].replace('\n', ' ').strip()  # Clean up the word
                    
                    # Create a new column name based on the original column and entity group
                    col_name = f"{column}.{entity_group}"
                    
                    # Initialize the list for this entity group if not already present
                    if col_name not in row_entities:
                        row_entities[col_name] = []
                    
                    # Append the extracted word to the list
                    row_entities[col_name].append(word)
        
        # Store the entities for this row
        for col_name, words in row_entities.items():
            if col_name not in new_columns:
                new_columns[col_name] = [None] * len(df)
            new_columns[col_name][idx] = words
    
    # After processing all rows, add the new columns to the DataFrame
    for col_name, column_data in new_columns.items():
        df[col_name] = column_data
    
    return df

# Load the processor and model for object detection
processor = AutoImageProcessor.from_pretrained("valentinafeve/yolos-fashionpedia")
model = AutoModelForObjectDetection.from_pretrained("valentinafeve/yolos-fashionpedia")

# Get the label mapping
id2label = model.config.id2label  # mapping from label IDs to label names

def analyze_and_crop_image(image_path, image_index, output_dir, confidence_threshold=0.75):
    """
    Analyzes the image, crops objects, and returns object information.

    Args:
        image_path (str): Path to the downloaded image.
        image_index (int): Index of the image.
        output_dir (str): Directory to save cropped images.
        confidence_threshold (float): Confidence threshold for detections.

    Returns:
        dict: Dictionary of objects with their attributes.
    """
    objects = {}
    try:
        # Load the image
        image = Image.open(image_path).convert("RGB")
        
        # Preprocess the image
        inputs = processor(images=image, return_tensors="pt")
        
        # Perform inference
        outputs = model(**inputs)
        
        # Convert outputs to COCO API
        target_sizes = torch.tensor([image.size[::-1]])
        results = processor.post_process_object_detection(outputs, threshold=confidence_threshold, target_sizes=target_sizes)[0]
        
        # Iterate over detections
        for idx, (score, label, box) in enumerate(zip(results["scores"], results["labels"], results["boxes"])):
            # Convert label to category name
            category_id = label.item()
            category_name = id2label[category_id]
            
            # Convert box coordinates to integers
            xmin, ymin, xmax, ymax = map(int, box.tolist())
            bbox = [xmin, ymin, xmax, ymax]
            area = (xmax - xmin) * (ymax - ymin)
            
            # Create a unique identifier for the object
            obj_index = f"{category_name}_{idx}"
            
            # Crop the image
            cropped_img = image.crop((xmin, ymin, xmax, ymax))
            cropped_image_path = os.path.join(output_dir, f"image_{image_index}_{obj_index}.jpg")
            cropped_img.save(cropped_image_path)
            
            # Collect object info
            obj = {
                'Name': category_name,
                'Bounding Box': bbox,
                'Area': area,
                'Confidence': score.item(),
                'cropped_image_path': cropped_image_path
            }
            objects[obj_index] = obj
            
    except Exception as e:
        logging.error(f"Error processing image {image_path}: {e}")
        
    return objects

def process_images(df, output_dir):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    object_rows = []
    for idx, row in df.iterrows():
        image_url = row['Image_URL']
        image_name = f"image_{idx}.jpg"
        image_path = os.path.join(output_dir, image_name)
        try:
            response = requests.get(image_url, stream=True)
            response.raise_for_status()
            with open(image_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out_file.write(chunk)
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to download image {image_url}: {e}")
            continue
        
        # Process the image
        try:
            objects_data = analyze_and_crop_image(image_path, idx, output_dir, confidence_threshold=0.9)
            for object_id, object_info in objects_data.items():
                # Add image index or any other relevant info
                object_info['Image_Index'] = idx
                # Include the object ID
                object_info['Object_ID'] = object_id
                
                # **Integrate Item-Level Data:**
                # Add all columns from the item (row) to the object_info
                for key, value in row.items():
                    object_info[key] = value
                
                object_rows.append(object_info)
        except Exception as e:
            logging.error(f"Failed to process image {image_path}: {e}")
            continue
    
    # Create a DataFrame where each row is an object with integrated item-level data
    objects_df = pd.DataFrame(object_rows)
    # Set 'Object_ID' as the index if desired
    objects_df.set_index('Object_ID', inplace=True)
    return objects_df


if __name__ == "__main__":
    import logging
    
    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    # Define scraping parameters
    TOTAL_PAGES = 181  # Adjust as needed
    CHECKPOINT_DIR = 'checkpoints'
    OUTPUT_DIR = 'archive/cropped_images'
    RESUME = True  # Set to True to resume from last checkpoint
    
    # Scrape the data with checkpointing
    scraped_df = scrape_smithsonian_collection(
        total_pages=TOTAL_PAGES,
        checkpoint_dir=CHECKPOINT_DIR,
        output_dir=OUTPUT_DIR,
        resume=RESUME
    )
    
    print(f"Scraping Completed. Total items scraped: {len(scraped_df)}")
    
    # **Extract Named Entities**
    enriched_df = extract_entities(scraped_df, entity_score_threshold=0.8)
    
    # **Save Final Results**
    enriched_df.to_csv('final_data.csv', index=False)
    enriched_df.to_pickle('final_data.pkl')
    
    print(f"Final data saved with NER information. Total records: {len(enriched_df)}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
Resuming from page 10 w

Scraping Completed. Total items scraped: 8646


  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name] = column_data
  df[col_name]

Final data saved with NER information. Total records: 8646
