# Prompt

Below I have three --CODE BLOCKS--:
1. SCRAPE: pulls information about a number of items and stores it in a json file. 
2. FASHION-CROP: looks at an image, uses object recognition to find fashion components of the image, extracts information about the components, and extracts new images that are cropped components of the original image. 
3. WORD-CLASSIFIER: uses a model to classify key words into categories like LOC (location), PER (person), ORG (organization), etc. 

------------------
---------SCRAPE:---------
import requests
from bs4 import BeautifulSoup
import time
import json
import random
from urllib.parse import urlparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')

# Base URL components
BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)",
    # Add more user agents as needed
]

# Function to check robots.txt
def can_scrape(url, user_agent='*'):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    try:
        response = requests.get(robots_url)
        if response.status_code == 200:
            from urllib.robotparser import RobotFileParser
            rp = RobotFileParser()
            rp.parse(response.text.splitlines())
            return rp.can_fetch(user_agent, url)
    except requests.RequestException:
        pass
    return False  # If unable to fetch robots.txt, proceed with caution

# Function to construct URL with page number
def construct_url(page_number):
    if page_number > 1:
        # Insert page parameter after collection-images? for subsequent pages
        page_param = f"page={page_number - 1}&"
        url_parts = BASE_URL.split("?")
        return f"{url_parts[0]}?{page_param}{url_parts[1]}"
    return BASE_URL

# Function to extract item links from a page
def get_item_links(soup):
    item_links = []
    for li in soup.find_all("li", attrs={"ogmt-id": True}):
        a_tag = li.find("a", class_="inner")
        if a_tag and 'href' in a_tag.attrs:
            item_links.append(a_tag['href'])
    return item_links

# Function to extract information from an item's component_information page
def extract_item_info(session, item_url):
    full_url = f"https://www.si.edu{item_url}"
    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request exception for {full_url}: {e}")
        return None

    content = response.content
    logging.info(f"Retrieved {full_url}")
    soup = BeautifulSoup(content, 'html.parser')

    # Extract the h1 tag
    h1_tag = soup.find("h1")
    if not h1_tag:
        logging.warning(f"No h1 tag found in {full_url}")
        return None
    h1_text = h1_tag.get_text(strip=True)

    # Initialize the nested dictionary
    item_dict = {h1_text: {}}

    # Extract the "Museum" information from the next <a> tag after <h1>
    next_a_tag = h1_tag.find_next("a")
    if next_a_tag and next_a_tag.get_text(strip=True):
        museum_name = next_a_tag.get_text(strip=True)
        item_dict[h1_text]["Museum"] = museum_name
    else:
        logging.warning(f"No Museum information found in {full_url}")

    # Iterate through all <dl> tags directly
    for dl in soup.find_all("dl"):
        current_key = None
        values = []
        for child in dl.children:
            if child.name == "dt":
                if current_key and values:
                    # Assign the collected values to the previous key
                    item_dict[h1_text].setdefault(current_key, []).extend(values)
                # Start a new key
                current_key = child.get_text(strip=True)
                values = []
            elif child.name == "dd":
                dd_text = child.get_text(strip=True)
                values.append(dd_text)
        # After the loop, assign the last collected values
        if current_key and values:
            item_dict[h1_text].setdefault(current_key, []).extend(values)

    # Extracting the Image URL
    media_inner_span = soup.find("span", class_="media-inner")
    if media_inner_span:
        a_tag = media_inner_span.find("a", class_="modal-trigger image")
        if a_tag and a_tag.has_attr('data-source'):
            image_url = a_tag['data-source']
            item_dict[h1_text]["Image_URL"] = image_url
            logging.info(f"    Image URL extracted: {image_url}")
        else:
            logging.warning(f"    No image URL found in {full_url}")
    else:
        logging.warning(f"    No media-inner span found in {full_url}")

    return item_dict

# Main scraping function
def scrape_smithsonian_collection(total_pages=3):
    all_items = {}
    total_scraped = 0

    # Initialize a session
    session = requests.Session()
    session.headers.update({
        "User-Agent": random.choice(USER_AGENTS)
    })

    # Check if scraping is allowed
    # if not can_scrape(BASE_URL, session.headers["User-Agent"]):
    #     logging.warning("Scraping is not allowed by robots.txt. Exiting.")
    #     return all_items

    for page in range(1, total_pages + 1):
        url = construct_url(page)
        logging.info(f"Scraping Page {page}: {url}")
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.warning(f"Request exception for page {page}: {e}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        item_links = get_item_links(soup)
        logging.info(f"Found {len(item_links)} items on Page {page}")

        for idx, item_link in enumerate(item_links, 1):
            logging.info(f"  Scraping Item {idx}: {item_link}")
            item_info = extract_item_info(session, item_link)
            if item_info:
                all_items.update(item_info)
                total_scraped += 1
            else:
                logging.warning(f"    Failed to extract info for {item_link}")

            # Add a random pause between 1 to 3 seconds
            sleep_time = random.uniform(1, 3)
            logging.info(f"    Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)

        logging.info(f"Completed scraping Page {page}\n")

    logging.info("Scraping completed.")
    return all_items

# Execute the scraping
if __name__ == "__main__":
    scraped_data = scrape_smithsonian_collection(total_pages=3)
    logging.info(f"Scraping Completed. Total items scraped: {len(scraped_data)}")
    # Optionally, save the data to a file
    with open('smithsonian_collection.json', 'w') as f:
        json.dump(scraped_data, f, indent=4)

-----------WORD-CLASSIFIER----------
from transformers import pipeline

with open('/Users/joshstrupp/Documents/Working/Educational/MSDV/ms1-final/notebook_explore/notebooks/smithsonian_collection.json', 'r') as file:
    text = file.read()

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

# Get results and filter by score
results = token_classifier(text)
filtered_results = [entity for entity in results if entity['score'] >= 0.8]

# Display filtered results
filtered_results


-----------------FASHION-CROP-------------

from datasets import load_dataset

ds = load_dataset("detection-datasets/fashionpedia")
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Load the processor and model
processor = AutoImageProcessor.from_pretrained("valentinafeve/yolos-fashionpedia")
model = AutoModelForObjectDetection.from_pretrained("valentinafeve/yolos-fashionpedia")
from PIL import Image
import numpy as np

# Replace 'Chippewa.png' with the actual path to your image
image_path = 'test2.png'

# Load the image and ensure it's in RGB format
image = Image.open(image_path).convert('RGB')

# Convert the PIL Image to a NumPy array
image_array = np.array(image)

# Debugging: Print the shape of the image array
print('Image shape:', image_array.shape)  # Should be (height, width, 3)

# Preprocess the image
inputs = processor(images=image_array, return_tensors="pt")

# Run inference
outputs = model(**inputs)

# Post-process the outputs to get object detection results
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)
result = results[0]  # Since we have only one image

import json

# Load category mappings
with open('/Users/joshstrupp/Documents/Working/Educational/MSDV/ms1-final/fashionpedia-api/data/demo/category_attributes_descriptions.json', 'r') as f:
    category_data = json.load(f)

# Create mappings from category IDs to names and supercategories
categories = category_data['categories']
category_id_to_name = {category['id']: category['name'] for category in categories}
category_id_to_supercategory = {category['id']: category['supercategory'] for category in categories}

# Create a mapping from label IDs to category names (from the model)
label_mappings = model.config.id2label  # This maps label IDs to category names

# Create a mapping from category names to supercategories
name_to_supercategory = {category['name']: category['supercategory'] for category in categories}
# Prepare the objects data
objects = {
    'bbox_id': [],
    'category': [],
    'bbox': [],
    'area': [],
    'supercategory': [],
    'name': []
}

for idx in range(len(result['scores'])):
    score = result['scores'][idx].item()
    label_id = result['labels'][idx].item()
    box = result['boxes'][idx].tolist()  # [xmin, ymin, xmax, ymax]

    # Compute area
    x_min, y_min, x_max, y_max = box
    area = (x_max - x_min) * (y_max - y_min)

    # Get category name from label ID
    name = label_mappings.get(label_id, 'Unknown')
    supercategory = name_to_supercategory.get(name, 'Unknown')

    # Map category name back to category ID from the dataset if needed
    category_id = next((id for id, n in category_id_to_name.items() if n == name), label_id)

    # Append to objects
    objects['bbox_id'].append(idx)
    objects['category'].append(category_id)
    objects['bbox'].append(box)
    objects['area'].append(area)
    objects['name'].append(name)
    objects['supercategory'].append(supercategory)

# Get image dimensions
width, height = image.size

# Compile the final output
output = {
    'image_id': 0,  # Assign an ID to your image
    'image': image,
    'width': width,
    'height': height,
    'objects': objects
}

# Print the analysis
print("Image ID:", output['image_id'])
print("Width:", output['width'])
print("Height:", output['height'])
print("Objects Detected:")
for i in range(len(objects['bbox_id'])):
    print(f"  Object {i+1}:")
    print(f"    Bounding Box ID: {objects['bbox_id'][i]}")
    print(f"    Category ID: {objects['category'][i]}")
    print(f"    Name: {objects['name'][i]}")
    print(f"    Supercategory: {objects['supercategory'][i]}")
    print(f"    Bounding Box: {objects['bbox'][i]}")
    print(f"    Area: {objects['area'][i]}")

def analyze_image(image_path, category_json_path='category_attributes_descriptions.json'):
    """
    Analyzes an image using the Fashionpedia model and returns a DataFrame with the outputs.

    Parameters:
    - image_path (str): The path to the image file.
    - category_json_path (str): The path to the category attributes JSON file.

    Returns:
    - df (pandas.DataFrame): A DataFrame containing the analysis results.
    """
    from PIL import Image
    from transformers import AutoImageProcessor, AutoModelForObjectDetection
    import pandas as pd
    import torch
    import json
    import numpy as np

    # 1. Load your image
    image = Image.open(image_path)

    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Convert the image to a NumPy array
    image_array = np.array(image)

    # 2. Load the processor and model
    processor = AutoImageProcessor.from_pretrained("valentinafeve/yolos-fashionpedia")
    model = AutoModelForObjectDetection.from_pretrained("valentinafeve/yolos-fashionpedia")

    # 3. Prepare the image
    inputs = processor(images=image_array, return_tensors="pt")

    # 4. Run inference
    outputs = model(**inputs)

    # 5. Process the outputs
    results = processor.post_process_object_detection(
        outputs, threshold=0.8, target_sizes=[image.size[::-1]]
    )
    result = results[0]

    # 6. Map category IDs to names and supercategories
    with open(category_json_path, 'r') as f:
        category_data = json.load(f)
    categories = category_data['categories']
    category_id_to_name = {category['id']: category['name'] for category in categories}
    category_id_to_supercategory = {category['id']: category['supercategory'] for category in categories}

    label_mappings = model.config.id2label
    name_to_supercategory = {category['name']: category['supercategory'] for category in categories}

    # 7. Prepare data for DataFrame
    data = []
    for idx in range(len(result['scores'])):
        score = result['scores'][idx].item()
        label_id = result['labels'][idx].item()
        box = result['boxes'][idx].tolist()
        x_min, y_min, x_max, y_max = box
        area = (x_max - x_min) * (y_max - y_min)

        name = label_mappings.get(label_id, 'Unknown')
        supercategory = name_to_supercategory.get(name, 'Unknown')
        category_id = next((id for id, n in category_id_to_name.items() if n == name), label_id)

        data.append({
            'bbox_id': idx,
            'category_id': category_id,
            'name': name,
            'supercategory': supercategory,
            'bbox': box,
            'area': area,
            'score': score
        })

    # Create DataFrame
    df = pd.DataFrame(data)

    return df

# Replace 'path/to/your/image.jpg' with the actual path to your image
image_path = 'test2.png'

# Optionally, specify the path to your category attributes JSON file
category_json_path = '../category_attributes_descriptions.json'

# Call the function
df = analyze_image(image_path, category_json_path)


import os

def save_cropped_images(image_path, df, output_dir='cropped_images'):
    """
    Crops the original image according to the bounding boxes and saves the cropped images.

    Parameters:
    - image_path (str): Path to the original image.
    - df (pandas.DataFrame): DataFrame containing the detection results.
    - output_dir (str): Directory where the cropped images will be saved.
    """
    from PIL import Image
    import os

    # Load the image
    image = Image.open(image_path)

    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Get image dimensions
    width, height = image.size

    # Loop over each detection
    for idx, row in df.iterrows():
        bbox = row['bbox']
        name = row['name']

        # Extract bounding box coordinates
        x_min, y_min, x_max, y_max = map(int, bbox)

        # Clip coordinates to image bounds
        x_min = max(0, min(width, x_min))
        y_min = max(0, min(height, y_min))
        x_max = max(0, min(width, x_max))
        y_max = max(0, min(height, y_max))

        # Check for valid crop
        if x_max > x_min and y_max > y_min:
            # Crop the image
            cropped_image = image.crop((x_min, y_min, x_max, y_max))

            # Ensure cropped image is in RGB mode
            if cropped_image.mode != 'RGB':
                cropped_image = cropped_image.convert('RGB')

            # Create a unique filename
            image_name = os.path.splitext(os.path.basename(image_path))[0]
            filename = f"{image_name}_{name.replace(' ', '_')}_{idx}.jpg"
            output_path = os.path.join(output_dir, filename)

            try:
                # Save the cropped image
                cropped_image.save(output_path, format='JPEG')
                print(f"Saved cropped image: {output_path}")
            except Exception as e:
                print(f"Failed to save {output_path}: {e}")
        else:
            print(f"Skipping invalid crop for detection {idx}")

# Assume df is the DataFrame obtained from analyze_image
image_path = 'Angola Costume.png'  # Replace with your image path

# Check if output directory exists, if not create it
if not os.path.exists('cropped_images'):
    os.makedirs('cropped_images')

# Call the function to save cropped images, ensuring unique filenames
existing_files = set(os.listdir('cropped_images'))
counter = 1
while any(f"{os.path.splitext(os.path.basename(image_path))[0]}_{counter}" in f for f in existing_files):
    counter += 1
save_cropped_images(image_path, df, output_dir='cropped_images')

---------------------

The objective is to combine this into one python script that

1. Completes the SCRAPE and saves all data to a dataframe where h1_text is in first column "title", and remaining columns are made up of dt values, while dd (and dd lists) are cell contents. E.g.:

Columns: Title, Museum, Names, Collection Photographer, ...
Vals: Lillian Evanti wears costume from Lucia di Lammermour, Anacostia Community Museum, ["Evanti, Lillian, Mme. (Lillian Evans Tibbs), 1890-1967"], ["Apeda Studio (New York, N.Y.)", "Camuzzi, M.", "Harris & Ewing"]... 

2. Use the WORD-CLASSIFIER to look at each string in the df and extract okenized words that have filtered_results, i.e. have an assigned PER, ORG, LOC, ets. (Currently using local json: with open('/Users/joshstrupp/Documents/Working/Educational/MSDV/ms1-final/notebook_explore/notebooks/smithsonian_collection.json', 'r') as file:
    text = file.read() — want to replace the json with the df information from step 1) Place in a new column as a comma separated list that is named "<dt>.[entity_group]>". E.g. for the below, there would be a new column that is produced called "Collection Photographer.LOC" and it might include "Apeda Studio, New York". Then "Collection Photographer.PER" that includes [Camuzzi,M., Harris & Ewing], etc: 
"Lillian Evanti wears costume from Lucia di Lammermour": {
        ...
        "Collection Photographer": [
            "Apeda Studio (New York, N.Y.)",
            "Camuzzi, M.",
            "Harris & Ewing"
        ],

3. As we scrape and retrieve the image_url, pass the image from image_url through the FASHION-CROP. Here, we'll a new column called "cropped_objects" that contains a dictionary of dictionaries using Name, Supercategory, Bounding Box, and Area. E.g. {Object 1: {Name: neckline}, {Supercategory: garment parts}...},{Object 2: {Name: something}, {Supercategory: something else}...}

4. Also in FASHION-CROP we will need to extract the cropped_images. These will be saved to a folder using the current image path structure, then that path will be added to the dataframe's "cropped_objects" dict at the end for each object identified, e.g. {...{cropped_image_path:/whatever/the/path/to/download/is.jpg}}


I imagine this one will take you a while. Take your time. Place "prints" to show progress as the function runs. And thank you!

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import json
import random
from urllib.parse import urlparse
import logging
import pandas as pd
from transformers import pipeline
from PIL import Image
import numpy as np
import os
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')

# Base URL components
BASE_URL = "https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22"

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)",
    # Add more user agents as needed
]

# Function to construct URL with page number
def construct_url(page_number):
    if page_number > 1:
        # Insert page parameter after collection-images? for subsequent pages
        page_param = f"page={page_number - 1}&"
        url_parts = BASE_URL.split("?")
        return f"{url_parts[0]}?{page_param}{url_parts[1]}"
    return BASE_URL

# Function to extract item links from a page
def get_item_links(soup):
    item_links = []
    for li in soup.find_all("li", attrs={"ogmt-id": True}):
        a_tag = li.find("a", class_="inner")
        if a_tag and 'href' in a_tag.attrs:
            item_links.append(a_tag['href'])
    return item_links

from urllib.parse import urlparse, parse_qs, urlencode

def extract_item_info(session, item_url):
    full_url = f"https://www.si.edu{item_url}"
    try:
        response = session.get(full_url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.warning(f"Request exception for {full_url}: {e}")
        return None

    content = response.content
    logging.info(f"Retrieved {full_url}")
    soup = BeautifulSoup(content, 'html.parser')

    # Extract the h1 tag
    h1_tag = soup.find("h1")
    if not h1_tag:
        logging.warning(f"No h1 tag found in {full_url}")
        return None
    h1_text = h1_tag.get_text(strip=True)

    # Initialize the item dictionary
    item_dict = {'Title': h1_text}

    # Extract the "Museum" information from the next <a> tag after <h1>
    next_a_tag = h1_tag.find_next("a")
    if next_a_tag and next_a_tag.get_text(strip=True):
        museum_name = next_a_tag.get_text(strip=True)
        item_dict["Museum"] = museum_name
    else:
        logging.warning(f"No Museum information found in {full_url}")

    # Iterate through all <dl> tags directly
    for dl in soup.find_all("dl"):
        current_key = None
        values = []
        for child in dl.children:
            if child.name == "dt":
                if current_key and values:
                    # Assign the collected values to the previous key
                    item_dict[current_key] = values
                # Start a new key
                current_key = child.get_text(strip=True)
                values = []
            elif child.name == "dd":
                dd_text = child.get_text(strip=True)
                values.append(dd_text)
        # After the loop, assign the last collected values
        if current_key and values:
            item_dict[current_key] = values

    # Try to extract the 'Screen Image' link
    screen_image_link = soup.find('a', text='Screen Image')
    if screen_image_link and screen_image_link.has_attr('href'):
        image_url = screen_image_link['href']
        if not image_url.startswith('http'):
            image_url = f"https://ids.si.edu{image_url}"
        item_dict["Image_URL"] = image_url
        logging.info(f"    Screen Image URL extracted: {image_url}")
    else:
        # Fallback method: extract from <img id='edan-image'> tag
        img_tag = soup.find('img', id='edan-image')
        if img_tag and img_tag.has_attr('src'):
            img_src = img_tag['src']
            # Parse the URL
            parsed_url = urlparse(img_src)
            query_params = parse_qs(parsed_url.query)
            # Append '.jpg' to the 'id' parameter
            if 'id' in query_params:
                id_value = query_params['id'][0]
                if not id_value.endswith('.jpg'):
                    id_value += '.jpg'
                query_params['id'] = [id_value]
                # Reconstruct the query string
                new_query = urlencode(query_params, doseq=True)
                # Reconstruct the full URL
                image_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{new_query}"
                item_dict["Image_URL"] = image_url
                logging.info(f"    Image URL extracted from img tag: {image_url}")
            else:
                logging.warning(f"    No 'id' parameter in img src for {full_url}")
        else:
            logging.warning(f"    No valid Image URL found in {full_url}")

    return item_dict

def scrape_smithsonian_collection(total_pages=3):
    all_items = []
    total_scraped = 0

    # Initialize a session
    session = requests.Session()
    session.headers.update({
        "User-Agent": random.choice(USER_AGENTS)
    })

    for page in range(1, total_pages + 1):
        url = construct_url(page)
        logging.info(f"Scraping Page {page}: {url}")
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.warning(f"Request exception for page {page}: {e}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        item_links = get_item_links(soup)
        logging.info(f"Found {len(item_links)} items on Page {page}")

        for idx, item_link in enumerate(item_links, 1):
            logging.info(f"  Scraping Item {idx}: {item_link}")
            item_info = extract_item_info(session, item_link)
            if item_info:
                all_items.append(item_info)
                total_scraped += 1
            else:
                logging.warning(f"    Failed to extract info for {item_link}")

            # Add a random pause between 1 to 3 seconds
            sleep_time = random.uniform(1, 3)
            logging.info(f"    Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)

        logging.info(f"Completed scraping Page {page}\n")

    logging.info("Scraping completed.")
    # Convert list of dicts to DataFrame
    df = pd.DataFrame(all_items)
    return df

# Load the NER pipeline
from transformers import pipeline

token_classifier = pipeline(
    "ner", model="dslim/bert-base-NER", aggregation_strategy="simple"
)

def extract_entities(df, entity_score_threshold=0.8):
    """
    Extracts named entities from the DataFrame's text columns and adds them as new columns.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing scraped data.
    - entity_score_threshold (float): The minimum confidence score for entities to be considered.

    Returns:
    - df (pandas.DataFrame): The updated DataFrame with new entity columns.
    """
    import pandas as pd
    
    # Ensure the DataFrame index is unique
    if not df.index.is_unique:
        df = df.reset_index(drop=True)
    
    # Initialize a dictionary to hold the new columns
    new_columns = {}
    
    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        # Dictionary to hold entities for this row
        row_entities = {}
        
        # Iterate over each column in the row
        for column in df.columns:
            # Skip columns that should not be processed for NER
            if column in ['Title', 'Museum', 'Image_URL', 'cropped_objects']:
                continue
            
            value = row[column]
            
            # Initialize list to hold text snippets to process
            texts_to_process = []
            
            # Determine how to handle the value based on its type
            if isinstance(value, list):
                # If it's a list, check if all elements are null
                if all(pd.isnull(v) for v in value):
                    continue  # Skip if all elements are null
                # Filter out non-string elements
                texts_to_process = [str(v) for v in value if isinstance(v, str)]
            elif isinstance(value, str):
                if pd.isnull(value):
                    continue  # Skip if the string is null
                texts_to_process = [value]
            else:
                # For any other type (e.g., NaN, None), skip processing
                continue
            
            # If there are no texts to process, skip to the next column
            if not texts_to_process:
                continue
            
            # Process each text snippet
            for text in texts_to_process:
                # Run the NER model on the text
                try:
                    results = token_classifier(text)
                except Exception as e:
                    logging.warning(f"NER model failed on text '{text}': {e}")
                    continue  # Skip this text snippet if NER fails
                
                # Filter entities based on the score threshold
                filtered_results = [entity for entity in results if entity['score'] >= entity_score_threshold]
                
                # Iterate over filtered entities and organize them by entity group
                for entity in filtered_results:
                    entity_group = entity['entity_group']
                    word = entity['word'].replace('\n', ' ').strip()  # Clean up the word
                    
                    # Create a new column name based on the original column and entity group
                    col_name = f"{column}.{entity_group}"
                    
                    # Initialize the list for this entity group if not already present
                    if col_name not in row_entities:
                        row_entities[col_name] = []
                    
                    # Append the extracted word to the list
                    row_entities[col_name].append(word)
        
        # Store the entities for this row
        for col_name, words in row_entities.items():
            if col_name not in new_columns:
                new_columns[col_name] = [None] * len(df)
            new_columns[col_name][idx] = words
    
    # After processing all rows, add the new columns to the DataFrame
    for col_name, column_data in new_columns.items():
        df[col_name] = column_data
    
    return df

# Load the processor and model for object detection
processor = AutoImageProcessor.from_pretrained("valentinafeve/yolos-fashionpedia")
model = AutoModelForObjectDetection.from_pretrained("valentinafeve/yolos-fashionpedia")

def analyze_and_crop_image(image_path, image_id, output_dir, category_data):
    """
    Analyzes an image using the Fashionpedia model, saves cropped images, and returns a dictionary of objects data.
    """
    from PIL import Image
    import torch
    import numpy as np
    
    # Load category mappings
    categories = category_data['categories']
    category_id_to_name = {category['id']: category['name'] for category in categories}
    category_id_to_supercategory = {category['id']: category['supercategory'] for category in categories}
    label_mappings = model.config.id2label  # This maps label IDs to category names
    name_to_supercategory = {category['name']: category['supercategory'] for category in categories}
    
    # Load the image
    image = Image.open(image_path)
    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')
    # Convert the image to a NumPy array
    image_array = np.array(image)
    
    # Preprocess the image
    inputs = processor(images=image_array, return_tensors="pt")
    # Run inference
    outputs = model(**inputs)
    # Post-process the outputs to get object detection results
    results = processor.post_process_object_detection(
        outputs, threshold=0.5, target_sizes=[image.size[::-1]]
    )
    result = results[0]
    
    objects_data = {}
    # Get image dimensions
    width, height = image.size
    
    # Loop over each detection
    for idx, (score, label_id, box) in enumerate(zip(result['scores'], result['labels'], result['boxes'])):
        score = score.item()
        label_id = label_id.item()
        box = box.tolist()
        x_min, y_min, x_max, y_max = map(int, box)
        area = (x_max - x_min) * (y_max - y_min)
        name = label_mappings.get(label_id, 'Unknown')
        supercategory = name_to_supercategory.get(name, 'Unknown')
        category_id = next((id for id, n in category_id_to_name.items() if n == name), label_id)
        
        # Save the cropped image
        # Clip coordinates to image bounds
        x_min = max(0, min(width, x_min))
        y_min = max(0, min(height, y_min))
        x_max = max(0, min(width, x_max))
        y_max = max(0, min(height, y_max))
        
        # Check for valid crop
        if x_max > x_min and y_max > y_min:
            # Crop the image
            cropped_image = image.crop((x_min, y_min, x_max, y_max))
            # Ensure cropped image is in RGB mode
            if cropped_image.mode != 'RGB':
                cropped_image = cropped_image.convert('RGB')
            # Create a unique filename
            cropped_image_name = f"{os.path.splitext(os.path.basename(image_path))[0]}_{name.replace(' ', '_')}_{idx}.jpg"
            cropped_image_path = os.path.join(output_dir, cropped_image_name)
            try:
                # Save the cropped image
                cropped_image.save(cropped_image_path, format='JPEG')
                print(f"Saved cropped image: {cropped_image_path}")
            except Exception as e:
                print(f"Failed to save {cropped_image_path}: {e}")
                continue
        else:
            print(f"Skipping invalid crop for detection {idx}")
            continue
        
        # Collect the data
        object_key = f"Object_{idx+1}"
        objects_data[object_key] = {
            'Name': name,
            'Supercategory': supercategory,
            'Bounding Box': box,
            'Area': area,
            'cropped_image_path': cropped_image_path
        }
    return objects_data

def process_images(df, output_dir='cropped_images', category_json_path='../category_attributes_descriptions.json'):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Load category mappings
    with open(category_json_path, 'r') as f:
        category_data = json.load(f)
    
    for idx, row in df.iterrows():
        image_url = row['Image_URL']
        if pd.isnull(image_url):
            continue
        print(f"Processing image for row {idx}: {image_url}")
        # Download the image
        image_name = f"image_{idx}.jpg"
        image_path = os.path.join(output_dir, image_name)
        try:
            response = requests.get(image_url, stream=True)
            response.raise_for_status()
            with open(image_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        out_file.write(chunk)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download image {image_url}: {e}")
            continue
        
        # Process the image
        try:
            objects_data = analyze_and_crop_image(image_path, idx, output_dir, category_data)
            df.at[idx, 'cropped_objects'] = objects_data
        except Exception as e:
            print(f"Failed to process image {image_path}: {e}")
            continue
    return df
if __name__ == "__main__":
    # Scrape the data
    scraped_df = scrape_smithsonian_collection(total_pages=1)
    print(f"Scraping Completed. Total items scraped: {len(scraped_df)}")
    # Save the scraped data to a CSV file
    scraped_df.to_csv('scraped_data.csv', index=False)
    
    # Extract entities
    print("Extracting entities...")
    scraped_df = extract_entities(scraped_df, entity_score_threshold=0.8)
    # Save the data with entities to a CSV file
    scraped_df.to_csv('scraped_data_with_entities.csv', index=False)
    
    # Process images
    print("Processing images...")
    scraped_df = process_images(scraped_df, output_dir='cropped_images', category_json_path='/Users/joshstrupp/Documents/Working/Educational/MSDV/ms1-final/category_attributes_descriptions.json')
    # Save the final DataFrame to a CSV file
    scraped_df.to_csv('final_data.csv', index=False)
    # Optionally, save the DataFrame to a pickle file
    scraped_df.to_pickle('final_data.pkl')

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Scraping Page 1: https://www.si.edu/search/collection-images?edan_q=&edan_fq%5B0%5D=topic%3A%22Costume%22
Found 54 items on Page 1
  Scraping Item 1: /object/archives/components/sova-acma-06-0

Scraping Completed. Total items scraped: 53
Extracting entities...
Processing images...
Processing image for row 0: https://ids.si.edu/ids/deliveryService?max_w=800&id=damsmdm%3AACM-acma_PH2003_7063_007_01edit.jpg
Saved cropped image: cropped_images/image_0_neckline_0.jpg
Saved cropped image: cropped_images/image_0_dress_1.jpg
Saved cropped image: cropped_images/image_0_sleeve_2.jpg
Saved cropped image: cropped_images/image_0_neckline_3.jpg
Failed to process image cropped_images/image_0.jpg: Incompatible indexer with Series
Processing image for row 1: https://ids.si.edu/ids/deliveryService?max_w=800&id=damsmdm%3ANMAfA-AO-20-85.jpg
Saved cropped image: cropped_images/image_1_sleeve_0.jpg
Saved cropped image: cropped_images/image_1_sleeve_1.jpg
Saved cropped image: cropped_images/image_1_dress_2.jpg
Processing image for row 2: https://ids.si.edu/ids/deliveryService?max_w=800&id=damsmdm%3ANMAfA-AO-15-05.jpg
Saved cropped image: cropped_images/image_2_sleeve_0.jpg
Saved cropped image: crop