# GEOG 5900 - MVP

### Author: Jacob Harris
### Date: 2024/12/17

#### Updates from last version
- Functionality: Saves metadata for each image as a csv
- UI: Uses Chrome Driver to view scraping
- Quality of Life: Adds limit to the number of images that the script will download (per prompt)
- Includes a demo for reformatting metadata dfs by saving a new CSV file with persistent URLs for each images
#### Description
- The following script is meant to scrapes images from the Umedia website. This way, you have an automated method for downloading historical images of certain buildings
- If you want to replicate this script, you'll need to install the required packages and change the 'please have your directory organized like the following. Place this script into the 'scripts' directory and place the 'prompts_test.csv' into the 'data' directory.

![image info](../data/dir.png)

In [None]:
import os
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import time

In [2]:
# set dirs
print('CURRENT WORKING DIR =', os.getcwd())
data_dir = '../data'
save_dir = os.path.join(data_dir, 'images')

CURRENT WORKING DIR = /Users/jakeharris/Files/geog_5900/scripts


## scrape images and metadata

In [3]:
# Function to generate a unique file name by appending a counter if necessary
def generate_unique_filename(file_path):
    base, extension = os.path.splitext(file_path)
    counter = 1
    while os.path.exists(file_path):
        file_path = f"{base}_{counter}{extension}"
        counter += 1
    return file_path

# Function to download and save the image in the specified directory
def download_image(image_url, image_title, directory):
    try:
        # Ensure the directory exists
        if not os.path.exists(directory):
            os.makedirs(directory)
        
        # Construct the full file path
        image_path = os.path.join(directory, f"{image_title}.png")
        
        # Ensure the file name is unique by adding a counter if needed
        unique_image_path = generate_unique_filename(image_path)
        
        # Download the image
        response = requests.get(image_url, stream=True)
        if response.status_code == 200:
            with open(unique_image_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"Image saved as {unique_image_path}")
            return unique_image_path
        else:
            print(f"Failed to download image: {image_url}")
    except Exception as e:
        print(f"Error downloading image: {e}")
    return None

# Function to extract metadata from a page
def extract_metadata(page_soup):
    metadata = []

    # Find all <h3> tags (which represent the categories)
    for h3 in page_soup.find_all('h3'):
        category = h3.get_text(strip=True)
        
        # Find the next <dl> sibling after <h3>
        dl = h3.find_next_sibling('dl')
        if dl:
            # Collect all <dt> and <dd> pairs as "dt_text = dd_text"
            details_list = []
            for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
                dt_text = dt.get_text(strip=True)
                dd_text = dd.get_text(strip=True)
                details_list.append(f"{dt_text} = {dd_text}")
            
            # Combine all details for the "details" column
            details = ' '.join(details_list)
            metadata.append({"category": category, "details": details})
    return metadata

# Combined function to scrape images and metadata
def scrape_images_and_metadata(prompt, chrome_driver_path):
    print('-----------------------------------\n* GEOG 5900 - FALL 2024\n* Author: JACOB HARRIS\n* Project: 3D Modeling of West Bank\n-----------------------------------')
    # Set up the Chrome driver using the specified driver path
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    
    # set dir and url
    
    # Base url for Umedia
    base_url = 'https://umedia.lib.umn.edu/search?facets%5Bcollection_name_s%5D%5B%5D=University+of+Minnesota+Archives+Photograph+Collection&q='
    # Change underscores (or spaces) in prompt to "+" since that is what the req format for the url
    prompt_formatted = prompt.replace('_', '+') 
    # Append prompt that is formatted for URL to the Umedia base url
    main_url = base_url + prompt_formatted # This is the final url to scrape from

    directory = os.path.join(save_dir, prompt) # The directory where images will be saved

    data = []
    image_counter = 0  # Counter to track the number of images downloaded
    download_lim = 5 # Limit to the number of images that will download per prompt

    try:
        # Load the main page
        print(f"Loading main page: {main_url}")
        driver.get(main_url)
        time.sleep(2)  # Allow time for the page to load

        # Parse the HTML content of the main page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all <a> tags with class "search-result-item-title"
        result_links = soup.find_all('a', class_='search-result-item-title')
        
        for result in result_links:
            if image_counter >= download_lim:
                print(f"Reached the limit of {download_lim} images... terminating function.")
                break

            title = result.text.strip()  # Use the text as the title
            page_url = urljoin(main_url, result['href'])  # Construct full URL
            
            # Visit each link
            print(f"Navigating to: {page_url}")
            driver.get(page_url)
            time.sleep(2)  # Wait for the page to fully load
            
            # Parse the new page content
            page_soup = BeautifulSoup(driver.page_source, 'html.parser')
            
            # Extract metadata from the page
            metadata = extract_metadata(page_soup)
            for entry in metadata:
                entry["title"] = title  # Add the title to each metadata entry
                data.append(entry)
            
            # Find the <a> tag for the "Full-size image" download
            download_link = page_soup.find('a', class_='large-download', string="Full-size image")
            if download_link and 'href' in download_link.attrs:
                image_url = urljoin(page_url, download_link['href'])  # Handle relative URLs with urljoin
                print(f"Found image URL: {image_url}")
                
                # Download the image and save it in the specified directory
                download_image(image_url, title, directory)
                image_counter += 1  # Increment the counter
                print(f"Progress: Downloaded {image_counter}/{download_lim} images.")
            else:
                print(f"No 'Full-size image' link found on page: {page_url}")
    finally:
        driver.quit()  # Ensure the browser closes after execution
    
    # Convert the collected metadata into a Pandas DataFrame
    df = pd.DataFrame(data)
    return df

## Call functions with a CSV file

In [4]:
# Function to accept a csv file of prompts
def scrape_from_df(csv_file):

    # Path to chrom driver (CHANGE TO YOUR OWN PATH)
    chrome_driver_path = '/Users/jakeharris/Dev_tools/chromedriver-mac-arm64/chromedriver'
    # Dict to save dfs
    meta_dict = {}

    # Load csv file (stored in the data directory)
    prompts_file = os.path.join(data_dir, csv_file) 
    # Read in prompts csv as a pandas df 
    prompts_df = pd.read_csv(prompts_file)

    # Iterate through each row of the df 'Prompt' column
    for index, prompt in prompts_df['Prompt'].items():
        # Call the function with each row in the 'Prompts' column
        metadata_df = scrape_images_and_metadata(prompt, chrome_driver_path)
        # Save metadata to dict
        meta_dict[prompt] = metadata_df

    # Save metadata locally 

    meta_dir = os.path.join(data_dir, 'metadata')
    # Ensure the directory exists
    if not os.path.exists(meta_dir):
        os.makedirs(meta_dir)
    
    for name, df in meta_dict.items():
        meta_filename = f'{name}.csv'
        save_path = os.path.join(meta_dir, meta_filename)
        df.to_csv(save_path, index=False)  # Save each df as a CSV

In [5]:
# Call scrape from df function 
test_file_name = 'prompts_test.csv'
data_dict = scrape_from_df(test_file_name)

-----------------------------------
* GEOG 5900 - FALL 2024
* Author: JACOB HARRIS
* Project: 3D Modeling of West Bank
-----------------------------------
Loading main page: https://umedia.lib.umn.edu/search?facets%5Bcollection_name_s%5D%5B%5D=University+of+Minnesota+Archives+Photograph+Collection&q=Heller Hall
Navigating to: https://umedia.lib.umn.edu/item/p16022coll175:10102?facets%5Bcollection_name_s%5D%5B%5D=University+of+Minnesota+Archives+Photograph+Collection&q=Heller+Hall
Found image URL: https://cdm16022.contentdm.oclc.org/utils/ajaxhelper?CISOROOT=p16022coll175&CISOPTR=10102&action=2&DMSCALE=100&DMWIDTH=3708&DMHEIGHT=4699
Image saved as ../data/images/Heller Hall/Heller, Walter_1.png
Progress: Downloaded 1/5 images.
Navigating to: https://umedia.lib.umn.edu/item/p16022coll175:2242?facets%5Bcollection_name_s%5D%5B%5D=University+of+Minnesota+Archives+Photograph+Collection&q=Heller+Hall
Found image URL: https://cdm16022.contentdm.oclc.org/utils/ajaxhelper?CISOROOT=p16022coll175&

### Pandas DataFrame Customization Example

In [None]:
# Observe structure of a metadata file
meta_dir = os.path.join(data_dir, 'metadata')
test = os.path.join(meta_dir, 'Heller Hall.csv')
test_df = pd.read_csv(test)
test_df.head()

Unnamed: 0,category,details,title
0,Physical Description,Item Type: = Still Image Format: = Black-and-w...,"Heller, Walter"
1,Topics,Subjects: = Economics;Buildings. Heller Hall;H...,"Heller, Walter"
2,Geographic Location,City: = Minneapolis;St Paul State: = Minnesota...,"Heller, Walter"
3,Collection Information,Contributing Organization: = University of Min...,"Heller, Walter"
4,Identifiers,Local Identifier: = Heller-W-08; ua100172 DLS...,"Heller, Walter"


In [15]:
# Function that takes in the metadata dir and saves a new CSV file that highlights the persistent URL for each downloaded image
def process_csv_files(directory, output_filename=None):

    # Initialize an empty DataFrame to store the results
    url_df = pd.DataFrame()

    # Function to extract 'persistent_url' for a single row
    def extract_persistent_url(details, category):
        if category == 'Identifiers':
            match = re.search(r'Persistent URL: =\s*(\S+)', str(details))
            return match.group(1) if match else None
        return None

    # Iterate through all CSV files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            
            # Read the current CSV file
            df = pd.read_csv(file_path)
            
            # Add a new 'file' column with the current file's name
            df['file'] = filename
            
            # Apply the extraction function to create 'persistent_url'
            df['persistent_url'] = df.apply(lambda row: extract_persistent_url(row['details'], row['category']), axis=1)
            
            # Drop rows where 'persistent_url' is None
            df = df.dropna(subset=['persistent_url'])
            
            # Append the relevant columns to the combined DataFrame
            url_df = pd.concat([url_df, df[['file', 'title', 'persistent_url']]], ignore_index=True)
    
    # Optionally save the combined DataFrame to a CSV file
    if output_filename:
        out_dir = os.path.join(directory, 'url_df')
        # Ensure the directory exists
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_path = os.path.join(out_dir, output_filename)
        url_df.to_csv(out_path, index=False)
        print(f'URL df saved to: {out_path}')
    
    return url_df

In [16]:
# Call process CSV function
output_csv = 'combined_output.csv'
result_df = process_csv_files(meta_dir, output_filename=output_csv)

result_df

URL df saved to: ../data/metadata/url_df/combined_output.csv


Unnamed: 0,file,title,persistent_url
0,Social Sciences Building.csv,Social Science Building. Duluth Campus. Constr...,http://purl.umn.edu/228672
1,Social Sciences Building.csv,"Students gathering on campus with a sign, ""U o...",http://purl.umn.edu/80776
2,Social Sciences Building.csv,Campus Views. Mpls. West. Construction of Soci...,http://purl.umn.edu/81313
3,Social Sciences Building.csv,"Campus Views. Mpls. West. Anderson Hall, Socia...",http://purl.umn.edu/81279
4,Social Sciences Building.csv,Campus Views. Minneapolis Campus West.,http://purl.umn.edu/81312
5,Humphrey Center.csv,Hubert H. Humphrey Center. Minneapolis Campus,http://purl.umn.edu/173261
6,Humphrey Center.csv,Hubert H. Humphrey Center. Minneapolis Campus,http://purl.umn.edu/173260
7,Humphrey Center.csv,Hubert H. Humphrey Center. Minneapolis Campus,http://purl.umn.edu/173263
8,Humphrey Center.csv,Hubert H. Humphrey Center. Minneapolis Campus,http://purl.umn.edu/173265
9,Humphrey Center.csv,Hubert H. Humphrey Center. Minneapolis Campus,http://purl.umn.edu/173262


## Script Complete

- Navigate to the '../data/images/' directory to see the images that you downloaded
- Navigate to the '../data/metadata/' directory to see the metadata for each image that you downloaded
- Navigate to the '../data/metadata/url_df' directory to see the CSV containing persistent URLs