In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from bs4 import BeautifulSoup
import time
import json
import logging
from urllib.parse import urljoin

In [10]:
# Set up logging
# basicConfig() does nothing when the root logger already has handlers (for
# example if the hosting environment installed one before this cell ran), which
# would leave the root level at its default and filter out the INFO progress
# messages below. Setting the level explicitly covers that case as well.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
class BlueprintScraper:
    """Scrape the Blueprint organization directory with headless Chrome.

    The directory page shows organization (RSO) cards in batches behind a
    "Load More" button. The scraper clicks that button until the expected
    number of cards is present, parses the rendered HTML with BeautifulSoup
    and writes one record per card to a JSON file.
    """

    # Origin used to turn the cards' relative hrefs into absolute URLs.
    SITE_ROOT = "https://blueprint.uchicago.edu"
    # Every organization card is an anchor whose href starts with this path.
    RSO_LINK_SELECTOR = 'a[href^="/organization/"]'
    # The pagination button carries its label inside a <span>.
    LOAD_MORE_XPATH = "//button[.//span[contains(text(), 'Load More')]]"

    def __init__(self):
        self.base_url = "https://blueprint.uchicago.edu/organizations"
        # Category id -> display name. Not referenced by any method of this class.
        self.category_map = {
            '4150': 'Academic Interest',
            '4174': 'Campus and Student Life',
            '4151': 'Community Service',
            '4152': 'Cultural & Ethnic',
            '4153': 'Fine Arts',
            '8195': 'Graduate/Professional',
            '4155': 'Media & Publication',
            '4156': 'Political & Advocacy',
            '4157': 'Religious & Spiritual',
            '4158': 'Social',
            '4159': 'Sports Clubs',
            '4288': 'Student Government',
            '4289': 'University Department/Program'
        }
        # Created by setup_driver(); reset to None once the session is quit.
        self.driver = None
        # Records from the most recent successful scrape_all_rsos() call.
        self.rso_data = []

    def setup_driver(self):
        """Initialize Selenium WebDriver with appropriate options"""
        options = webdriver.ChromeOptions()
        for flag in (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--window-size=1920,1080',
        ):
            options.add_argument(flag)
        self.driver = webdriver.Chrome(options=options)

    def _count_loaded_rsos(self):
        """Return how many organization links are currently in the live DOM."""
        return len(self.driver.find_elements(By.CSS_SELECTOR, self.RSO_LINK_SELECTOR))

    def _scroll_to(self, element):
        """Scroll the page so that `element` ends up 100px below the viewport top."""
        # The 100px offset is presumably there to clear a fixed page header.
        self.driver.execute_script("""
            var element = arguments[0];
            var headerOffset = 100;
            var elementPosition = element.getBoundingClientRect().top;
            var offsetPosition = elementPosition + window.pageYOffset - headerOffset;
            window.scrollTo({
                top: offsetPosition,
                behavior: 'smooth'
            });
        """, element)
        time.sleep(0.5)  # Short pause for scroll

    def load_all_rsos(self, expected_count=408, max_attempts=50, load_timeout=10):
        """Click the 'Load More' button until all RSOs are loaded

        Args:
            expected_count (int): Expected number of RSOs to load
            max_attempts (int): Upper bound on successful button clicks
            load_timeout (float): Seconds to wait for new cards after a click

        Returns:
            bool: True if expected number of RSOs were loaded, False otherwise
        """
        logger.info(f"Starting to load all RSOs (expecting {expected_count})...")
        attempts = 0

        while attempts < max_attempts:
            # Check current number of RSOs
            current_rsos = self._count_loaded_rsos()
            logger.info(f"Currently loaded RSOs: {current_rsos}")

            if current_rsos >= expected_count:
                logger.info(f"Successfully loaded all {current_rsos} RSOs")
                return True

            try:
                show_more = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, self.LOAD_MORE_XPATH))
                )
                self._scroll_to(show_more)

                # Click through JavaScript rather than a native click.
                self.driver.execute_script("arguments[0].click();", show_more)
                logger.info(f"Clicked 'Load More' button, attempt {attempts + 1}")

                # Progress is measured by the card count itself: wait until
                # more cards exist than before the click. This replaces a
                # fixed sleep followed by a page-source length comparison,
                # which only indirectly reflects whether cards were added.
                try:
                    WebDriverWait(self.driver, load_timeout).until(
                        lambda drv: len(
                            drv.find_elements(By.CSS_SELECTOR, self.RSO_LINK_SELECTOR)
                        ) > current_rsos
                    )
                except TimeoutException:
                    logger.info("No new content loaded, finishing...")
                    break

                attempts += 1

            except TimeoutException:
                logger.info("No more 'Show More' button found, all content loaded")
                break
            except Exception as e:
                logger.error(f"Error clicking 'Show More': {str(e)}")
                break

        # Final check after all attempts
        final_count = self._count_loaded_rsos()
        logger.info(f"Finished loading RSOs after {attempts} attempts. Final count: {final_count}")
        return final_count >= expected_count

    def extract_rso_info_from_card(self, card_element):
        """Extract RSO information from a card element

        Args:
            card_element: Parsed anchor element for one card; it must offer
                ``get(attr)`` and ``find(tag, class_=...)`` as the elements
                returned by BeautifulSoup's ``find_all`` do.

        Returns:
            dict | None: Keys ``name``, ``url_name``, ``full_url``,
            ``description_preview`` and ``image_url``; None if extraction
            raised (the error is logged).
        """
        try:
            # The last path segment of the href is the RSO's URL name.
            link = card_element.get('href')
            rso_name = link.split('/')[-1] if link else None

            # The card image supplies both the display name (alt) and image URL.
            img = card_element.find('img')
            display_name = img.get('alt') if img else None
            img_src = img.get('src') if img else None

            # Description excerpt; empty string when the card has none.
            description = card_element.find('p', class_='DescriptionExcerpt')
            description_text = description.text.strip() if description else ""

            return {
                'name': display_name,
                'url_name': rso_name,
                'full_url': urljoin(self.SITE_ROOT, link) if link else None,
                'description_preview': description_text,
                'image_url': img_src
            }
        except Exception as e:
            logger.error(f"Error extracting RSO card info: {str(e)}")
            return None

    def extract_rso_cards(self):
        """Extract all RSO cards from the loaded page

        Returns:
            list[dict]: One record per card that could be extracted.
        """
        logger.info("Starting to extract RSO cards...")
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        # Find all RSO card links
        rso_cards = soup.find_all('a', href=lambda x: x and x.startswith('/organization/'))

        # Cards whose extraction failed come back as None and are dropped.
        extracted = (self.extract_rso_info_from_card(card) for card in rso_cards)
        rso_info = [info for info in extracted if info]

        logger.info(f"Found {len(rso_info)} RSO cards")
        return rso_info

    def scrape_all_rsos(self, expected_count=408, output_path='rso_data.json'):
        """Main function to scrape all RSOs

        Loads the directory page, expands it, extracts the cards and writes
        them to `output_path` as JSON. If fewer than `expected_count` cards
        are found the page is reloaded and expanded once more, and the larger
        of the two results is kept. The browser session is always quit before
        returning.

        Args:
            expected_count (int): Number of RSOs the directory should contain
            output_path (str): Destination of the JSON dump

        Returns:
            list[dict]: The scraped records, or an empty list when no cards
            were found or an error occurred (errors are logged, not raised).
        """
        try:
            logger.info("Starting RSO scraping process...")
            self.setup_driver()

            # First pass: Get all RSOs and their basic info
            logger.info("Loading main page...")
            self.driver.get(self.base_url)
            self.load_all_rsos(expected_count)
            rso_info = self.extract_rso_cards()

            if not rso_info:
                logger.error("No RSO cards found!")
                return []

            if len(rso_info) < expected_count:
                logger.error(f"Only found {len(rso_info)} RSOs, expected {expected_count}. Retrying...")
                # Try one more time with a longer initial wait
                self.driver.get(self.base_url)
                time.sleep(3)
                if not self.load_all_rsos(expected_count):
                    logger.error("Failed to load all RSOs even after retry")
                retry_info = self.extract_rso_cards()
                # A retry that did worse must not discard the first pass.
                if len(retry_info) > len(rso_info):
                    rso_info = retry_info

            if len(rso_info) < expected_count:
                logger.error(f"Warning: Only found {len(rso_info)} RSOs out of {expected_count} expected")

            # Save data to JSON file
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(rso_info, f, indent=2, ensure_ascii=False)

            self.rso_data = rso_info
            logger.info(f"Successfully scraped {len(rso_info)} RSOs")
            return rso_info

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Error in scraping process: {str(e)}")
            return []
        finally:
            if self.driver:
                self.driver.quit()
                # Do not keep a reference to a session that no longer exists.
                self.driver = None
                

In [None]:
# Run the scrape. The result is bound to a name rather than left as the cell's
# last expression, which would echo every scraped record into the notebook
# output; only the number of records is displayed.
scraper = BlueprintScraper()
rso_records = scraper.scrape_all_rsos()
len(rso_records)

In [None]:
import json

def remove_field_from_json(file_path, field_to_remove):
    """Delete a top-level field from a JSON file, rewriting the file in place.

    If the file holds a JSON object, the key is removed from it. If it holds
    an array, the key is removed from every element that is an object;
    other elements are left untouched. Any other top-level value is written
    back unchanged. Nested objects are not searched.

    The file is read and written as UTF-8 and non-ASCII characters are kept
    verbatim, matching how the scraper in this notebook writes its JSON.

    Args:
        file_path: Path of the JSON file to modify.
        field_to_remove: Key to delete.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if isinstance(data, dict):
        data.pop(field_to_remove, None)
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                item.pop(field_to_remove, None)

    # Write the updated data back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

# Example usage: strip the image URLs from the scraped RSO file and report
# whether the in-place rewrite went through.
file_path, field_to_remove = 'rso_data.json', 'image_url'

try:
    remove_field_from_json(file_path, field_to_remove)
    # Reached only when the call above did not raise.
    print(f"Successfully removed '{field_to_remove}' from {file_path}")
except Exception as err:
    # Best-effort cell: the failure is reported, not re-raised.
    print(f"An error occurred: {str(err)}")

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import json
import logging
from collections import defaultdict

In [15]:


class CategoryCollector:
    """Annotate previously scraped RSO records with their Blueprint categories.

    For every entry in `category_map` the directory page is opened with a
    ``?categories=<id>`` filter, fully expanded via the "Load More" button,
    and each organization link found there is tagged with the category name.
    The tags are then merged into the records read from the input JSON file.

    The methods log through a module-level `logger`, which must already be
    defined when they run.
    """

    # Every organization card is an anchor whose href starts with this path.
    RSO_LINK_SELECTOR = 'a[href^="/organization/"]'
    # The pagination button carries its label inside a <span>.
    LOAD_MORE_XPATH = "//button[.//span[contains(text(), 'Load More')]]"

    def __init__(self):
        self.base_url = "https://blueprint.uchicago.edu/organizations"
        # Category id (value of the ``categories`` query parameter) -> display name.
        self.category_map = {
            '4150': 'Academic Interest',
            '4174': 'Campus and Student Life',
            '4151': 'Community Service',
            '4152': 'Cultural & Ethnic',
            '4153': 'Fine Arts',
            '8195': 'Graduate/Professional',
            '4155': 'Media & Publication',
            '4156': 'Political & Advocacy',
            '4157': 'Religious & Spiritual',
            '4158': 'Social',
            '4159': 'Sports Clubs',
            '4288': 'Student Government',
            '4289': 'University Department/Program'
        }
        # Created by setup_driver(); reset to None once the session is quit.
        self.driver = None
        # url_name -> set of category display names seen for that RSO.
        self.rso_categories = defaultdict(set)

    def setup_driver(self):
        """Start a headless Chrome session and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        for flag in (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--window-size=1920,1080',
        ):
            options.add_argument(flag)
        self.driver = webdriver.Chrome(options=options)

    def load_all_rsos(self, category_name="", max_attempts=50, load_timeout=10):
        """Click 'Load More' on the current page until no further cards appear.

        Args:
            category_name (str): Only used in log messages.
            max_attempts (int): Upper bound on successful button clicks.
            load_timeout (float): Seconds to wait for new cards after a click.
        """
        logger.info(f"Loading RSOs for {category_name}...")
        attempts = 0

        while attempts < max_attempts:
            try:
                show_more = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, self.LOAD_MORE_XPATH))
                )
                loaded_before = len(
                    self.driver.find_elements(By.CSS_SELECTOR, self.RSO_LINK_SELECTOR)
                )

                # Bring the button to 100px below the viewport top; the offset
                # is presumably there to clear a fixed page header.
                self.driver.execute_script("""
                    var element = arguments[0];
                    var headerOffset = 100;
                    var elementPosition = element.getBoundingClientRect().top;
                    var offsetPosition = elementPosition + window.pageYOffset - headerOffset;
                    window.scrollTo({
                        top: offsetPosition,
                        behavior: 'smooth'
                    });
                """, show_more)

                time.sleep(0.5)
                self.driver.execute_script("arguments[0].click();", show_more)

                # Wait for the click to actually add cards instead of sleeping
                # a fixed time; if nothing arrives, further clicks are pointless.
                try:
                    WebDriverWait(self.driver, load_timeout).until(
                        lambda drv: len(
                            drv.find_elements(By.CSS_SELECTOR, self.RSO_LINK_SELECTOR)
                        ) > loaded_before
                    )
                except TimeoutException:
                    logger.info(f"No new RSOs appeared for {category_name}, stopping")
                    break

                attempts += 1

            except TimeoutException:
                # No 'Load More' button within 5s: the listing is fully expanded.
                break
            except Exception as e:
                logger.error(f"Error in load_all_rsos for {category_name}: {str(e)}")
                break

    def apply_categories(self, rso_data):
        """Write the collected categories into `rso_data` in place.

        Every record with a truthy ``url_name`` receives a ``categories`` list
        (empty if the RSO was not seen under any category). The list is sorted
        so that the saved JSON is identical from run to run; iterating the
        underlying set directly gives no such guarantee. Records without a
        ``url_name`` are left untouched.

        Args:
            rso_data (list[dict]): Records as loaded from the scraper's JSON.

        Returns:
            list[dict]: The same list object, for convenience.
        """
        for rso in rso_data:
            url_name = rso.get('url_name')
            if url_name:
                # .get() (not indexing) so the defaultdict does not grow.
                rso['categories'] = sorted(self.rso_categories.get(url_name, ()))
        return rso_data

    def _collect_category(self, cat_id, cat_name):
        """Load one filtered listing and tag every RSO on it with `cat_name`."""
        logger.info(f"Checking category: {cat_name}")
        self.driver.get(f"{self.base_url}?categories={cat_id}")
        self.load_all_rsos(cat_name)

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        category_links = soup.find_all('a', href=lambda x: x and x.startswith('/organization/'))
        # The last path segment of the href is the RSO's url_name.
        category_rsos = {link['href'].split('/')[-1] for link in category_links}

        for rso in category_rsos:
            self.rso_categories[rso].add(cat_name)

        logger.info(f"Found {len(category_rsos)} RSOs in {cat_name}")

    def _log_statistics(self, all_rsos):
        """Log per-category totals and how many known RSOs have no category."""
        logger.info("\nCategory Statistics:")
        for cat_name in self.category_map.values():
            count = sum(1 for cats in self.rso_categories.values() if cat_name in cats)
            logger.info(f"  {cat_name}: {count} RSOs")

        no_category_count = sum(1 for rso in all_rsos if not self.rso_categories.get(rso))
        logger.info(f"RSOs with no category: {no_category_count}")

    def collect_categories(self, input_path='rso_data.json',
                           output_path='rso_data_with_categories.json'):
        """Collect categories for all RSOs and save the enriched records.

        Errors are logged rather than raised: a failure within one category is
        logged and the remaining categories are still processed; any other
        failure aborts the run. The browser session is always quit at the end.

        Args:
            input_path (str): JSON file with the scraped RSO records.
            output_path (str): Where the records with ``categories`` are written.
        """
        try:
            # Load existing RSO data before starting Chrome, so that a missing
            # or invalid input file fails without launching a browser.
            with open(input_path, 'r', encoding='utf-8') as f:
                rso_data = json.load(f)

            # Create lookup of url_names
            all_rsos = {rso['url_name'] for rso in rso_data if rso.get('url_name')}
            logger.info(f"Loaded {len(all_rsos)} RSOs from existing data")

            self.setup_driver()

            # Check each category
            for cat_id, cat_name in self.category_map.items():
                try:
                    self._collect_category(cat_id, cat_name)
                except Exception as e:
                    logger.error(f"Error processing category {cat_name}: {str(e)}")

            # Update the existing data with categories
            self.apply_categories(rso_data)

            # Save updated data
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(rso_data, f, indent=2, ensure_ascii=False)

            self._log_statistics(all_rsos)

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Error in category collection: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()
                # Do not keep a reference to a session that no longer exists.
                self.driver = None



In [None]:

# Run the category pass: reads 'rso_data.json', visits each category listing
# and writes 'rso_data_with_categories.json'. collect_categories() returns
# None, so this cell produces log output only.
collector = CategoryCollector()
collector.collect_categories()