In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import logging
from urllib.parse import urljoin

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
class RSODetailScraper:
    def __init__(self, input_file='rso_data_with_categories.json'):
        self.input_file = input_file
        self.driver = None
        
    def setup_driver(self):
        """Initialize Selenium WebDriver"""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        self.driver = webdriver.Chrome(options=options)
        
    def clean_text(self, text):
        """Clean text by removing extra whitespace and asterisks"""
        if not text:
            return ''
        return text.strip('* ').strip()
    
    def scrape_detail_page(self, url):
        """Scrape a single RSO detail page"""
        try:
            self.driver.get(url)
            time.sleep(1)  # Allow page to load
            
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            details = {
                'full_description': '',
                'contact': {},
                'additional_info': {}
            }
            
            # Get full description from the correct div class and all paragraph tags within
            description_div = soup.find('div', class_='bodyText-large userSupplied')
            if description_div:
                paragraphs = description_div.find_all('p')
                details['full_description'] = ' '.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            
            # Get contact information - updated for new structure
            contact_email_div = soup.find('span', class_='sr-only', string='Contact Email')
            if contact_email_div and contact_email_div.parent:
                email_text = contact_email_div.parent.get_text()
                # Extract the email from the text (everything after "E: ")
                if 'E:' in email_text:
                    email = email_text.split('E:')[1].strip().strip('"')
                    details['contact']['email'] = email
                
            # Get address if available
            address_div = soup.find('div', string='Address')
            if address_div and address_div.find_next('div'):
                details['contact']['address'] = self.clean_text(address_div.find_next('div').get_text())
            
            # Rest of the method remains the same...
            [previous code for additional_info and social_media]
            
            return details
            
        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return None
    
    def scrape_all_rsos(self):
        """Scrape details for all RSOs"""
        try:
            # Load existing RSO data
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)
            
            logger.info(f"Loaded {len(rsos)} RSOs from {self.input_file}")
            self.setup_driver()
            
            # Process each RSO
            for i, rso in enumerate(rsos):
                url = rso.get('full_url')
                if not url:
                    continue
                
                logger.info(f"Processing RSO {i+1}/{len(rsos)}: {rso.get('name', 'Unknown')}")
                details = self.scrape_detail_page(url)
                
                if details:
                    rso.update(details)
                
                # Add a small delay between requests
                time.sleep(1)
            
            # Save updated data to new file
            output_file = 'rso_data_detailed.json'
            with open(output_file, 'w') as f:
                json.dump(rsos, f, indent=2)
            
            logger.info(f"Successfully saved detailed RSO data to {output_file}")
            
        except Exception as e:
            logger.error(f"Error in scraping process: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()
    


    # test method
    def test_detail_scraping(self, num_test_pages=3):
        """Test the detail scraping on a few RSO pages and print results"""
        try:
            # Load existing RSO data
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)
            
            logger.info(f"Running test on {num_test_pages} RSO pages...")
            self.setup_driver()
            
            for i, rso in enumerate(rsos[:num_test_pages]):
                url = rso.get('full_url')
                if not url:
                    continue
                
                logger.info(f"\nTesting RSO {i+1}: {rso.get('name', 'Unknown')}")
                logger.info(f"URL: {url}")
                
                details = self.scrape_detail_page(url)
                
                # Print detailed results
                if details:
                    logger.info("\nScraped Details:")
                    logger.info(f"Description: {details['full_description'][:200]}...")
                    logger.info(f"Contact Info: {details['contact']}")
                    logger.info(f"Social Media: {details.get('social_media', {})}")
                    logger.info(f"Additional Info: {details['additional_info']}")
                else:
                    logger.error("Failed to scrape details")
                
                time.sleep(1)
                
        except Exception as e:
            logger.error(f"Error in test process: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()



SyntaxError: invalid syntax. Perhaps you forgot a comma? (1624573284.py, line 56)

In [None]:
scraper = RSODetailScraper()
scraper.scrape_all_rsos()

In [7]:
scraper = RSODetailScraper()
# Run test first
scraper.test_detail_scraping()

INFO:__main__:Running test on 3 RSO pages...
INFO:__main__:
Testing RSO 1: A Cappella Council
INFO:__main__:URL: https://blueprint.uchicago.edu/organization/acacouncil
INFO:__main__:
Scraped Details:
INFO:__main__:Description: A council comprised of representatives from all groups to oversee a cappella activities on campus. Organizes interactions between groups, event scheduling, microphone usage, and arbitration of the aud...
INFO:__main__:Contact Info: {}
INFO:__main__:Social Media: {'website': 'http://uchicagoacappella.org', 'instagram': 'https://www.instagram.com/uchicagoacacouncil/', 'facebook': 'https://www.facebook.com/uchicagoacappella'}
INFO:__main__:Additional Info: {}
INFO:__main__:
Testing RSO 2: Active Minds at the University of Chicago
INFO:__main__:URL: https://blueprint.uchicago.edu/organization/active-minds
INFO:__main__:
Scraped Details:
INFO:__main__:Description: The national purpose of Active Minds is to empower university students to speak openly about mental healt