In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import logging
from urllib.parse import urljoin

# Module-level logger for the scraper; the root logger is configured to emit
# INFO and above so progress messages show up in the notebook output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
class RSODetailScraper:
    """Enrich stored RSO records with data scraped from each RSO's detail page.

    The scraper reads a JSON collection of RSO records, opens every record's
    ``full_url`` in a headless Chrome session, parses the rendered HTML with
    BeautifulSoup, merges the extracted fields into the record and writes the
    result to a separate JSON file.
    """

    def __init__(self, input_file='rso_data_with_categories.json',
                 output_file='rso_data_detailed.json', delay=1):
        """Store configuration; the browser is only started by ``setup_driver``.

        Args:
            input_file: Path of the JSON file containing the RSO records.
            output_file: Path the enriched records are written to.
            delay: Seconds to sleep after loading a page and again between
                two consecutive RSOs.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.delay = delay
        self.driver = None

    def setup_driver(self):
        """Start a headless Chrome WebDriver and keep it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        self.driver = webdriver.Chrome(options=options)

    def clean_text(self, text):
        """Strip surrounding whitespace and asterisks from ``text``.

        Args:
            text: String to clean; ``None`` and ``''`` are accepted.

        Returns:
            The text without any leading/trailing mix of whitespace and ``*``
            characters, or ``''`` when ``text`` is empty or ``None``.
            Asterisks and whitespace inside the text are left untouched.
        """
        if not text:
            return ''
        cleaned = text.strip()
        # Alternate between removing asterisks and whitespace until neither is
        # left at the edges, so inputs such as "\n**Key**\n" or "* * Key" are
        # fully cleaned regardless of how the two are interleaved.
        while cleaned.startswith('*') or cleaned.endswith('*'):
            cleaned = cleaned.strip('*').strip()
        return cleaned

    @staticmethod
    def _classify_links(hrefs):
        """Sort link targets into facebook / instagram / website entries.

        Args:
            hrefs: Iterable of ``href`` attribute strings.

        Returns:
            Dict with any of the keys ``'facebook'``, ``'instagram'`` and
            ``'website'``. When several links qualify for the same key the
            last one wins. Only absolute http(s) URLs outside
            ``blueprint.uchicago.edu`` count as a website, so anchors,
            ``mailto:``/``javascript:`` targets and relative in-site links are
            ignored.
        """
        social_media = {}
        for href in hrefs:
            if 'facebook.com' in href:
                social_media['facebook'] = href
            elif 'instagram.com' in href:
                social_media['instagram'] = href
            elif (href.lower().startswith(('http://', 'https://'))
                  and 'blueprint.uchicago.edu' not in href):
                social_media['website'] = href
        return social_media

    def scrape_detail_page(self, url):
        """Scrape a single RSO detail page.

        Args:
            url: Address of the detail page to load with ``self.driver``.

        Returns:
            Dict with the keys ``'full_description'`` (str), ``'contact'``
            (dict), ``'additional_info'`` (dict) and, when at least one link
            was classified, ``'social_media'`` (dict). Returns ``None`` if any
            exception occurs; the error is logged rather than raised.
        """
        try:
            self.driver.get(url)
            time.sleep(self.delay)  # Allow page to load

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            details = {
                'full_description': '',
                'contact': {},
                'additional_info': {}
            }

            # Full description: the first <div> following the "About" heading.
            about_section = soup.find('div', string='About')
            if about_section:
                description_div = about_section.find_next('div')
                if description_div is not None:
                    details['full_description'] = self.clean_text(description_div.get_text())

            # NOTE(review): find('div', string=...) returns a <div> whose only
            # content is the heading text, so the nested look-ups below search
            # inside the heading element itself and presumably never match.
            # Confirm against the real page markup; they may need to walk the
            # heading's parent or following siblings instead.
            contact_section = soup.find('div', string='Contact Information')
            if contact_section:
                # Email is expected in a <div> whose text contains "E:".
                email = contact_section.find('div', string=lambda x: x and 'E:' in x)
                if email:
                    details['contact']['email'] = self.clean_text(email.get_text().replace('E:', ''))

                # Address, if available: the <div> following the "Address" label.
                address = contact_section.find('div', string='Address')
                if address:
                    address_div = address.find_next('div')
                    if address_div is not None:
                        details['contact']['address'] = self.clean_text(address_div.get_text())

            # Additional information: direct child <div>s alternate between a
            # key (recognised by "**" in its text) and the value that follows.
            additional_info = soup.find('div', string='Additional Information')
            if additional_info:
                current_key = None
                for div in additional_info.find_all('div', recursive=False):
                    text = div.get_text().strip()
                    if '**' in text:  # This is a key
                        current_key = self.clean_text(text)
                    elif current_key:  # This is a value
                        details['additional_info'][current_key] = self.clean_text(text)
                        current_key = None

            # Social media / website links from every anchor on the page.
            hrefs = [link['href'] for link in soup.find_all('a', href=True)]
            social_media = self._classify_links(hrefs)
            if social_media:
                details['social_media'] = social_media

            return details

        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return None

    def scrape_all_rsos(self):
        """Scrape details for all RSOs and save the enriched records.

        Loads the records from ``self.input_file``, starts the browser, merges
        the scraped details into every record that has a ``'full_url'`` and
        writes the full collection to ``self.output_file``. Records without a
        URL, or whose page could not be scraped, are written back unchanged.
        Any exception is logged instead of raised, and the browser is always
        shut down afterwards.
        """
        try:
            # Load existing RSO data (JSON is read as UTF-8 regardless of the
            # platform's default encoding).
            with open(self.input_file, 'r', encoding='utf-8') as f:
                rsos = json.load(f)

            logger.info(f"Loaded {len(rsos)} RSOs from {self.input_file}")
            self.setup_driver()

            # Process each RSO
            for i, rso in enumerate(rsos):
                url = rso.get('full_url')
                if not url:
                    continue

                logger.info(f"Processing RSO {i+1}/{len(rsos)}: {rso.get('name', 'Unknown')}")
                details = self.scrape_detail_page(url)

                if details:
                    rso.update(details)

                # Add a small delay between requests
                time.sleep(self.delay)

            # Save updated data to new file
            with open(self.output_file, 'w', encoding='utf-8') as f:
                json.dump(rsos, f, indent=2)

            logger.info(f"Successfully saved detailed RSO data to {self.output_file}")

        except Exception as e:
            logger.error(f"Error in scraping process: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()
                # Drop the reference so a later call never touches a dead session.
                self.driver = None


In [None]:
# Run the full scrape: build the scraper against the default input file, then
# visit every RSO detail page and write the enriched records to disk.
scraper = RSODetailScraper(input_file='rso_data_with_categories.json')
scraper.scrape_all_rsos()