In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import logging
from urllib.parse import urljoin

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [25]:
class RSODetailScraper:
    def __init__(self, input_file='rso_data_with_categories.json'):
        self.input_file = input_file
        self.driver = None
        
    def setup_driver(self):
        """Initialize Selenium WebDriver"""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        self.driver = webdriver.Chrome(options=options)
        
    def clean_text(self, text):
        """Clean text by removing extra whitespace and asterisks"""
        if not text:
            return ''
        return text.strip('* ').strip()
    
    def scrape_detail_page(self, url):
        """Scrape a single RSO detail page"""
        try:
            self.driver.get(url)
            time.sleep(1)  # Allow page to load
            
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            details = {
                'full_description': '',
                'contact': {},
                'additional_info': {}
            }
            
            # Get full description from the correct div class and all paragraph tags within
            description_div = soup.find('div', class_='bodyText-large userSupplied')
            if description_div:
                paragraphs = description_div.find_all('p')
                details['full_description'] = ' '.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            
            # Get contact information - updated for new structure
            contact_email_div = soup.find('span', class_='sr-only', string='Contact Email')
            if contact_email_div and contact_email_div.parent:
                email_text = contact_email_div.parent.get_text()
                # Extract the email from the text (everything after "E: ")
                if 'E:' in email_text:
                    email = email_text.split('E:')[1].strip().strip('"')
                    details['contact']['email'] = email
                
            # Get address if available
            address_div = soup.find('div', string='Address')
            if address_div and address_div.find_next('div'):
                details['contact']['address'] = self.clean_text(address_div.find_next('div').get_text())
            
            
            # Get social media links and website
            social_media = {}
            
            # Get website from aria-label
            website_link = soup.find('a', attrs={'aria-label': lambda x: x and 'Visit our site' in x})
            if website_link and website_link.get('href'):
                social_media['website'] = website_link['href']
            
            # Get social media links
            social_links = soup.find_all('a', href=True)
            for link in social_links:
                href = link['href']
                if 'facebook.com' in href:
                    social_media['facebook'] = href
                elif 'instagram.com' in href:
                    social_media['instagram'] = href
            
            if social_media:
                details['social_media'] = social_media


           # Get additional information
            additional_info_h2 = soup.find('h2', string=lambda x: x and 'Additional Information' in x)
            if additional_info_h2:
                # Get to the container div
                container = additional_info_h2.parent.parent.find_next_sibling('div')
                if container:
                    # Find all field divs by their specific style
                    field_divs = container.find_all('div', style=lambda x: x and 'padding-bottom: 8px; margin-left: 15px;' in x)
                    
                    for field_div in field_divs:
                        # Get the label from the strong tag
                        label_div = field_div.find('div', style='font-weight: bold;')
                        if label_div and label_div.strong:
                            label = label_div.strong.text.strip()
                            
                            # Get the value by getting the second div (skipping the label div)
                            divs = field_div.find_all('div', recursive=False)
                            if len(divs) >= 2:  # Make sure we have both label and value divs
                                value_div = divs[1].find('div')  # Get the inner div of the second div
                                if value_div:
                                    value = value_div.text.strip()
                                    details['additional_info'][label] = value
                    
                    logger.info(f"Extracted additional info: {details['additional_info']}")
            return details
            
        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return None
            
    
    def scrape_all_rsos(self):
        """Scrape details for all RSOs"""
        try:
            # Load existing RSO data
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)
            
            logger.info(f"Loaded {len(rsos)} RSOs from {self.input_file}")
            self.setup_driver()
            
            # Process each RSO
            for i, rso in enumerate(rsos):
                url = rso.get('full_url')
                if not url:
                    continue
                
                logger.info(f"Processing RSO {i+1}/{len(rsos)}: {rso.get('name', 'Unknown')}")
                details = self.scrape_detail_page(url)
                
                if details:
                    rso.update(details)
                
                # Add a small delay between requests
                time.sleep(1)
            
            # Save updated data to new file
            output_file = 'rso_data_detailed.json'
            with open(output_file, 'w') as f:
                json.dump(rsos, f, indent=2)
            
            logger.info(f"Successfully saved detailed RSO data to {output_file}")
            
        except Exception as e:
            logger.error(f"Error in scraping process: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()
    


    # test method
    def test_detail_scraping(self, num_test_pages=3):
        """Test the detail scraping on a few RSO pages and print results"""
        try:
            # Load existing RSO data
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)
            
            logger.info(f"Running test on {num_test_pages} RSO pages...")
            self.setup_driver()
            
            for i, rso in enumerate(rsos[:num_test_pages]):
                url = rso.get('full_url')
                if not url:
                    continue
                
                logger.info(f"\nTesting RSO {i+1}: {rso.get('name', 'Unknown')}")
                logger.info(f"URL: {url}")
                
                details = self.scrape_detail_page(url)
                
                # Print detailed results
                if details:
                    logger.info("\nScraped Details:")
                    logger.info(f"Description: {details['full_description'][:200]}...")
                    logger.info(f"Contact Info: {details['contact']}")
                    logger.info(f"Social Media: {details.get('social_media', {})}")
                    logger.info(f"Additional Info: {details['additional_info']}")
                else:
                    logger.error("Failed to scrape details")
                
                time.sleep(1)
                
        except Exception as e:
            logger.error(f"Error in test process: {str(e)}")
        finally:
            if self.driver:
                self.driver.quit()



In [None]:
scraper = RSODetailScraper()
scraper.scrape_all_rsos()

In [26]:
scraper = RSODetailScraper()
# Run test first
scraper.test_detail_scraping()

INFO:__main__:Running test on 3 RSO pages...
INFO:__main__:
Testing RSO 1: A Cappella Council
INFO:__main__:URL: https://blueprint.uchicago.edu/organization/acacouncil
INFO:__main__:Extracted additional info: {'RSO Advisor': 'Amie Bernstein Clark', 'Advising Model Categorization:': 'Green Group', 'Year Created:': '2009', 'Regular Meetings (Day/Time/Location):': 'No Response', 'RSO Listhost:': 'acacouncil@lists.uchicago.edu', 'This organization is affiliated with a parent/national/international organization.': 'No Response', 'Parent Organization Name and Website:': 'No Response'}
INFO:__main__:
Scraped Details:
INFO:__main__:Description: A council comprised of representatives from all groups to oversee a cappella activities on campus. Organizes interactions between groups, event scheduling, microphone usage, and arbitration of the aud...
INFO:__main__:Contact Info: {'email': 'uchicagoacappella@gmail.com', 'address': 'Contact Email E:  uchicagoacappella@gmail.com'}
INFO:__main__:Social M