In [2]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json
import logging
import sys
from typing import List, Dict
from aiohttp import ClientTimeout
from asyncio import Semaphore


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
class AsyncRSOScraper:
    def __init__(self, input_file='rso_data_with_categories.json', concurrent_limit=3):
        self.input_file = input_file
        self.semaphore = Semaphore(concurrent_limit)
        self.timeout = ClientTimeout(total=30)
        
    async def scrape_detail_page(self, session: aiohttp.ClientSession, url: str, rso: Dict) -> Dict:
        async with self.semaphore:
            try:
                async with session.get(url, timeout=self.timeout) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    
                    details = {
                        'full_description': '',
                        'contact': {},
                        'additional_info': {},
                        'social_media': {}
                    }
                    
                    # Description
                    if desc_div := soup.find('div', class_='bodyText-large userSupplied'):
                        details['full_description'] = ' '.join(p.get_text(strip=True) 
                            for p in desc_div.find_all('p') if p.get_text(strip=True))
                    
                    # Contact email
                    if email_div := soup.find('span', class_='sr-only', string='Contact Email'):
                        if email_text := email_div.parent.get_text():
                            if 'E:' in email_text:
                                details['contact']['email'] = email_text.split('E:')[1].strip().strip('"')
                    
                    # Social media
                    if website_link := soup.find('a', attrs={'aria-label': lambda x: x and 'Visit our site' in x}):
                        details['social_media']['website'] = website_link['href']
                    
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        if 'facebook.com' in href:
                            details['social_media']['facebook'] = href
                        elif 'instagram.com' in href:
                            details['social_media']['instagram'] = href
                    
                    # Additional info
                    if info_h2 := soup.find('h2', string=lambda x: x and 'Additional Information' in x):
                        if container := info_h2.parent.parent.find_next_sibling('div'):
                            for field_div in container.find_all('div', style=lambda x: x and 'padding-bottom: 8px; margin-left: 15px;' in x):
                                if label_div := field_div.find('div', style='font-weight: bold;'):
                                    if label := label_div.strong:
                                        divs = field_div.find_all('div', recursive=False)
                                        if len(divs) >= 2:
                                            if value_div := divs[1].find('div'):
                                                details['additional_info'][label.text.strip()] = value_div.text.strip()
                    
                    rso.update(details)
                    logger.info(f"Processed: {rso.get('name')}")
                    return rso
                    
            except Exception as e:
                logger.error(f"Error processing {url}: {e}")
                return rso

    async def scrape_all_rsos(self):
        async with aiohttp.ClientSession() as session:
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)
            
            tasks = [
                self.scrape_detail_page(session, rso['full_url'], rso)
                for rso in rsos if 'full_url' in rso
            ]
            
            results = await asyncio.gather(*tasks, return_exceptions=True)
            valid_results = [r for r in results if isinstance(r, dict)]
            
            with open('rso_data_detailed.json', 'w') as f:
                json.dump(valid_results, f, indent=2)

    async def test_scraping(self, num_test_pages=3):
        async with aiohttp.ClientSession() as session:
            with open(self.input_file, 'r') as f:
                rsos = json.load(f)[:num_test_pages]
            
            logger.info(f"Testing {num_test_pages} RSO pages...")
            
            for rso in rsos:
                if url := rso.get('full_url'):
                    logger.info(f"\nTesting: {rso.get('name', 'Unknown')}")
                    logger.info(f"URL: {url}")
                    
                    result = await self.scrape_detail_page(session, url, rso.copy())
                    if result:
                        logger.info("\nScraped Details:")
                        logger.info(f"Description: {result['full_description'][:200]}...")
                        logger.info(f"Contact: {result['contact']}")
                        logger.info(f"Social Media: {result.get('social_media', {})}")
                        logger.info(f"Additional Info: {result['additional_info']}")





In [8]:
scraper = AsyncRSOScraper()


await scraper.test_scraping()

INFO:__main__:Testing 3 RSO pages...
INFO:__main__:
Testing: A Cappella Council
INFO:__main__:URL: https://blueprint.uchicago.edu/organization/acacouncil
ERROR:__main__:Error processing https://blueprint.uchicago.edu/organization/acacouncil: Cannot connect to host blueprint.uchicago.edu:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')]
INFO:__main__:
Scraped Details:


KeyError: 'full_description'