In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from typing import Dict, List, Optional

class MayoClinicScraper:
    """
    Web scraper for Mayo Clinic supplements information.
    Extracts supplement details including safety info, interactions, and evidence.
    """
    
    BASE_URL = "https://www.mayoclinic.org"
    SUPPLEMENTS_PAGE = "https://www.mayoclinic.org/drugs-supplements"
    
    def __init__(self, delay: float = 2.0):
        """
        Initialize scraper with politeness delay between requests.
        
        Args:
            delay: Seconds to wait between requests (default 2.0)
        """
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.supplement_urls = {}
    
    def get_supplement_links(self) -> Dict[str, str]:
        """
        Scrape the main supplements page to get all supplement URLs with correct IDs.
        
        Returns:
            Dictionary mapping supplement names to their full URLs
        """
        print("Fetching supplement links from main page...")
        
        try:
            response = self.session.get(self.SUPPLEMENTS_PAGE, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the "Herbs, supplements and vitamins" section
            supplement_section = soup.find('h2', string=re.compile(r'Herbs, supplements and vitamins', re.I))
            
            if not supplement_section:
                print("Warning: Could not find supplements section")
                return {}
            
            # Find all links in the list after this heading
            supplement_links = {}
            current = supplement_section.find_next('ul')
            
            if current:
                for link in current.find_all('a', href=True):
                    href = link['href']
                    name = link.get_text(strip=True)
                    
                    # Only include links that match the pattern
                    if '/drugs-supplements-' in href and '/art-' in href:
                        full_url = href if href.startswith('http') else self.BASE_URL + href
                        supplement_links[name] = full_url
            
            print(f"Found {len(supplement_links)} supplement links")
            self.supplement_urls = supplement_links
            return supplement_links
            
        except requests.RequestException as e:
            print(f"Error fetching supplement links: {e}")
            return {}
    
    def scrape_supplement(self, supplement_name: str, url: str = None) -> Optional[Dict]:
        """
        Scrape data for a single supplement.
        
        Args:
            supplement_name: Display name of supplement (e.g., 'Melatonin')
            url: Full URL to scrape (optional, uses stored URL if available)
            
        Returns:
            Dictionary containing supplement data or None if scraping fails
        """
        if url is None:
            if supplement_name in self.supplement_urls:
                url = self.supplement_urls[supplement_name]
            else:
                print(f"No URL found for {supplement_name}. Run get_supplement_links() first.")
                return None
        
        print(f"Scraping: {supplement_name}")
        
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            data = {
                'name': supplement_name,
                'url': url,
                'display_name': self._extract_display_name(soup, supplement_name),
                'category': self._extract_category(soup),
                'overview': self._extract_overview(soup),
                'research_findings': self._extract_research(soup),
                'safety_rating': self._extract_safety_rating(soup),
                'side_effects': self._extract_side_effects(soup),
                'interactions': self._extract_interactions(soup),
                'contraindications': self._extract_contraindications(soup),
                'usage_notes': self._extract_usage_notes(soup)
            }
            
            return data
            
        except requests.RequestException as e:
            print(f"Error scraping {supplement_name}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error for {supplement_name}: {e}")
            return None
    
    def _extract_display_name(self, soup: BeautifulSoup, fallback: str) -> str:
        """Extract the display name of the supplement."""
        # Try h1 tag first
        h1 = soup.find('h1')
        if h1:
            return h1.get_text(strip=True)
        
        # Fallback to title tag
        title = soup.find('title')
        if title:
            title_text = title.get_text(strip=True)
            # Remove "- Mayo Clinic" suffix if present
            return title_text.split('-')[0].strip()
        
        # Final fallback: format the URL name
        return fallback.replace('-', ' ').title()
    
    def _extract_category(self, soup: BeautifulSoup) -> str:
        """Determine supplement category (vitamin, mineral, herb, etc.)."""
        text = soup.get_text().lower()
        
        if 'vitamin' in text[:500]:
            return 'Vitamin'
        elif 'mineral' in text[:500]:
            return 'Mineral'
        elif any(word in text[:500] for word in ['herb', 'botanical', 'plant']):
            return 'Herb'
        elif 'amino acid' in text[:500]:
            return 'Amino Acid'
        elif 'hormone' in text[:500]:
            return 'Hormone'
        else:
            return 'Other Supplement'
    
    def _extract_overview(self, soup: BeautifulSoup) -> str:
        """Extract overview/description section."""
        # Look for the "Overview" heading
        overview_heading = soup.find('h3', string=re.compile(r'Overview', re.I))
        if overview_heading:
            content = []
            # Get all paragraphs until the next h3
            for sibling in overview_heading.find_next_siblings():
                if sibling.name == 'h3':
                    break
                if sibling.name == 'p':
                    text = sibling.get_text(strip=True)
                    if text:
                        content.append(text)
            return ' '.join(content)
        return ''
    
    def _extract_research(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract research findings for specific conditions."""
        # Look for "What the research says" heading
        research_heading = soup.find('h3', string=re.compile(r'What the research says', re.I))
        findings = []
        
        if research_heading:
            # Get the next sibling (usually contains the list)
            next_elem = research_heading.find_next_sibling()
            
            if next_elem and next_elem.name == 'ul':
                # Each <li> contains a condition study
                for li in next_elem.find_all('li', recursive=False):
                    # The condition is in bold/strong at the start
                    strong_tag = li.find(['strong', 'b'])
                    if strong_tag:
                        condition = strong_tag.get_text(strip=True).rstrip('.')
                        # Get the full text and remove the condition name
                        full_text = li.get_text(strip=True)
                        evidence = full_text.replace(condition, '', 1).strip()
                        
                        findings.append({
                            'condition': condition,
                            'evidence': evidence
                        })
            
            # Also check for paragraphs after the heading (some pages use this format)
            if not findings:
                for sibling in research_heading.find_next_siblings():
                    if sibling.name == 'h3':
                        break
                    if sibling.name == 'p':
                        text = sibling.get_text(strip=True)
                        # Try to extract condition from bold text
                        bold = sibling.find(['strong', 'b'])
                        if bold:
                            condition = bold.get_text(strip=True).rstrip('.')
                            evidence = text.replace(condition, '', 1).strip()
                            findings.append({
                                'condition': condition,
                                'evidence': evidence
                            })
        
        return findings
    
    def _extract_safety_rating(self, soup: BeautifulSoup) -> Dict:
        """Extract Mayo Clinic's safety rating (green/yellow/red light)."""
        # Look for "Our take" heading
        our_take = soup.find('h3', string=re.compile(r'Our take', re.I))
        rating = {'rating': '', 'explanation': ''}
        
        if our_take:
            # Look for the rating in h4 (e.g., "Generally safe")
            rating_h4 = our_take.find_next_sibling('h4')
            if rating_h4:
                rating_text = rating_h4.get_text(strip=True)
                
                # Determine color based on text
                if 'generally safe' in rating_text.lower() or 'safe' in rating_text.lower():
                    rating['rating'] = f'Green Light - {rating_text}'
                elif 'caution' in rating_text.lower():
                    rating['rating'] = f'Yellow Light - {rating_text}'
                elif 'not recommended' in rating_text.lower():
                    rating['rating'] = f'Red Light - {rating_text}'
                else:
                    rating['rating'] = rating_text
            
            # Get explanation from next paragraph
            for sibling in our_take.find_next_siblings():
                if sibling.name == 'h3':
                    break
                if sibling.name == 'p':
                    text = sibling.get_text(strip=True)
                    if text and len(text) > 20:
                        rating['explanation'] = text
                        break
        
        return rating
    
    def _extract_side_effects(self, soup: BeautifulSoup) -> List[str]:
        """Extract side effects and adverse reactions."""
        # Look for "Safety and side effects" heading
        safety_heading = soup.find('h3', string=re.compile(r'Safety and side effects', re.I))
        side_effects = []
        
        if safety_heading:
            for sibling in safety_heading.find_next_siblings():
                if sibling.name == 'h3':
                    break
                
                # Look for unordered lists
                if sibling.name == 'ul':
                    for li in sibling.find_all('li', recursive=False):
                        effect = li.get_text(strip=True)
                        if effect:
                            side_effects.append(effect)
                
                # Also check paragraphs for side effects
                elif sibling.name == 'p':
                    text = sibling.get_text(strip=True)
                    # Keep informative paragraphs about side effects
                    if text and ('can cause' in text.lower() or 'side effect' in text.lower() or 
                                'might include' in text.lower() or 'may cause' in text.lower()):
                        side_effects.append(text)
        
        return side_effects
    
    def _extract_interactions(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract drug-supplement interactions."""
        # Look for "Interactions" heading
        interactions_heading = soup.find('h3', string=re.compile(r'Interactions', re.I))
        interactions = []
        
        if interactions_heading:
            # Check if there's a "Possible drug interactions include:" paragraph
            intro_para = interactions_heading.find_next_sibling('p')
            
            # Look for the list of interactions
            ul_elem = interactions_heading.find_next_sibling('ul')
            if not ul_elem and intro_para:
                ul_elem = intro_para.find_next_sibling('ul')
            
            if ul_elem:
                for li in ul_elem.find_all('li', recursive=False):
                    # The drug class is usually in bold/strong
                    strong_tag = li.find(['strong', 'b'])
                    if strong_tag:
                        drug_class = strong_tag.get_text(strip=True).rstrip('.')
                        # Get the full text and remove drug class
                        full_text = li.get_text(strip=True)
                        description = full_text.replace(drug_class, '', 1).strip()
                        
                        interactions.append({
                            'drug_class': drug_class,
                            'description': description,
                            'severity': self._infer_severity(description)
                        })
        
        return interactions
    
    def _extract_contraindications(self, soup: BeautifulSoup) -> List[str]:
        """Extract contraindications (when NOT to use)."""
        contraindications = []
        
        # Check "Safety and side effects" section for "don't use" statements
        safety_heading = soup.find('h3', string=re.compile(r'Safety and side effects', re.I))
        if safety_heading:
            for sibling in safety_heading.find_next_siblings():
                if sibling.name == 'h3':
                    break
                
                if sibling.name == 'p':
                    text = sibling.get_text(strip=True)
                    text_lower = text.lower()
                    # Look for contraindication phrases
                    if any(phrase in text_lower for phrase in ["don't use", "do not use", "avoid", 
                                                                 "should not", "shouldn't"]):
                        contraindications.append(text)
        
        return contraindications
    
    def _extract_usage_notes(self, soup: BeautifulSoup) -> str:
        """Extract general usage notes and recommendations."""
        notes = []
        
        # Check "Our take" section
        our_take = soup.find('h3', string=re.compile(r'Our take', re.I))
        if our_take:
            for sibling in our_take.find_next_siblings():
                if sibling.name == 'h3':
                    break
                if sibling.name == 'p':
                    text = sibling.get_text(strip=True)
                    if text and len(text) > 30:
                        notes.append(text)
        
        return ' '.join(notes)
    
    def _infer_severity(self, description: str) -> str:
        """Infer interaction severity from description text."""
        text_lower = description.lower()
        
        if any(word in text_lower for word in ['serious', 'severe', 'dangerous', 'life-threatening', 'major']):
            return 'Major'
        elif any(word in text_lower for word in ['moderate', 'increase', 'worsen', 'reduce effectiveness']):
            return 'Moderate'
        else:
            return 'Minor'
    
    def scrape_all_supplements(self) -> pd.DataFrame:
        """
        Scrape all supplements from the main page and return as DataFrame.
        First fetches all supplement URLs, then scrapes each one.
        
        Returns:
            pandas DataFrame with all supplement data
        """
        # First, get all the supplement links
        if not self.supplement_urls:
            self.get_supplement_links()
            time.sleep(self.delay)  # Be polite after fetching main page
        
        if not self.supplement_urls:
            print("Error: No supplement URLs found. Cannot proceed.")
            return pd.DataFrame()
        
        all_data = []
        total = len(self.supplement_urls)
        
        for idx, (name, url) in enumerate(self.supplement_urls.items(), 1):
            print(f"[{idx}/{total}] ", end="")
            data = self.scrape_supplement(name, url)
            if data:
                all_data.append(data)
            
            # Be polite - wait between requests
            if idx < total:  # Don't wait after the last one
                time.sleep(self.delay)
        
        print(f"\nSuccessfully scraped {len(all_data)}/{total} supplements")
        return pd.DataFrame(all_data)
    
    def save_to_csv(self, df: pd.DataFrame, filename: str = 'mayo_clinic_supplements_raw.csv'):
        """Save scraped data to CSV file."""
        # Convert list/dict columns to strings for CSV compatibility
        df_copy = df.copy()
        for col in df_copy.columns:
            if df_copy[col].dtype == 'object':
                df_copy[col] = df_copy[col].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
        
        df_copy.to_csv(filename, index=False, encoding='utf-8')
        print(f"Data saved to {filename}")


# Example usage
if __name__ == "__main__":
    # Initialize scraper with 2-second delay between requests
    scraper = MayoClinicScraper(delay=2.0)
    
    # First, get all supplement links from the main page
    print("Step 1: Fetching supplement URLs...")
    print("="*50)
    links = scraper.get_supplement_links()
    
    if links:
        print("\nFound supplements:")
        for name, url in list(links.items())[:5]:  # Show first 5
            print(f"  - {name}: {url}")
        print(f"  ... and {len(links)-5} more\n")
    
    # Option 1: Scrape a single supplement for testing
    print("="*50)
    print("Step 2: Testing with single supplement (Melatonin):")
    print("="*50)
    if 'Melatonin' in scraper.supplement_urls:
        melatonin_data = scraper.scrape_supplement('Melatonin')
        if melatonin_data:
            print("\nSample data structure:")
            for key, value in melatonin_data.items():
                if isinstance(value, str):
                    preview = value[:100] + "..." if len(value) > 100 else value
                    print(f"{key}: {preview}")
                else:
                    print(f"{key}: {type(value).__name__} with {len(value) if hasattr(value, '__len__') else 'N/A'} items")
    
    #Option 2: Scrape all supplements (uncomment to run)
    print("\n" + "="*50)
    print("Step 3: Scraping all supplements...")
    print("="*50 + "\n")
    df = scraper.scrape_all_supplements()
    scraper.save_to_csv(df)
    print("\nFirst few rows:")
    print(df[['name', 'category', 'safety_rating']].head())

Step 1: Fetching supplement URLs...
Fetching supplement links from main page...
Found 28 supplement links

Found supplements:
  - Acidophilus: https://www.mayoclinic.org/drugs-supplements-acidophilus/art-20361967
  - Aloe: https://www.mayoclinic.org/drugs-supplements-aloe/art-20362267
  - Coenzyme Q10: https://www.mayoclinic.org/drugs-supplements-coenzyme-q10/art-20362602
  - Creatine: https://www.mayoclinic.org/drugs-supplements-creatine/art-20347591
  - DHEA: https://www.mayoclinic.org/drugs-supplements-dhea/art-20364199
  ... and 23 more

Step 2: Testing with single supplement (Melatonin):
Scraping: Melatonin

Sample data structure:
name: Melatonin
url: https://www.mayoclinic.org/drugs-supplements-melatonin/art-20363071
display_name: Melatonin
category: Other Supplement
overview: Melatonin is a hormone in your body that plays a role in sleep. The production and release of melato...
research_findings: list with 0 items
safety_rating: dict with 2 items
side_effects: list with 7 items
