Data collection for AI inequality research.

Sources:
- Bureau of Labor Statistics data
- AI adoption rates across industries from multiple sources
- Economic indicators and metrics

#### 1. Imports and Setup


In [None]:
import pandas as pd
import requests
import json
from datetime import datetime
import logging
from typing import Dict, List, Optional

# Set up logging
# basicConfig installs a default handler on the root logger at INFO level; it
# only has an effect while the root logger has no handlers configured yet.
logging.basicConfig(level=logging.INFO)
# Logger used by the scraping functions defined in the cells below.
logger = logging.getLogger(__name__)

# Visible confirmation that this setup cell has been executed.
print("Imports and setup completed!")

#### 3. Scraping Data

##### 3.1 Scraping Labor Statistics API

In [None]:
# =============================================================================
# CELL: BLS Data Scraping - Standalone
# =============================================================================

"""
Bureau of Labor Statistics (BLS) Data Scraper

This standalone script scrapes employment data from the BLS API.
It retrieves employment statistics, wage data, and occupational information
without the overhead of class-based architecture.
"""

import pandas as pd
import requests
import json
from datetime import datetime
import logging
from typing import List, Optional

# Set up logging
# Repeated so this cell can run standalone. If an earlier cell already
# configured the root logger, this basicConfig call does nothing.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def scrape_bls_employment_data(series_ids: List[str], api_key: Optional[str] = None,
                               timeout: float = 30.0) -> pd.DataFrame:
    """
    Scrape employment data from BLS API for specified series IDs.

    The request covers 2020 through the current calendar year. Every failure
    mode (network error, non-2xx status, malformed JSON, API-level rejection)
    is logged and reported as an empty DataFrame instead of being raised.

    Args:
        series_ids: List of BLS series IDs to retrieve (e.g., ['CES0000000001', 'CES0500000003'])
        api_key: BLS API key (optional for public data)
        timeout: Seconds to wait for the BLS server before giving up

    Returns:
        DataFrame with columns: series_id, year, period, periodName, value,
        footnotes. ``value`` is numeric (NaN where the API value cannot be
        parsed). The frame is empty on failure or when no data came back.
    """
    base_url = "https://api.bls.gov/publicAPI/v2"
    columns = ['series_id', 'year', 'period', 'periodName', 'value', 'footnotes']

    if not series_ids:
        # Nothing to ask for; avoid a pointless round trip to the API.
        logger.warning("No BLS series IDs supplied; nothing to request")
        return pd.DataFrame(columns=columns)

    # Prepare the request payload
    payload = {
        "seriesid": list(series_ids),
        "startyear": "2020",
        "endyear": str(datetime.now().year),
    }
    if api_key:
        # Only send a registration key when one was actually provided,
        # rather than submitting an empty string as the key.
        payload["registrationkey"] = api_key

    try:
        logger.info(f"Requesting BLS data for series: {series_ids}")

        # Without a timeout, requests can block forever on a stalled server.
        response = requests.post(f"{base_url}/timeseries/data/", json=payload, timeout=timeout)
        response.raise_for_status()  # Raise exception for bad status codes

        data = response.json()

        if data.get('status') != 'REQUEST_SUCCEEDED':
            logger.error(f"BLS API request failed: {data.get('message', 'Unknown error')}")
            return pd.DataFrame()

        # Flatten the nested series -> observations structure into rows.
        results = [
            {
                'series_id': series.get('seriesID', ''),
                'year': item.get('year', ''),
                'period': item.get('period', ''),
                'periodName': item.get('periodName', ''),
                'value': item.get('value', ''),
                'footnotes': [note.get('text', '') for note in item.get('footnotes', [])],
            }
            for series in data.get('Results', {}).get('series', [])
            for item in series.get('data', [])
        ]

        # Passing explicit columns keeps the frame well-formed even when the
        # API succeeds but returns no observations (previously a KeyError).
        df = pd.DataFrame(results, columns=columns)

        # Non-numeric placeholders such as 'null' are coerced to NaN.
        df['value'] = pd.to_numeric(df['value'], errors='coerce')

        logger.info(f"Successfully retrieved {len(df)} data points")
        return df

    except requests.exceptions.RequestException as e:
        logger.error(f"Network error when requesting BLS data: {e}")
        return pd.DataFrame()
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing BLS API response: {e}")
        return pd.DataFrame()
    except Exception as e:
        # Best-effort contract: callers only ever check `.empty`.
        logger.error(f"Unexpected error: {e}")
        return pd.DataFrame()

def get_common_bls_series() -> dict:
    """
    Get a dictionary of common BLS series IDs with their descriptions.

    Every key is unique. The previous version listed 'CES0500000003' twice,
    so the private-sector employment entry was silently overwritten by the
    hourly-earnings entry. Private-sector employment now uses the '01'
    (all employees) data-type suffix like the other employment series.

    Returns:
        Dictionary mapping series IDs to descriptions
    """
    return {
        # Employment levels (series IDs ending in 01)
        'CES0000000001': 'Total Nonfarm Employment',
        'CES0500000001': 'Private Sector Employment',
        'CES3000000001': 'Manufacturing Employment',
        'CES5000000001': 'Information Services Employment',
        'CES6000000001': 'Professional and Business Services Employment',

        # Earnings and hours for the private sector
        'CES0500000003': 'Average Hourly Earnings - Private Sector',
        'CES0500000002': 'Average Weekly Hours - Private Sector',
    }

# Example usage and testing
# Runs only when this cell/script is executed directly. It performs a live
# request against the BLS API, so it needs network access.
if __name__ == "__main__":
    # Get common series IDs
    common_series = get_common_bls_series()
    print("Common BLS Series IDs:")
    for series_id, description in common_series.items():
        print(f"  {series_id}: {description}")

    # Example: Scrape data for total nonfarm employment
    print("\n" + "="*50)
    print("Scraping BLS Employment Data...")
    print("="*50)

    # You can replace these with your own series IDs
    test_series = ['CES0000000001', 'CES3000000001']  # Total nonfarm and manufacturing

    # Scrape the data (no API key is passed here)
    employment_df = scrape_bls_employment_data(test_series)

    # The scraper reports every failure as an empty DataFrame, so emptiness is
    # the only signal available here.
    if not employment_df.empty:
        print(f"\nRetrieved {len(employment_df)} data points")
        print("\nFirst 10 rows of data:")
        print(employment_df.head(10))

        # Basic statistics
        print(f"\nData summary:")
        # 'year' is stored as returned by the API (not converted by the
        # scraper), so min/max compare the values in that stored form.
        print(f"Date range: {employment_df['year'].min()} - {employment_df['year'].max()}")
        print(f"Series IDs: {employment_df['series_id'].unique()}")

        # Save to CSV (optional)
        # employment_df.to_csv('bls_employment_data.csv', index=False)
        # print("\nData saved to 'bls_employment_data.csv'")

    else:
        print("No data retrieved. Check your series IDs and internet connection.")

# Printed unconditionally, including when the code is imported as a module.
print("BLS scraping script ready to use!")

##### 3.2 Scraping AI Adoption Trackers from Websites

###### Scraping per source

In [None]:
# SPECIFIC SCRAPING

"""
AI Adoption Data Scraper - Real Data Collection

This script scrapes actual AI adoption data from various sources including
research reports, surveys, and public datasets. It collects real metrics
on AI adoption rates across different industries and time periods.
"""

import pandas as pd
import requests
import json
from datetime import datetime
import logging
from typing import Dict, List, Optional
import time
from bs4 import BeautifulSoup
import re

# Set up logging
# Repeated so this cell can run standalone. If an earlier cell already
# configured the root logger, this basicConfig call does nothing.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Browser-like request headers shared by every per-source scraper below.
_SOURCE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# A percentage figure. The lookbehind forces the capture to begin at the first
# digit of the number; without it a greedy ``.*`` in front of the group eats
# the leading digits and "55 percent" is captured as "5".
_SOURCE_PCT = r'(?<![\d.])(\d+(?:\.\d+)?)'

# Standalone tokens. With re.IGNORECASE a bare "AI"/"ML" would also match
# inside ordinary words such as "said" or "html".
_SOURCE_AI = r'\bAI\b'
_SOURCE_ML = r'\bML\b'


def _scrape_adoption_rates(urls: List[str], patterns: List[str], source: str,
                           industry: str, data_type: str) -> pd.DataFrame:
    """
    Fetch each URL and record every percentage matched by ``patterns``.

    Args:
        urls: Pages to download
        patterns: Regexes whose first capture group is the percentage figure
        source: Value stored in the 'source' column
        industry: Value stored in the 'industry' column
        data_type: Value stored in the 'data_type' column

    Returns:
        DataFrame with columns source, year, adoption_rate (fraction of 1),
        industry, data_type, url. Empty when nothing could be scraped.
    """
    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    records = []

    for url in urls:
        try:
            response = requests.get(url, headers=_SOURCE_HEADERS, timeout=10)
            response.raise_for_status()

            text_content = BeautifulSoup(response.content, 'html.parser').get_text()

            # The report year is taken from the URL; fall back to "now".
            year_match = re.search(r'202[0-9]', url)
            year = int(year_match.group()) if year_match else datetime.now().year

            # One figure in the text can satisfy several patterns. Track the
            # position of each captured number so it is recorded only once.
            seen_spans = set()
            for regex in compiled:
                for found in regex.finditer(text_content):
                    span = found.span(1)
                    if span in seen_spans:
                        continue
                    seen_spans.add(span)
                    records.append({
                        'source': source,
                        'year': year,
                        'adoption_rate': float(found.group(1)) / 100,  # percent -> fraction
                        'industry': industry,
                        'data_type': data_type,
                        'url': url
                    })

            time.sleep(1)  # Be respectful to the server

        except Exception as e:
            # Best effort: a failing page must not abort the remaining URLs.
            logger.warning(f"Error scraping {url}: {e}")
            continue

    return pd.DataFrame(records)


def scrape_mckinsey_ai_adoption() -> pd.DataFrame:
    """
    Scrape AI adoption data from McKinsey Global Institute reports.

    Returns:
        DataFrame containing AI adoption data from McKinsey sources
    """
    logger.info("Scraping McKinsey AI adoption data...")

    mckinsey_urls = [
        "https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai-in-2023-generative-ais-breakout-year",
        "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-state-of-ai-in-2022-and-a-half-decade-in-review"
    ]

    # Simplified text heuristics; real report parsing would need more structure.
    adoption_patterns = [
        rf'{_SOURCE_PCT}\s*percent.*adopt.*{_SOURCE_AI}',
        rf'{_SOURCE_AI}.*adoption.*{_SOURCE_PCT}\s*percent',
        rf'{_SOURCE_PCT}\s*%.*organizations.*{_SOURCE_AI}',
        rf'{_SOURCE_PCT}\s*%.*companies.*{_SOURCE_AI}'
    ]

    return _scrape_adoption_rates(
        mckinsey_urls, adoption_patterns,
        source='mckinsey_global_institute', industry='cross_industry', data_type='survey'
    )


def scrape_gartner_ai_adoption() -> pd.DataFrame:
    """
    Scrape AI adoption data from Gartner research reports.

    Returns:
        DataFrame containing AI adoption data from Gartner sources
    """
    logger.info("Scraping Gartner AI adoption data...")

    gartner_urls = [
        "https://www.gartner.com/en/newsroom/press-releases/2023-08-15-gartner-identifies-four-emerging-technologies-that-will-transform-business-outcomes",
        "https://www.gartner.com/en/newsroom/press-releases/2022-08-16-gartner-identifies-four-emerging-technologies-that-will-transform-business-outcomes"
    ]

    adoption_patterns = [
        rf'{_SOURCE_PCT}\s*percent.*{_SOURCE_AI}.*adoption',
        rf'{_SOURCE_AI}.*adoption.*{_SOURCE_PCT}\s*percent',
        rf'{_SOURCE_PCT}\s*%.*enterprises.*{_SOURCE_AI}',
        rf'{_SOURCE_PCT}\s*%.*organizations.*{_SOURCE_AI}'
    ]

    return _scrape_adoption_rates(
        gartner_urls, adoption_patterns,
        source='gartner_research', industry='cross_industry', data_type='research'
    )


def scrape_world_economic_forum_ai_data() -> pd.DataFrame:
    """
    Scrape AI adoption data from World Economic Forum reports.

    Returns:
        DataFrame containing AI adoption data from WEF sources
    """
    logger.info("Scraping World Economic Forum AI adoption data...")

    wef_urls = [
        "https://www.weforum.org/agenda/2023/01/ai-adoption-business-2023/",
        "https://www.weforum.org/agenda/2022/01/artificial-intelligence-adoption-business/"
    ]

    adoption_patterns = [
        rf'{_SOURCE_PCT}\s*percent.*{_SOURCE_AI}.*adoption',
        rf'{_SOURCE_AI}.*adoption.*{_SOURCE_PCT}\s*percent',
        rf'{_SOURCE_PCT}\s*%.*businesses.*{_SOURCE_AI}',
        rf'{_SOURCE_PCT}\s*%.*companies.*{_SOURCE_AI}'
    ]

    return _scrape_adoption_rates(
        wef_urls, adoption_patterns,
        source='world_economic_forum', industry='cross_industry', data_type='survey'
    )


def scrape_github_ai_adoption_data() -> pd.DataFrame:
    """
    Scrape AI adoption data from GitHub's State of the Octoverse reports.

    Returns:
        DataFrame containing AI adoption data from GitHub sources
    """
    logger.info("Scraping GitHub AI adoption data...")

    github_urls = [
        "https://octoverse.github.com/2023/state-of-open-source",
        "https://octoverse.github.com/2022/state-of-open-source"
    ]

    adoption_patterns = [
        rf'{_SOURCE_PCT}\s*percent.*{_SOURCE_AI}.*{_SOURCE_ML}',
        rf'{_SOURCE_AI}.*{_SOURCE_ML}.*{_SOURCE_PCT}\s*percent',
        rf'{_SOURCE_PCT}\s*%.*machine.*learning',
        rf'{_SOURCE_PCT}\s*%.*artificial.*intelligence'
    ]

    return _scrape_adoption_rates(
        github_urls, adoption_patterns,
        source='github_octoverse', industry='technology', data_type='open_source'
    )

def collect_all_ai_adoption_data() -> pd.DataFrame:
    """
    Run every per-source scraper and stack their results into one frame.

    A scraper that raises is logged and skipped, and scrapers that return an
    empty frame contribute nothing.

    Returns:
        Combined DataFrame with all AI adoption data (empty if no scraper
        produced any rows)
    """
    logger.info("Starting comprehensive AI adoption data collection...")

    scrapers = (
        scrape_mckinsey_ai_adoption,
        scrape_gartner_ai_adoption,
        scrape_world_economic_forum_ai_data,
        scrape_github_ai_adoption_data,
    )

    frames = []
    for scraper in scrapers:
        try:
            frame = scraper()
            if frame.empty:
                continue
            frames.append(frame)
            logger.info(f"Collected {len(frame)} records from {scraper.__name__}")
        except Exception as e:
            logger.error(f"Error collecting data from {scraper.__name__}: {e}")

    # Guard clause: nothing usable came back from any source.
    if not frames:
        logger.warning("No data collected from any source")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    logger.info(f"Total records collected: {len(merged)}")
    return merged

def save_ai_adoption_data(df: pd.DataFrame, filename: str = "ai_adoption_data.csv") -> None:
    """
    Write the adoption DataFrame to a CSV file without the index column.

    Failures are logged and swallowed; nothing is raised to the caller.

    Args:
        df: DataFrame containing AI adoption data
        filename: Output filename
    """
    try:
        df.to_csv(filename, index=False)
        logger.info(f"AI adoption data saved to {filename}")
    except Exception as err:
        # Best effort: report the problem and carry on.
        logger.error(f"Error saving data: {err}")

# Example usage and testing
# Runs only when this cell/script is executed directly. It calls the live
# per-source scrapers, so it needs network access.
if __name__ == "__main__":
    print("="*70)
    print("AI ADOPTION DATA SCRAPER - REAL DATA COLLECTION")
    print("="*70)

    # Collect all AI adoption data
    print("\nCollecting AI adoption data from multiple sources...")
    ai_adoption_df = collect_all_ai_adoption_data()

    # An empty frame is the only failure signal the collector exposes.
    if not ai_adoption_df.empty:
        print(f"\n✓ Successfully collected {len(ai_adoption_df)} data points")
        print(f"✓ Sources: {ai_adoption_df['source'].unique()}")
        print(f"✓ Years: {sorted(ai_adoption_df['year'].unique())}")
        print(f"✓ Industries: {ai_adoption_df['industry'].unique()}")

        # Show sample data
        print("\nSample data collected:")
        print(ai_adoption_df.head(10))

        # Summary statistics
        # adoption_rate is stored as a fraction (percent / 100), hence the
        # '%' format spec.
        print(f"\nSummary Statistics:")
        print(f"   Average adoption rate: {ai_adoption_df['adoption_rate'].mean():.1%}")
        print(f"   Highest adoption rate: {ai_adoption_df['adoption_rate'].max():.1%}")
        print(f"   Lowest adoption rate: {ai_adoption_df['adoption_rate'].min():.1%}")

        # Data by source: row count, mean and standard deviation per source
        print(f"\nData by source:")
        source_summary = ai_adoption_df.groupby('source').agg({
            'adoption_rate': ['count', 'mean', 'std']
        }).round(3)
        print(source_summary)

        # Save the data (written to the default filename of
        # save_ai_adoption_data)
        print(f"\nSaving data...")
        save_ai_adoption_data(ai_adoption_df)

    else:
        print("✗ No data collected. This might be due to:")
        print("   - Network connectivity issues")
        print("   - Website structure changes")
        print("   - Rate limiting by websites")
        print("   - Need for more sophisticated scraping techniques")

# Printed unconditionally, including when the code is imported as a module.
print("\nAI Adoption Data Scraper ready to use!")

###### Universal Scraper

In [None]:
# =============================================================================
# CELL: Universal AI Adoption Data Scraper
# =============================================================================

"""
Universal AI Adoption Data Scraper

This script can scrape AI adoption data from any URL by using intelligent
pattern matching to detect adoption rates, percentages, and statistics
without needing separate parsers for each website.
"""

import pandas as pd
import requests
import json
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import time
from bs4 import BeautifulSoup
import re
import urllib.parse

def extract_year_from_url_or_content(url: str, soup: BeautifulSoup) -> int:
    """
    Extract year from URL or webpage content.

    Lookup order:
      1. a four-digit year (2010-2029) in the URL;
      2. a year that follows "published", "updated" or "©" on the same line;
      3. the first year mentioned anywhere in the page text;
      4. the current calendar year.

    Args:
        url: The webpage URL
        soup: BeautifulSoup object of the page (None is treated as a page
            with no text)

    Returns:
        Year as integer, defaults to current year if not found
    """
    # The lookarounds stop the year from matching inside a longer digit run
    # such as an article id ("120231").
    year_token = r'(?<!\d)(20[12]\d)(?!\d)'

    url_hit = re.search(year_token, url)
    if url_hit:
        return int(url_hit.group(1))

    text_content = soup.get_text() if soup is not None else ""

    # Specific cues come first so they take priority over an arbitrary year
    # mentioned earlier in the text. Each pattern captures the year itself,
    # so the conversion to int below cannot fail on surrounding words.
    content_patterns = [
        r'published.*?' + year_token,  # Published in year
        r'updated.*?' + year_token,    # Updated in year
        r'©.*?' + year_token,          # Copyright year
        year_token,                    # Any year 2010-2029
    ]

    for pattern in content_patterns:
        hit = re.search(pattern, text_content, re.IGNORECASE)
        if hit:
            return int(hit.group(1))

    return datetime.now().year

# A percentage figure. The lookbehind forces the capture to begin at the first
# digit of the number; without it a greedy ``.*`` in front of the group eats
# the leading digits and "55 percent" is captured as "5".
_RATE_TOKEN = r'(?<![\d.])(\d+(?:\.\d+)?)'

# "AI" as a standalone token. With re.IGNORECASE a bare "AI" would also match
# inside ordinary words such as "said" or "available".
_AI_TOKEN = r'\bAI\b'

# (compiled regex, label) pairs, tried in this order. Group 1 is the figure.
_AI_ADOPTION_PATTERNS = [
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in [
        # Direct percentage patterns
        (rf'{_RATE_TOKEN}\s*percent.*{_AI_TOKEN}.*adoption', 'percent_ai_adoption'),
        (rf'{_AI_TOKEN}.*adoption.*{_RATE_TOKEN}\s*percent', 'ai_adoption_percent'),
        (rf'{_RATE_TOKEN}\s*%.*organizations.*{_AI_TOKEN}', 'percent_orgs_ai'),
        (rf'{_RATE_TOKEN}\s*%.*companies.*{_AI_TOKEN}', 'percent_companies_ai'),
        (rf'{_RATE_TOKEN}\s*%.*businesses.*{_AI_TOKEN}', 'percent_businesses_ai'),
        (rf'{_RATE_TOKEN}\s*%.*enterprises.*{_AI_TOKEN}', 'percent_enterprises_ai'),

        # Adoption rate patterns
        (rf'adoption.*rate.*{_RATE_TOKEN}\s*percent', 'adoption_rate_percent'),
        (rf'adoption.*rate.*{_RATE_TOKEN}\s*%', 'adoption_rate_percent'),
        (rf'{_RATE_TOKEN}\s*%.*adoption.*rate', 'percent_adoption_rate'),

        # Implementation patterns
        (rf'{_RATE_TOKEN}\s*percent.*implement.*{_AI_TOKEN}', 'percent_implement_ai'),
        (rf'{_RATE_TOKEN}\s*%.*implement.*{_AI_TOKEN}', 'percent_implement_ai'),
        (rf'{_AI_TOKEN}.*implementation.*{_RATE_TOKEN}\s*percent', 'ai_implementation_percent'),

        # Usage patterns
        (rf'{_RATE_TOKEN}\s*percent.*use.*{_AI_TOKEN}', 'percent_use_ai'),
        (rf'{_RATE_TOKEN}\s*%.*use.*{_AI_TOKEN}', 'percent_use_ai'),
        (rf'{_AI_TOKEN}.*usage.*{_RATE_TOKEN}\s*percent', 'ai_usage_percent'),

        # Deployment patterns
        (rf'{_RATE_TOKEN}\s*percent.*deploy.*{_AI_TOKEN}', 'percent_deploy_ai'),
        (rf'{_RATE_TOKEN}\s*%.*deploy.*{_AI_TOKEN}', 'percent_deploy_ai'),
        (rf'{_AI_TOKEN}.*deployment.*{_RATE_TOKEN}\s*percent', 'ai_deployment_percent'),

        # Machine learning specific patterns
        (rf'{_RATE_TOKEN}\s*percent.*machine.*learning', 'percent_ml'),
        (rf'{_RATE_TOKEN}\s*%.*machine.*learning', 'percent_ml'),
        (rf'machine.*learning.*{_RATE_TOKEN}\s*percent', 'ml_percent'),

        # Industry-specific patterns
        (rf'{_RATE_TOKEN}\s*percent.*tech.*companies.*{_AI_TOKEN}', 'percent_tech_ai'),
        (rf'{_RATE_TOKEN}\s*percent.*finance.*{_AI_TOKEN}', 'percent_finance_ai'),
        (rf'{_RATE_TOKEN}\s*percent.*healthcare.*{_AI_TOKEN}', 'percent_healthcare_ai'),
        (rf'{_RATE_TOKEN}\s*percent.*manufacturing.*{_AI_TOKEN}', 'percent_manufacturing_ai'),

        # Survey and study patterns
        (rf'survey.*{_RATE_TOKEN}\s*percent.*{_AI_TOKEN}', 'survey_percent_ai'),
        (rf'study.*{_RATE_TOKEN}\s*percent.*{_AI_TOKEN}', 'study_percent_ai'),
        (rf'research.*{_RATE_TOKEN}\s*percent.*{_AI_TOKEN}', 'research_percent_ai'),

        # Growth and increase patterns
        (rf'{_AI_TOKEN}.*adoption.*increased.*{_RATE_TOKEN}\s*percent', 'ai_adoption_increase'),
        (rf'{_AI_TOKEN}.*adoption.*grew.*{_RATE_TOKEN}\s*percent', 'ai_adoption_growth'),
        (rf'{_RATE_TOKEN}\s*percent.*increase.*{_AI_TOKEN}', 'percent_increase_ai'),
    ]
]


def extract_ai_adoption_patterns(text_content: str) -> List[Tuple[float, str, str]]:
    """
    Extract AI adoption rates using comprehensive pattern matching.

    Patterns are applied line by line ('.' does not cross newlines) and in
    list order. Each figure in the text is reported once, labelled with the
    first pattern that matched it, so one sentence satisfying several
    patterns does not inflate the results.

    Args:
        text_content: The text content of the webpage

    Returns:
        List of tuples: (adoption_rate, context, pattern_used), where
        adoption_rate is the percentage divided by 100 and context is up to
        100 characters of text on either side of the figure.
    """
    adoption_data = []
    seen_spans = set()

    for regex, pattern_type in _AI_ADOPTION_PATTERNS:
        for found in regex.finditer(text_content):
            span = found.span(1)
            if span in seen_spans:
                continue
            seen_spans.add(span)

            # Anchor the context on the position of this match. Searching the
            # text for the number string again would find its first occurrence
            # anywhere on the page, not this one.
            number_start = span[0]
            start = max(0, number_start - 100)
            end = min(len(text_content), number_start + 100)
            context = text_content[start:end].strip()

            adoption_rate = float(found.group(1)) / 100  # Convert percentage to decimal
            adoption_data.append((adoption_rate, context, pattern_type))

    return adoption_data

def extract_industry_from_url_or_content(url: str, soup: BeautifulSoup) -> str:
    """
    Guess the industry a page is about from keywords.

    Matching is a plain case-insensitive substring test. The URL is checked
    first, and the page text is only consulted when the URL gives no hit.
    Keywords are tried in the order listed below and the first hit wins.

    Args:
        url: The webpage URL
        soup: BeautifulSoup object of the page

    Returns:
        Industry name or 'cross_industry' if not specific
    """
    keyword_to_industry = {
        'tech': 'technology',
        'finance': 'finance',
        'banking': 'finance',
        'healthcare': 'healthcare',
        'medical': 'healthcare',
        'manufacturing': 'manufacturing',
        'retail': 'retail',
        'ecommerce': 'retail',
        'education': 'education',
        'energy': 'energy',
        'automotive': 'automotive',
        'transportation': 'transportation'
    }

    def first_hit(haystack):
        # First industry whose keyword occurs in haystack, else None.
        return next(
            (label for needle, label in keyword_to_industry.items() if needle in haystack),
            None,
        )

    from_url = first_hit(url.lower())
    if from_url is not None:
        return from_url

    from_page = first_hit(soup.get_text().lower())
    if from_page is not None:
        return from_page

    return 'cross_industry'

def scrape_ai_adoption_from_url(url: str) -> pd.DataFrame:
    """
    Download one page and turn every detected adoption figure into a row.

    Progress and errors are reported with print(). Any failure results in an
    empty DataFrame rather than an exception.

    Args:
        url: The URL to scrape

    Returns:
        DataFrame with columns url, year, industry, adoption_rate,
        pattern_type, context, source_domain (empty when nothing was found
        or the request failed)
    """
    print(f"Scraping: {url}")

    # Headers that make the request look like it comes from a desktop browser.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    try:
        response = requests.get(url, headers=browser_headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-article elements before reading the text. The year and
        # industry lookups below therefore see the stripped page as well.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        text_content = soup.get_text()

        year = extract_year_from_url_or_content(url, soup)
        industry = extract_industry_from_url_or_content(url, soup)

        rows = []
        for rate, snippet, label in extract_ai_adoption_patterns(text_content):
            # Cap the stored context at 200 characters plus an ellipsis.
            if len(snippet) > 200:
                shown = snippet[:200] + '...'
            else:
                shown = snippet
            rows.append({
                'url': url,
                'year': year,
                'industry': industry,
                'adoption_rate': rate,
                'pattern_type': label,
                'context': shown,
                'source_domain': urllib.parse.urlparse(url).netloc
            })

        if rows:
            print(f"  Found {len(rows)} adoption rate(s)")
        else:
            print("  No adoption rates found")

        return pd.DataFrame(rows)

    except requests.exceptions.RequestException as e:
        print(f"  Error: Network issue - {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"  Error: {e}")
        return pd.DataFrame()

def scrape_multiple_urls(url_list: List[str]) -> pd.DataFrame:
    """
    Scrape each URL in turn and concatenate whatever rows were found.

    Pauses two seconds after every URL to avoid hammering the servers.

    Args:
        url_list: List of URLs to scrape

    Returns:
        Combined DataFrame with all AI adoption data (empty if no URL
        yielded any rows)
    """
    total = len(url_list)
    print(f"Starting to scrape {total} URLs...")

    frames = []
    for position, url in enumerate(url_list, start=1):
        # Progress prefix; the per-URL scraper prints the rest of the line.
        print(f"\n[{position}/{total}]", end=" ")

        frame = scrape_ai_adoption_from_url(url)
        if not frame.empty:
            frames.append(frame)

        # Be respectful to servers
        time.sleep(2)

    if not frames:
        print("\n\nNo adoption data found from any URL")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    print(f"\n\nTotal adoption rates found: {len(merged)}")
    return merged

def analyze_adoption_data(df: pd.DataFrame) -> None:
    """
    Analyze and display insights from the collected adoption data.

    Prints overall statistics, then count/mean/max of the adoption rate
    grouped by source domain, industry and year, and finally how often each
    pattern type produced a figure. Nothing is returned.

    Args:
        df: DataFrame containing AI adoption data. Must provide the columns
            adoption_rate, source_domain, industry, year and pattern_type
            unless it is empty.
    """
    if df.empty:
        print("No data to analyze")
        return

    print("\n" + "="*60)
    print("AI ADOPTION DATA ANALYSIS")
    print("="*60)

    # Basic statistics (adoption_rate is a fraction, hence the '%' format)
    print(f"\n📊 Basic Statistics:")
    print(f"   Total adoption rates found: {len(df)}")
    print(f"   Average adoption rate: {df['adoption_rate'].mean():.1%}")
    print(f"   Highest adoption rate: {df['adoption_rate'].max():.1%}")
    print(f"   Lowest adoption rate: {df['adoption_rate'].min():.1%}")

    # The three breakdowns share one aggregation, so they run in a loop. The
    # industry heading previously held mis-encoded characters in place of
    # its icon.
    breakdowns = [
        ("🌐 By Source Domain", 'source_domain'),
        ("🏭 By Industry", 'industry'),
        ("📅 By Year", 'year'),
    ]
    for heading, column in breakdowns:
        print(f"\n{heading}:")
        stats = df.groupby(column).agg({
            'adoption_rate': ['count', 'mean', 'max']
        }).round(3)
        print(stats)

    # Pattern types, most frequent first
    print(f"\n🔍 Pattern Types Found:")
    pattern_stats = df.groupby('pattern_type').count()['adoption_rate'].sort_values(ascending=False)
    print(pattern_stats)

# Example usage
# Runs only when this cell/script is executed directly. It fetches every URL
# below over the network, and the scraper pauses two seconds after each one.
if __name__ == "__main__":
    print("="*70)
    print("UNIVERSAL AI ADOPTION DATA SCRAPER")
    print("="*70)

    # Example URLs to scrape (you can replace with your own)
    urls_to_scrape = [
        "https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai-in-2023-generative-ais-breakout-year",
        "https://www.gartner.com/en/newsroom/press-releases/2023-08-15-gartner-identifies-four-emerging-technologies-that-will-transform-business-outcomes",
        "https://www.weforum.org/agenda/2023/01/ai-adoption-business-2023/",
        "https://octoverse.github.com/2023/state-of-open-source",
        "https://www.ibm.com/watson/ai-adoption",
        # NOTE(review): "ishtudy" in the next URL looks like a typo for
        # "study" - verify the address against the live PwC site.
        "https://www.pwc.com/gx/en/issues/data-and-analytics/artificial-intelligence-ishtudy.html"
    ]

    # Scrape all URLs
    adoption_df = scrape_multiple_urls(urls_to_scrape)

    # An empty frame means no URL yielded any adoption figure.
    if not adoption_df.empty:
        # Show sample data
        print(f"\n📋 Sample Data:")
        print(adoption_df[['source_domain', 'year', 'industry', 'adoption_rate', 'pattern_type']].head(10))

        # Analyze the data
        analyze_adoption_data(adoption_df)

        # Save to CSV; the timestamp in the name gives each run its own file.
        filename = f"ai_adoption_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        adoption_df.to_csv(filename, index=False)
        print(f"\n💾 Data saved to: {filename}")

    else:
        print("\n❌ No data collected. Try different URLs or check your internet connection.")

# Printed unconditionally, including when the code is imported as a module.
print("\nUniversal AI Adoption Scraper ready to use!")

##### 3.3 Scraping Economic Data

In [None]:
# =============================================================================
# CELL: Universal Economic Data Scraper
# =============================================================================

"""
Universal Economic Data Scraper

This script can scrape economic indicators and metrics from any URL by using
intelligent pattern matching to detect GDP, inflation, unemployment, income
inequality, productivity, and other economic statistics without needing
separate parsers for each website.
"""

import pandas as pd
import requests
import json
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import time
from bs4 import BeautifulSoup
import re
import urllib.parse

def extract_year_from_url_or_content(url: str, soup: BeautifulSoup) -> int:
    """
    Extract a four-digit year (2010-2029) from the URL or the page text.

    Lookup order:
      1. The first standalone year in the URL.
      2. The first year in the page text that follows a publication cue
         ("published", "updated" or "©") on the same line.
      3. The first standalone year anywhere in the page text.
      4. The current calendar year.

    Args:
        url: Address of the page being scraped.
        soup: Parsed page; only its ``get_text()`` method is used.

    Returns:
        The detected year, or the current year when none is found.
    """
    # Digit look-arounds stop the pattern from matching inside a longer
    # number (e.g. "2023" inside an article id such as "1202345").
    year = r'(?<!\d)(20[12]\d)(?!\d)'

    # Try URL first
    url_match = re.search(year, url)
    if url_match:
        return int(url_match.group(1))

    # Try page content
    text_content = soup.get_text()

    # Most specific cue first, bare year last. Every pattern captures only
    # the year itself so the result can always be converted with int(); the
    # lazy, newline-free gap ties each cue to the nearest year on its line.
    year_patterns = [
        r'published[^\n]*?' + year,
        r'updated[^\n]*?' + year,
        r'©[^\n]*?' + year,
        year,
    ]

    for pattern in year_patterns:
        content_match = re.search(pattern, text_content, re.IGNORECASE)
        if content_match:
            return int(content_match.group(1))

    return datetime.now().year

def extract_economic_indicators(text_content: str) -> List[Tuple[str, float, str, str]]:
    """
    Extract economic indicators from free text using regular expressions.

    Every pattern is applied case-insensitively. Because ``.`` does not match
    a newline, a single match never spans more than one line of the text.

    Args:
        text_content: Plain text of a page.

    Returns:
        List of tuples: (indicator_name, value, unit, context), where
        ``context`` is up to 100 characters on either side of the position
        at which the matched number starts.
    """
    economic_data = []

    # Comprehensive patterns for economic indicators.
    #
    # Conventions:
    # - The LAST capture group of a pattern is the indicator's value.
    # - Gaps are lazy (".*?"). A greedy ".*" in front of the number swallows
    #   every digit except the final one, so "3.5 percent" was read as 5.0.
    # - "\bemployment" keeps the employment patterns from also firing on
    #   "unemployment rate".
    #
    # NOTE: scale words (million/billion/trillion) are matched but not
    # applied to the value; "25.4 trillion dollars" is recorded as 25.4.
    patterns = [
        # GDP Patterns
        (r'GDP.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|trillion|million)?\s*(?:dollars?|USD)?', 'gdp', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|trillion|million)?\s*(?:dollars?|USD).*?GDP', 'gdp', 'dollars'),
        (r'GDP.*?growth.*?(\d+(?:\.\d+)?)\s*percent', 'gdp_growth', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?GDP.*?growth', 'gdp_growth', 'percent'),

        # Inflation Patterns
        (r'inflation.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'inflation_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?inflation', 'inflation_rate', 'percent'),
        (r'CPI.*?(\d+(?:\.\d+)?)\s*percent', 'cpi', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?CPI', 'cpi', 'percent'),

        # Unemployment Patterns
        (r'unemployment.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'unemployment_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?unemployment', 'unemployment_rate', 'percent'),
        (r'jobless.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'unemployment_rate', 'percent'),

        # Income Inequality Patterns
        (r'Gini.*?coefficient.*?(\d+(?:\.\d+)?)', 'gini_coefficient', 'ratio'),
        (r'(\d+(?:\.\d+)?).*?Gini.*?coefficient', 'gini_coefficient', 'ratio'),
        (r'income.*?inequality.*?(\d+(?:\.\d+)?)\s*percent', 'income_inequality', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?income.*?inequality', 'income_inequality', 'percent'),

        # Productivity Patterns
        (r'productivity.*?growth.*?(\d+(?:\.\d+)?)\s*percent', 'productivity_growth', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?productivity.*?growth', 'productivity_growth', 'percent'),
        (r'labor.*?productivity.*?(\d+(?:\.\d+)?)\s*percent', 'labor_productivity', 'percent'),

        # Wage Patterns
        (r'average.*?wage.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:dollars?|USD)', 'average_wage', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:dollars?|USD).*?average.*?wage', 'average_wage', 'dollars'),
        (r'wage.*?growth.*?(\d+(?:\.\d+)?)\s*percent', 'wage_growth', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?wage.*?growth', 'wage_growth', 'percent'),

        # Interest Rate Patterns
        (r'interest.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'interest_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?interest.*?rate', 'interest_rate', 'percent'),
        (r'federal.*?funds.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'federal_funds_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?federal.*?funds.*?rate', 'federal_funds_rate', 'percent'),

        # Employment Patterns
        (r'\bemployment.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'employment_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?\bemployment.*?rate', 'employment_rate', 'percent'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*jobs.*?created', 'jobs_created', 'count'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*jobs.*?added', 'jobs_added', 'count'),

        # Trade Patterns
        (r'trade.*?deficit.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD)', 'trade_deficit', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD).*?trade.*?deficit', 'trade_deficit', 'dollars'),
        (r'exports.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD)', 'exports', 'dollars'),
        (r'imports.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD)', 'imports', 'dollars'),

        # Housing Patterns
        (r'median.*?home.*?price.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:dollars?|USD)', 'median_home_price', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:dollars?|USD).*?median.*?home.*?price', 'median_home_price', 'dollars'),
        (r'housing.*?starts.*?(\d+(?:,\d{3})*(?:\.\d+)?)', 'housing_starts', 'count'),

        # Consumer Spending Patterns
        (r'consumer.*?spending.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD)', 'consumer_spending', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD).*?consumer.*?spending', 'consumer_spending', 'dollars'),

        # Business Investment Patterns
        (r'business.*?investment.*?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD)', 'business_investment', 'dollars'),
        (r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million)?\s*(?:dollars?|USD).*?business.*?investment', 'business_investment', 'dollars'),

        # Poverty Patterns
        (r'poverty.*?rate.*?(\d+(?:\.\d+)?)\s*percent', 'poverty_rate', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?poverty.*?rate', 'poverty_rate', 'percent'),

        # Wealth Inequality Patterns
        (r'wealth.*?inequality.*?(\d+(?:\.\d+)?)\s*percent', 'wealth_inequality', 'percent'),
        (r'(\d+(?:\.\d+)?)\s*percent.*?wealth.*?inequality', 'wealth_inequality', 'percent'),
        # Two groups: (population percentile, wealth share). The share is the value.
        (r'top.*?(\d+)\s*percent.*?(\d+(?:\.\d+)?)\s*percent.*?wealth', 'top_wealth_share', 'percent'),

        # Stock Market Patterns
        (r'S&P.*?500.*?(\d+(?:,\d{3})*(?:\.\d+)?)', 'sp500', 'points'),
        (r'Dow.*?Jones.*?(\d+(?:,\d{3})*(?:\.\d+)?)', 'dow_jones', 'points'),
        (r'NASDAQ.*?(\d+(?:,\d{3})*(?:\.\d+)?)', 'nasdaq', 'points'),
    ]

    text_length = len(text_content)

    for pattern, indicator_name, unit in patterns:
        # finditer yields match objects, so multi-group patterns are handled
        # uniformly and the exact position of the number is known.
        for match in re.finditer(pattern, text_content, re.IGNORECASE):
            value_group = match.lastindex
            try:
                # Clean the value (remove thousands separators, convert to float)
                value = float(match.group(value_group).replace(',', ''))
            except ValueError:
                continue

            # Context is taken around this match's own number rather than the
            # first place the same digits happen to occur in the document.
            value_start = match.start(value_group)
            start = max(0, value_start - 100)
            end = min(text_length, value_start + 100)
            context = text_content[start:end].strip()

            economic_data.append((indicator_name, value, unit, context))

    return economic_data

def extract_country_from_url_or_content(url: str, soup: BeautifulSoup) -> str:
    """
    Extract country information from URL or content.

    Keywords are matched as whole tokens, never as raw substrings, so that
    "census" or "business" is not read as "us" and "Indiana" is not read as
    "india". The URL is checked first, then the page text. Within each pass
    the keywords are tried in table order and the first one present wins.

    In page text, the short abbreviations (US, USA, UK) must be written in
    upper case, optionally dotted ("U.S."), so the English pronoun "us" is
    ignored; full country names are matched case-insensitively.

    Args:
        url: Address of the page being scraped.
        soup: Parsed page; only its ``get_text()`` method is used.

    Returns:
        The country name, or 'Unknown' when no keyword is found.
    """
    country_keywords = {
        'usa': 'United States',
        'us': 'United States',
        'america': 'United States',
        'united-states': 'United States',
        'uk': 'United Kingdom',
        'britain': 'United Kingdom',
        'england': 'United Kingdom',
        'canada': 'Canada',
        'australia': 'Australia',
        'germany': 'Germany',
        'france': 'France',
        'japan': 'Japan',
        'china': 'China',
        'india': 'India',
        'brazil': 'Brazil',
        'mexico': 'Mexico'
    }

    # A keyword only counts when it is not glued to other letters or digits.
    not_preceded = r'(?<![A-Za-z0-9])'
    not_followed = r'(?![A-Za-z0-9])'

    def name_pattern(keyword: str) -> str:
        # A hyphen in the table stands for any separator: "united-states"
        # also matches "united_states" and "United States".
        body = r'[-_\s]+'.join(re.escape(part) for part in keyword.split('-'))
        return not_preceded + body + not_followed

    def abbreviation_pattern(keyword: str) -> str:
        # "us" -> U\.?S\.?  (matches "US" and "U.S.", case-sensitively)
        body = ''.join(re.escape(letter) + r'\.?' for letter in keyword.upper())
        return not_preceded + body + not_followed

    # Check URL for country keywords (URLs are conventionally lower case).
    url_lower = url.lower()
    for keyword, country in country_keywords.items():
        if re.search(name_pattern(keyword), url_lower):
            return country

    # Check page content
    text_content = soup.get_text()
    for keyword, country in country_keywords.items():
        if len(keyword) <= 3:
            found = re.search(abbreviation_pattern(keyword), text_content)
        else:
            found = re.search(name_pattern(keyword), text_content, re.IGNORECASE)
        if found:
            return country

    return 'Unknown'

def scrape_economic_data_from_url(url: str) -> pd.DataFrame:
    """
    Download one page and turn the economic indicators found in it into rows.

    Args:
        url: The URL to scrape

    Returns:
        DataFrame with one row per indicator (url, year, country,
        indicator_name, value, unit, context, source_domain). An empty
        DataFrame is returned when nothing is found or when any error occurs;
        errors are printed, not raised.
    """
    print(f"Scraping: {url}")

    # Browser-like headers sent with the request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

    try:
        page = requests.get(url, headers=browser_headers, timeout=15)
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')

        # Drop non-content elements before reading the text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        text_content = soup.get_text()

        # Page-level metadata shared by every row.
        year = extract_year_from_url_or_content(url, soup)
        country = extract_country_from_url_or_content(url, soup)

        rows = [
            {
                'url': url,
                'year': year,
                'country': country,
                'indicator_name': indicator_name,
                'value': value,
                'unit': unit,
                'context': (context[:200] + '...') if len(context) > 200 else context,
                'source_domain': urllib.parse.urlparse(url).netloc,
            }
            for indicator_name, value, unit, context
            in extract_economic_indicators(text_content)
        ]

        if not rows:
            print("  No economic indicators found")
        else:
            print(f"  Found {len(rows)} economic indicator(s)")

        return pd.DataFrame(rows)

    except requests.exceptions.RequestException as network_error:
        print(f"  Error: Network issue - {network_error}")
        return pd.DataFrame()
    except Exception as error:
        print(f"  Error: {error}")
        return pd.DataFrame()

def scrape_multiple_economic_urls(url_list: List[str], delay_seconds: float = 2.0) -> pd.DataFrame:
    """
    Scrape economic data from a list of URLs.

    Args:
        url_list: List of URLs to scrape
        delay_seconds: Pause between consecutive requests, in seconds.
            Defaults to 2.0; pass 0 to disable the pause.

    Returns:
        Combined DataFrame with all economic data, or an empty DataFrame
        when no URL produced any rows.
    """
    total = len(url_list)
    print(f"Starting to scrape {total} URLs for economic data...")

    all_data = []

    for i, url in enumerate(url_list, 1):
        print(f"\n[{i}/{total}]", end=" ")

        df = scrape_economic_data_from_url(url)
        if not df.empty:
            all_data.append(df)

        # Pause only between requests; waiting after the final URL would
        # just delay the result.
        if i < total and delay_seconds > 0:
            time.sleep(delay_seconds)

    if not all_data:
        print("\n\nNo economic data found from any URL")
        return pd.DataFrame()

    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\n\nTotal economic indicators found: {len(combined_df)}")
    return combined_df

def analyze_economic_data(df: pd.DataFrame) -> None:
    """
    Print a summary report for the collected economic data.

    The report contains headline counts, one aggregate table of ``value``
    per grouping column (indicator, country, year, source domain) and the
    ten most frequent indicator names.

    Args:
        df: DataFrame containing economic data
    """
    if df.empty:
        print("No data to analyze")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("ECONOMIC DATA ANALYSIS")
    print(rule)

    # Headline figures
    print("\n📊 Basic Statistics:")
    print(f"   Total indicators found: {len(df)}")
    print(f"   Unique indicators: {df['indicator_name'].nunique()}")
    print(f"   Countries covered: {df['country'].nunique()}")
    print(f"   Years covered: {df['year'].min()} - {df['year'].max()}")

    # (heading, grouping column, statistics computed on 'value')
    breakdowns = (
        ("\n📈 By Indicator Type:", 'indicator_name', ['count', 'mean', 'min', 'max']),
        ("\n🌍 By Country:", 'country', ['count', 'mean']),
        ("\n📅 By Year:", 'year', ['count', 'mean']),
        ("\n🌐 By Source Domain:", 'source_domain', ['count', 'mean']),
    )
    for heading, group_column, statistics in breakdowns:
        print(heading)
        table = df.groupby(group_column).agg({'value': statistics}).round(3)
        print(table)

    # Frequency ranking of indicator names
    print("\n🔍 Most Common Indicators:")
    print(df['indicator_name'].value_counts().head(10))

# Example usage: scrape a default set of public statistics pages, show a
# sample, print the analysis and write the result to a timestamped CSV file.
if __name__ == "__main__":
    print("=" * 70, "UNIVERSAL ECONOMIC DATA SCRAPER", "=" * 70, sep="\n")

    # Default sources; replace with your own URLs as needed.
    urls_to_scrape = [
        "https://www.bls.gov/news.release/empsit.nr0.htm",  # BLS Employment
        "https://www.bls.gov/news.release/cpi.nr0.htm",     # BLS Inflation
        "https://www.bea.gov/news/2023/gross-domestic-product-fourth-quarter-and-year-2022-advance-estimate",  # BEA GDP
        "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm",  # Fed rates
        "https://www.census.gov/newsroom/press-releases/2023/income-poverty-health-insurance-coverage.html",  # Census income
        "https://www.worldbank.org/en/topic/poverty/overview",  # World Bank poverty
        "https://www.imf.org/en/Publications/WEO/Issues/2023/10/10/world-economic-outlook-october-2023",  # IMF outlook
        "https://data.oecd.org/gdp/gross-domestic-product-gdp.htm"  # OECD GDP
    ]

    # Collect everything into one DataFrame.
    economic_df = scrape_multiple_economic_urls(urls_to_scrape)

    if economic_df.empty:
        print("\n❌ No data collected. Try different URLs or check your internet connection.")
    else:
        # Preview of the first rows
        print("\n📋 Sample Data:")
        print(economic_df[['source_domain', 'year', 'country', 'indicator_name', 'value', 'unit']].head(10))

        # Summary report
        analyze_economic_data(economic_df)

        # Persist to a timestamped CSV in the working directory
        filename = f"economic_data_{datetime.now():%Y%m%d_%H%M%S}.csv"
        economic_df.to_csv(filename, index=False)
        print(f"\n💾 Data saved to: {filename}")

print("\nUniversal Economic Data Scraper ready to use!")

#### 6

#### 7