In [1]:
pip install selenium beautifulsoup4 pandas python-dateutil webdriver-manager

Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.2.4-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting sortedcontainers (from trio~=0.17->selenium

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data
data_dir = 'tech_progress_data'
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'pose_estimation'), exist_ok=True)

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:Ω
            year = int(year_match.group())
            return datetime.date(year, 1, 1)  # Default to January 1st of the year
    return None

def extract_parameters(text):
    """Extract model parameter count in millions from text"""
    if not text:
        return None
    
    text = text.lower()
    param_match = re.search(r'(\d+(?:\.\d+)?)\s*([kmbtgp]?)\s*(?:parameters|params|param)', text)
    
    if param_match:
        param_value = float(param_match.group(1))
        param_unit = param_match.group(2).lower() if param_match.group(2) else ''
        
        # Convert to millions
        if param_unit == 'k':
            return param_value / 1000  # K to M
        elif param_unit == 'm':
            return param_value  # Already in M
        elif param_unit == 'b' or param_unit == 'g':
            return param_value * 1000  # B to M
        elif param_unit == 't':
            return param_value * 1000000  # T to M
        elif param_unit == 'p':
            return param_value * 1000000000  # P to M
        else:
            return param_value  # Assume already in M if no unit
            
    return None

def get_top_object_detection_datasets(max_datasets=5):
    """Get the most important pose estimaiton datasets/benchmarks"""
    print("Identifying top pose estimation datasets...")
    
    # Prioritize these well-known datasets first
    priority_datasets = ['coco', 'pascal-voc', 'objects365', 'open-images']
    
    driver = setup_driver()
    dataset_links = []
    
    try:
        # Navigate to pose estimation task page
        driver.get("https://paperswithcode.com/task/object-detection")
        time.sleep(3)
        
        # Find all dataset links
        links = driver.find_elements(By.TAG_NAME, "a")
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href:
                # Extract dataset name from URL
                dataset_name = href.split("/")[-1]
                if dataset_name:
                    dataset_links.append({
                        "name": dataset_name,
                        "url": href
                    })
        
        # Sort by priority
        def get_priority(dataset):
            for i, priority in enumerate(priority_datasets):
                if priority in dataset["name"].lower():
                    return i
            return len(priority_datasets)
        
        dataset_links.sort(key=get_priority)
        
        # Limit to max_datasets
        return dataset_links[:max_datasets]
    
    except Exception as e:
        print(f"Error getting datasets: {e}")
        return []
    finally:
        driver.quit()

def scrape_sota_timeline(dataset_url):
    """Scrape the SOTA over time for a specific dataset"""
    print(f"Scraping SOTA timeline for {dataset_url}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # First load the dataset page
        driver.get(dataset_url)
        time.sleep(3)
        
        # Try to find SOTA over time link
        timeline_url = None
        links = driver.find_elements(By.TAG_NAME, "a")
        for link in links:
            href = link.get_attribute("href")
            if href and "sota-over-time" in href:
                timeline_url = href
                break
        
        # If no timeline link found, use the main leaderboard
        if not timeline_url:
            timeline_url = dataset_url
            print(f"No timeline link found for {dataset_url}, using main leaderboard")
        
        # Navigate to the timeline page
        driver.get(timeline_url)
        time.sleep(3)
        
        # Extract dataset/benchmark name
        dataset_name = dataset_url.split("/")[-1].replace("-", " ").title()
        
        # Try to extract data from JSON first (best source of truth)
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Get model parameters and compute if available
                                model_desc = eval_data.get('model', {}).get('description', '')
                                parameters = extract_parameters(model_desc)
                                
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': dataset_name,
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'parameters_millions': parameters,
                                        'description': model_desc,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': dataset_name,
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        return models_data
    except Exception as e:
        print(f"Error scraping SOTA timeline: {e}")
        return []
    finally:
        driver.quit()

def consolidate_to_monthly_sota(models_df):
    """
    From a full model dataframe, create a consolidated monthly SOTA dataset
    with one best model per month per dataset
    """
    if models_df.empty:
        return pd.DataFrame()
    
    # Create year-month column if not exists
    if 'year_month' not in models_df.columns:
        models_df['year_month'] = pd.to_datetime(models_df['date']).dt.to_period('M')
    
    # Identify primary metric columns (typically mAP, accuracy, etc.)
    # This is a heuristic approach - you may need to adjust for specific datasets
    metric_cols = []
    for col in models_df.columns:
        if col.lower() in ['map', 'ap', 'accuracy', 'f1', 'precision', 'recall']:
            metric_cols.append(col)
    
    if not metric_cols:
        # If no clear metrics, try to find numeric columns
        for col in models_df.columns:
            if models_df[col].dtype in [float, int] and col not in ['year', 'month', 'parameters_millions']:
                metric_cols.append(col)
    
    # If still no metrics, just return the original data
    if not metric_cols:
        return models_df
    
    # Use first metric as primary sort key
    primary_metric = metric_cols[0]
    
    # Group by dataset and month, then take the best model
    # (assuming higher value is better for the metric)
    datasets = models_df['dataset'].unique()
    monthly_sota = []
    
    for dataset in datasets:
        dataset_df = models_df[models_df['dataset'] == dataset]
        
        # Get monthly best models
        try:
            # Sort by year_month and metric (descending for metric)
            monthly_best = dataset_df.sort_values(['year_month', primary_metric], 
                                              ascending=[True, False])\
                                   .groupby('year_month')\
                                   .first()\
                                   .reset_index()
            
            monthly_sota.append(monthly_best)
        except Exception as e:
            print(f"Error processing {dataset}: {e}")
    
    # Combine all datasets
    if monthly_sota:
        combined = pd.concat(monthly_sota, ignore_index=True)
        # Convert period to date string
        combined['year_month'] = combined['year_month'].astype(str)
        return combined
    else:
        return pd.DataFrame()

def main():
    print("Starting Pose Estimation time series data collection...")
    
    # Get top pose estimation datasets
    datasets = get_top_object_detection_datasets(max_datasets=5)
    
    if not datasets:
        print("No datasets found. Exiting.")
        return
    
    print(f"Found {len(datasets)} datasets to process: {[d['name'] for d in datasets]}")
    
    # DataFrame to store all models
    all_models = []
    
    # Process each dataset
    for dataset in datasets:
        try:
            dataset_models = scrape_sota_timeline(dataset["url"])
            
            if dataset_models:
                print(f"Scraped {len(dataset_models)} models for {dataset['name']}")
                all_models.extend(dataset_models)
            else:
                print(f"No models found for {dataset['name']}")
            
            # Be nice to the server
            time.sleep(3)
            
        except Exception as e:
            print(f"Error processing {dataset['name']}: {e}")
    
    # Convert to DataFrame
    if all_models:
        df = pd.DataFrame(all_models)
        
        # Save full dataset
        full_output_path = os.path.join(data_dir, 'pose_estimation', 'pose_estimation_all_models.csv')
        df.to_csv(full_output_path, index=False)
        print(f"Saved {len(df)} models to {full_output_path}")
        
        # Create year-month column for aggregation
        df['year_month'] = pd.to_datetime(df['date']).dt.to_period('M')
        
        # Save yearly summary
        yearly_summary = df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
        yearly_summary_path = os.path.join(data_dir, 'pose_estimation', 'pose_estimation_yearly_count.csv')
        yearly_summary.to_csv(yearly_summary_path, index=False)
        print(f"Saved yearly summary to {yearly_summary_path}")
        
        # Get monthly SOTA models
        monthly_sota = consolidate_to_monthly_sota(df)
        if not monthly_sota.empty:
            monthly_path = os.path.join(data_dir, 'pose_estimation', 'pose_estimation_monthly_sota.csv')
            monthly_sota.to_csv(monthly_path, index=False)
            print(f"Saved {len(monthly_sota)} monthly SOTA models to {monthly_path}")
        
        # Create a time series summary (year, month, count of models, avg performance)
        try:
            # Get the most common metric column
            metric_cols = []
            for col in df.columns:
                if col.lower() in ['map', 'ap', 'accuracy', 'f1']:
                    metric_cols.append(col)
            
            if metric_cols:
                primary_metric = metric_cols[0]
                time_series = df.groupby(['year', 'month']).agg({
                    'model_name': 'count',
                    primary_metric: ['mean', 'max'],
                    'parameters_millions': ['mean', 'max', 'count']
                }).reset_index()
                
                # Flatten multi-level columns
                time_series.columns = ['_'.join(col).strip('_') for col in time_series.columns.values]
                
                time_series_path = os.path.join(data_dir, 'pose_estimation', 'pose_estimation_time_series.csv')
                time_series.to_csv(time_series_path, index=False)
                print(f"Saved time series summary to {time_series_path}")
        except Exception as e:
            print(f"Error creating time series summary: {e}")
        
        print("Data collection complete!")
    else:
        print("No models collected.")

if __name__ == "__main__":
    main()

Starting Pose Estimation time series data collection...
Identifying top pose estimation datasets...
Found 5 datasets to process: ['object-detection-on-coco', 'object-detection-on-coco', 'object-detection-on-coco', 'object-detection-on-coco', 'object-detection-on-coco-minival']
Scraping SOTA timeline for https://paperswithcode.com/sota/object-detection-on-coco...
No timeline link found for https://paperswithcode.com/sota/object-detection-on-coco, using main leaderboard
Scraped 262 models for object-detection-on-coco
Scraping SOTA timeline for https://paperswithcode.com/sota/object-detection-on-coco...
No timeline link found for https://paperswithcode.com/sota/object-detection-on-coco, using main leaderboard
Scraped 262 models for object-detection-on-coco
Scraping SOTA timeline for https://paperswithcode.com/sota/object-detection-on-coco...
No timeline link found for https://paperswithcode.com/sota/object-detection-on-coco, using main leaderboard
Scraped 262 models for object-detection-o

In [1]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory (karmabirchakraborty)
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'pose_estimation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def get_pose_benchmarks_with_20plus():
    """Get pose estimation benchmarks with at least 20 datapoints"""
    print("Finding pose estimation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/pose-estimation-on-coco",
        "https://paperswithcode.com/sota/pose-estimation-on-mpii",
        "https://paperswithcode.com/sota/3d-human-pose-estimation-on-human36m",
        "https://paperswithcode.com/sota/pose-estimation-on-ochuman",
        "https://paperswithcode.com/sota/3d-human-pose-estimation-on-mpi-inf-3dhp",
        "https://paperswithcode.com/sota/multi-person-pose-estimation-on-coco"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main pose estimation page
        driver.get("https://paperswithcode.com/task/pose-estimation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "pose" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential pose estimation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'pose_estimation', 'pose_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'pose_estimation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'pose_estimation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all pose estimation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'pose_estimation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'pose_estimation', 'all_pose_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'pose_estimation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting pose estimation data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_pose_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No pose estimation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} pose estimation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Pose estimation data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting pose estimation data collection (20+ datapoints)...
Finding pose estimation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 34 potential pose estimation benchmarks
Checking pose-estimation-on-coco...
pose-estimation-on-coco: 10 datapoints
Checking pose-estimation-on-mpii...
pose-estimation-on-mpii: 1 datapoints
Checking 3d-human-pose-estimation-on-human36m...
3d-human-pose-estimation-on-human36m: 357 datapoints
Added 3d-human-pose-estimation-on-human36m with 357 datapoints
Checking pose-estimation-on-ochuman...
pose-estimation-on-ochuman: 18 datapoints
Checking 3d-human-pose-estimation-on-mpi-inf-3dhp...
3d-human-pose-estimation-on-mpi-inf-3dhp: 116 datapoints
Added 3d-human-pose-estimation-on-mpi-inf-3dhp with 116 datapoints
Checking multi-person-pose-estimation-on-coco...
multi-person-pose-estimation-on-coco: 15 datapoints
Checking pose-estimation-

In [2]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'object_detection'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_object_detection_benchmarks_with_20plus():
    """Get object detection benchmarks with at least 20 datapoints"""
    print("Finding object detection benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/object-detection-on-coco",
        "https://paperswithcode.com/sota/object-detection-on-pascal-voc-2007",
        "https://paperswithcode.com/sota/object-detection-on-pascal-voc-2012",
        "https://paperswithcode.com/sota/object-detection-on-lvis",
        "https://paperswithcode.com/sota/object-detection-on-open-images",
        "https://paperswithcode.com/sota/real-time-object-detection-on-coco"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main object detection page
        driver.get("https://paperswithcode.com/task/object-detection")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "detection" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential object detection benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'object_detection', 'object_detection_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'object_detection', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'object_detection', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def extract_inference_speed(model_name, description):
    """Extract inference speed (FPS) from model name or description"""
    if not model_name and not description:
        return None
    
    # Common patterns for FPS
    fps_patterns = [
        r'(\d+\.?\d*)\s*fps',
        r'(\d+\.?\d*)\s*FPS',
        r'(\d+\.?\d*)\s*frames\s*per\s*second',
        r'runs\s*at\s*(\d+\.?\d*)\s*fps',
        r'speed\s*[\-:]\s*(\d+\.?\d*)\s*fps'
    ]
    
    text = f"{model_name} {description}"
    
    for pattern in fps_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except:
                pass
    
    return None

def combine_all_models():
    """Combine all object detection models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'object_detection')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        
                        # Try to extract FPS if available in model name or description
                        if 'description' in df.columns and 'model_name' in df.columns:
                            fps_values = []
                            for _, row in df.iterrows():
                                fps = extract_inference_speed(row.get('model_name', ''), row.get('description', ''))
                                fps_values.append(fps)
                            df['inference_fps'] = fps_values
                        
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'object_detection', 'all_object_detection_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'object_detection', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
            
            # Analyze trends in performance metrics
            if len(combined_df) > 0:
                print("Analyzing performance trends...")
                # For COCO dataset specifically (if present)
                coco_data = combined_df[combined_df['dataset'].str.contains('coco', case=False, na=False)]
                if len(coco_data) > 0:
                    # Check common metrics in COCO
                    metric_cols = [col for col in coco_data.columns if any(m in col.lower() for m in ['map', 'ap', 'accuracy', 'f1'])]
                    if metric_cols:
                        # Create yearly metrics summary
                        yearly_metrics = coco_data.groupby('year')[metric_cols].mean().reset_index()
                        yearly_metrics.to_csv(os.path.join(data_dir, 'object_detection', 'coco_yearly_metrics.csv'), index=False)
                        print(f"Created yearly metrics summary for COCO data")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting object detection data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_object_detection_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No object detection benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} object detection benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Object detection data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting object detection data collection (20+ datapoints)...
Finding object detection benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 126 potential object detection benchmarks
Checking object-detection-on-coco...
object-detection-on-coco: 262 datapoints
Added object-detection-on-coco with 262 datapoints
Checking object-detection-on-pascal-voc-2007...
object-detection-on-pascal-voc-2007: 30 datapoints
Added object-detection-on-pascal-voc-2007 with 30 datapoints
Checking object-detection-on-pascal-voc-2012...
object-detection-on-pascal-voc-2012: 7 datapoints
Checking object-detection-on-lvis...
object-detection-on-lvis: 0 datapoints
Checking object-detection-on-open-images...
object-detection-on-open-images: 0 datapoints
Checking real-time-object-detection-on-coco...
real-time-object-detection-on-coco: 82 datapoints
Added real-time-object-detection-on-coco wit

In [3]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'representation_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def extract_training_data_size(text):
    """Extract training dataset size from model description"""
    if not text:
        return None
    
    # Common patterns for dataset sizes
    patterns = [
        r'trained on\s*(\d+\.?\d*)\s*[Mm]illion\s*images',
        r'trained on\s*(\d+\.?\d*)\s*[Bb]illion\s*images',
        r'(\d+\.?\d*)\s*[Mm]\s*training samples',
        r'(\d+\.?\d*)\s*[Bb]\s*training samples',
        r'dataset of\s*(\d+\.?\d*)\s*[Mm]illion',
        r'dataset of\s*(\d+\.?\d*)\s*[Bb]illion',
        r'dataset with\s*(\d+\.?\d*)\s*[Mm]illion',
        r'dataset with\s*(\d+\.?\d*)\s*[Bb]illion',
        r'trained with\s*(\d+\.?\d*)\s*[Mm]',
        r'trained with\s*(\d+\.?\d*)\s*[Bb]'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            data_size = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b training' in text.lower():
                data_size *= 1000  # Convert billions to millions
            return data_size
    
    return None

def get_representation_learning_benchmarks_with_20plus():
    """Get representation learning benchmarks with at least 20 datapoints"""
    print("Finding representation learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/self-supervised-image-classification-on-1",
        "https://paperswithcode.com/sota/self-supervised-image-classification-on",
        "https://paperswithcode.com/sota/representation-learning-on-imagenet",
        "https://paperswithcode.com/sota/representation-learning-on-cifar-10",
        "https://paperswithcode.com/sota/representation-learning-on-cifar-100",
        "https://paperswithcode.com/sota/contrastive-learning-on-imagenet",
        "https://paperswithcode.com/sota/unsupervised-image-classification-on-imagenet"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main representation learning page
        driver.get("https://paperswithcode.com/task/representation-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and any(term in href.lower() for term in ["representation", "self-supervised", "contrastive", "unsupervised", "embedding"]):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential representation learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'representation_learning', 'representation_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def extract_compute_from_text(text):
    """Extract compute resources (FLOPs) from model description"""
    if not text:
        return None
    
    # Patterns for compute resources
    patterns = [
        r'(\d+\.?\d*)\s*[Pp]eta[Ff][Ll][Oo][Pp]s?',
        r'(\d+\.?\d*)\s*[Pp][Ff][Ll][Oo][Pp]s?',
        r'(\d+\.?\d*)\s*[Ee]xa[Ff][Ll][Oo][Pp]s?',
        r'(\d+\.?\d*)\s*[Ee][Ff][Ll][Oo][Pp]s?',
        r'compute:\s*(\d+\.?\d*)\s*[Pp]',
        r'compute:\s*(\d+\.?\d*)\s*[Ee]',
        r'trained with\s*(\d+\.?\d*)\s*[Pp]',
        r'trained with\s*(\d+\.?\d*)\s*[Ee]'
    ]
    
    units = {
        'p': 10**15,  # petaflops
        'e': 10**18   # exaflops
    }
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            value = float(match.group(1))
            unit = text[match.end()-1:match.end()].lower()
            if unit in units:
                # Convert to petaflops
                if unit == 'e':
                    value *= 1000  # Convert exaflops to petaflops
                return value
    
    return None

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'representation_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count, training data size, and compute
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                dataset_size = extract_training_data_size(model_desc)
                                compute = extract_compute_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'dataset_size_millions': dataset_size,
                                        'compute_petaflops': compute,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'representation_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all representation learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'representation_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'representation_learning', 'all_representation_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'representation_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
            
            # Analyze parameter scaling trends
            if 'parameters_millions' in combined_df.columns:
                param_df = combined_df[combined_df['parameters_millions'].notna()]
                if len(param_df) > 0:
                    yearly_params = param_df.groupby('year')['parameters_millions'].agg(['mean', 'median', 'max']).reset_index()
                    yearly_params.to_csv(os.path.join(data_dir, 'representation_learning', 'yearly_parameter_scaling.csv'), index=False)
                    print(f"Created yearly parameter scaling analysis")
            
            # Analyze compute trends
            if 'compute_petaflops' in combined_df.columns:
                compute_df = combined_df[combined_df['compute_petaflops'].notna()]
                if len(compute_df) > 0:
                    yearly_compute = compute_df.groupby('year')['compute_petaflops'].agg(['mean', 'median', 'max']).reset_index()
                    yearly_compute.to_csv(os.path.join(data_dir, 'representation_learning', 'yearly_compute_scaling.csv'), index=False)
                    print(f"Created yearly compute scaling analysis")
            
            # Analyze dataset size trends
            if 'dataset_size_millions' in combined_df.columns:
                dataset_df = combined_df[combined_df['dataset_size_millions'].notna()]
                if len(dataset_df) > 0:
                    yearly_dataset = dataset_df.groupby('year')['dataset_size_millions'].agg(['mean', 'median', 'max']).reset_index()
                    yearly_dataset.to_csv(os.path.join(data_dir, 'representation_learning', 'yearly_dataset_scaling.csv'), index=False)
                    print(f"Created yearly dataset size scaling analysis")
            
            # Create analysis of common metrics across datasets
            metric_cols = [col for col in combined_df.columns if col not in ['dataset', 'model_name', 'paper_title', 
                                                                            'paper_url', 'code_url', 'description', 
                                                                            'parameters_millions', 'dataset_size_millions', 
                                                                            'compute_petaflops', 'date', 'year', 'month', 'day']]
            
            # Find common metrics with numeric values
            numeric_metrics = []
            for col in metric_cols:
                if pd.api.types.is_numeric_dtype(combined_df[col]):
                    numeric_metrics.append(col)
            
            if numeric_metrics:
                yearly_metrics = combined_df.groupby(['dataset', 'year'])[numeric_metrics].agg(['mean', 'max']).reset_index()
                metric_file = os.path.join(data_dir, 'representation_learning', 'yearly_metric_analysis.csv')
                yearly_metrics.to_csv(metric_file, index=False)
                print(f"Created yearly metrics analysis")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting representation learning data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_representation_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No representation learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} representation learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Representation learning data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting representation learning data collection (20+ datapoints)...
Finding representation learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 12 potential representation learning benchmarks
Checking self-supervised-image-classification-on-1...
self-supervised-image-classification-on-1: 65 datapoints
Added self-supervised-image-classification-on-1 with 65 datapoints
Checking self-supervised-image-classification-on...
self-supervised-image-classification-on: 142 datapoints
Added self-supervised-image-classification-on with 142 datapoints
Checking representation-learning-on-imagenet...
representation-learning-on-imagenet: 0 datapoints
Checking representation-learning-on-cifar-10...
representation-learning-on-cifar-10: 0 datapoints
Checking representation-learning-on-cifar-100...
representation-learning-on-cifar-100: 0 datapoints
Checking contrastive-learn

In [1]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'classification'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def extract_flops_from_text(text):
    """Extract FLOPs (computational complexity) from model description"""
    if not text:
        return None
    
    # FLOPs patterns (Billions or Millions)
    patterns = [
        r'(\d+\.?\d*)\s*[Gg][Ff][Ll][Oo][Pp][Ss]',  # GFLOPs
        r'(\d+\.?\d*)\s*[Mm][Ff][Ll][Oo][Pp][Ss]',  # MFLOPs
        r'(\d+\.?\d*)\s*[Bb]illion\s*[Ff][Ll][Oo][Pp][Ss]',
        r'(\d+\.?\d*)\s*[Mm]illion\s*[Ff][Ll][Oo][Pp][Ss]',
        r'(\d+\.?\d*)\s*[Bb]\s*[Ff][Ll][Oo][Pp][Ss]',
        r'(\d+\.?\d*)\s*[Mm]\s*[Ff][Ll][Oo][Pp][Ss]'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            flops = float(match.group(1))
            # Convert everything to millions of FLOPs for consistency
            if 'gflops' in text.lower() or 'g flops' in text.lower() or 'billion' in text.lower() or 'b flops' in text.lower():
                flops *= 1000  # Convert GFLOPs/BFLOPs to MFLOPs
            return flops
    
    return None

def get_classification_benchmarks_with_20plus():
    """Get classification benchmarks with at least 20 datapoints"""
    print("Finding classification benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/image-classification-on-imagenet",
        "https://paperswithcode.com/sota/image-classification-on-cifar-10",
        "https://paperswithcode.com/sota/image-classification-on-cifar-100",
        "https://paperswithcode.com/sota/fine-grained-image-classification-on-oxford",
        "https://paperswithcode.com/sota/fine-grained-image-classification-on-stanford",
        "https://paperswithcode.com/sota/image-classification-on-tiny-imagenet-200",
        "https://paperswithcode.com/sota/image-classification-on-mnist"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main classification page
        driver.get("https://paperswithcode.com/task/classification-1")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "classification" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential classification benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'classification', 'classification_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'classification', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count and FLOPs
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                flops = extract_flops_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'flops_millions': flops,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    param_col = None
                    flops_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif any(term in header for term in ['param', 'parameters']):
                            param_col = i
                        elif any(term in header for term in ['flop', 'flops', 'mac', 'macs']):
                            flops_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract parameter count if available from dedicated column
                            param_count = None
                            if param_col is not None:
                                param_text = cells[param_col].text.strip()
                                param_count = extract_parameters_from_text(param_text)
                            
                            # Extract FLOPs if available from dedicated column
                            flops = None
                            if flops_col is not None:
                                flops_text = cells[flops_col].text.strip()
                                flops = extract_flops_from_text(flops_text)
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'parameters_millions': param_count,
                                    'flops_millions': flops,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'classification', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all classification models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'classification')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'classification', 'all_classification_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'classification', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting classification data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_classification_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No classification benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} classification benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Classification data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting classification data collection (20+ datapoints)...
Finding classification benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 65 potential classification benchmarks
Checking image-classification-on-imagenet...
image-classification-on-imagenet: 1058 datapoints
Added image-classification-on-imagenet with 1058 datapoints
Checking image-classification-on-cifar-10...
image-classification-on-cifar-10: 264 datapoints
Added image-classification-on-cifar-10 with 264 datapoints
Checking image-classification-on-cifar-100...
image-classification-on-cifar-100: 210 datapoints
Added image-classification-on-cifar-100 with 210 datapoints
Checking fine-grained-image-classification-on-oxford...
fine-grained-image-classification-on-oxford: 25 datapoints
Added fine-grained-image-classification-on-oxford with 25 datapoints
Checking fine-grained-image-classification-on-stanfor

In [2]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'object'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_object_benchmarks_with_20plus():
    """Get object benchmarks with at least 20 datapoints"""
    print("Finding object benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-moderate",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-easy",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-hard",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cyclists",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-pedestrians",
        "https://paperswithcode.com/sota/3d-object-detection-on-nuscenes"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main object page
        driver.get("https://paperswithcode.com/task/object")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "object" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential object benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'object', 'object_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'object', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'object', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all object models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'object')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'object', 'all_object_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'object', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting object task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_object_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No object benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} object benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Object task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting object task data collection (20+ datapoints)...
Finding object benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 6 potential object benchmarks
Checking 3d-object-detection-on-kitti-cars-moderate...
3d-object-detection-on-kitti-cars-moderate: 0 datapoints
Checking 3d-object-detection-on-kitti-cars-easy...
3d-object-detection-on-kitti-cars-easy: 26 datapoints
Added 3d-object-detection-on-kitti-cars-easy with 26 datapoints
Checking 3d-object-detection-on-kitti-cars-hard...
3d-object-detection-on-kitti-cars-hard: 25 datapoints
Added 3d-object-detection-on-kitti-cars-hard with 25 datapoints
Checking 3d-object-detection-on-kitti-cyclists...
3d-object-detection-on-kitti-cyclists: 13 datapoints
Checking 3d-object-detection-on-kitti-pedestrians...
3d-object-detection-on-kitti-pedestrians: 12 datapoints
Checking 3d-object-detection-on-nuscenes...
3d-object-detec

In [None]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'object'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_object_benchmarks_with_20plus():
    """Get object benchmarks with at least 20 datapoints"""
    print("Finding object benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-moderate",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-easy",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cars-hard",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-cyclists",
        "https://paperswithcode.com/sota/3d-object-detection-on-kitti-pedestrians",
        "https://paperswithcode.com/sota/3d-object-detection-on-nuscenes"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main object page
        driver.get("https://paperswithcode.com/task/object")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "object" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential object benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'object', 'object_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'object', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'object', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all object models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'object')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'object', 'all_object_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'object', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting object task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_object_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No object benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} object benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Object task data collection complete!")

if __name__ == "__main__":
    main()

In [None]:
import os
import re
import json
import time
import datetime
import pandas as pd
from typing import List, Dict, Any, Optional

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

class ImageClassificationScraper:
    def __init__(self):
        # Base directory for saving data
        self.home_dir = os.path.expanduser("~")
        self.data_dir = os.path.join(
            self.home_dir, 
            "Documents", 
            "Jupyter Notebooks", 
            "RA Task", 
            "tech_progress_data"
        )
        os.makedirs(self.data_dir, exist_ok=True)
        os.makedirs(os.path.join(self.data_dir, 'image'), exist_ok=True)

    def _setup_driver(self):
        """Initialize and return a Chrome WebDriver"""
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), 
            options=chrome_options
        )
        driver.set_page_load_timeout(30)
        return driver

    @staticmethod
    def _extract_date_from_text(text: str) -> Optional[datetime.date]:
        """Extract a date from text using various heuristics"""
        if not text:
            return None
        
        try:
            # Try direct parsing
            return parser.parse(text, fuzzy=True).date()
        except:
            # Look for year patterns
            year_match = re.search(r'20[0-2][0-9]', text)
            if year_match:
                year = int(year_match.group())
                # Look for month patterns
                month_match = re.search(
                    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', 
                    text, 
                    re.IGNORECASE
                )
                if month_match:
                    month_text = month_match.group().lower()
                    month_map = {
                        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                    }
                    for prefix, month_num in month_map.items():
                        if month_text.startswith(prefix):
                            return datetime.date(year, month_num, 1)
                
                # If only year was found, default to January
                return datetime.date(year, 1, 1)
        return None

    @staticmethod
    def _extract_parameters_from_text(text: str) -> Optional[float]:
        """Extract parameter count from model description"""
        if not text:
            return None
        
        # Common patterns for parameter counts (millions, billions)
        patterns = [
            r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
            r'(\d+\.?\d*)\s*[Mm]\s*parameters',
            r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
            r'(\d+\.?\d*)\s*[Bb]\s*parameters',
            r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
            r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
            r'params:\s*(\d+\.?\d*)\s*[Mm]',
            r'params:\s*(\d+\.?\d*)\s*[Bb]',
            r'(\d+\.?\d*)\s*[Mm]\s*params',
            r'(\d+\.?\d*)\s*[Bb]\s*params'
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                param_count = float(match.group(1))
                # Convert to millions
                if any(term in text.lower() for term in ['billion', 'b params', 'b parameters']):
                    param_count *= 1000  # Convert billions to millions
                return param_count
        
        return None

    def find_benchmarks(self) -> List[Dict[str, Any]]:
        """Find image classification benchmarks with at least 20 datapoints"""
        # Major benchmarks to start with
        major_benchmarks = [
            "https://paperswithcode.com/sota/image-classification-on-imagenet",
            "https://paperswithcode.com/sota/image-classification-on-cifar-10",
            "https://paperswithcode.com/sota/image-classification-on-cifar-100",
            "https://paperswithcode.com/sota/image-classification-on-caltech-101",
            "https://paperswithcode.com/sota/image-classification-on-stl-10"
        ]
        
        driver = self._setup_driver()
        benchmarks_with_data = []
        
        try:
            # Navigate to image classification page
            driver.get("https://paperswithcode.com/task/image-classification")
            time.sleep(3)
            
            # Collect benchmark links
            links = driver.find_elements(By.TAG_NAME, "a")
            benchmark_links = set(major_benchmarks)
            
            # Add additional benchmarks from page
            for link in links:
                href = link.get_attribute("href")
                if href and "/sota/" in href and "image-classification" in href.lower():
                    benchmark_links.add(href)
            
            # Process each benchmark
            for benchmark_url in benchmark_links:
                try:
                    benchmark_name = benchmark_url.split("/")[-1]
                    driver.get(benchmark_url)
                    time.sleep(3)
                    
                    # Find timeline link
                    timeline_url = None
                    timeline_links = driver.find_elements(By.TAG_NAME, "a")
                    for link in timeline_links:
                        href = link.get_attribute("href")
                        if href and "sota-over-time" in href:
                            timeline_url = href
                            break
                    
                    # Count datapoints
                    max_datapoints = 0
                    tables = driver.find_elements(By.TAG_NAME, "table")
                    for table in tables:
                        rows = table.find_elements(By.TAG_NAME, "tr")
                        datapoints = len(rows) - 1  # Subtract header
                        max_datapoints = max(max_datapoints, datapoints)
                    
                    # Add if 20+ datapoints
                    if max_datapoints >= 20:
                        benchmarks_with_data.append({
                            "name": benchmark_name,
                            "url": benchmark_url,
                            "timeline_url": timeline_url,
                            "datapoints": max_datapoints
                        })
                
                except Exception as e:
                    print(f"Error processing {benchmark_url}: {e}")
            
            # Save benchmark list
            if benchmarks_with_data:
                df = pd.DataFrame(benchmarks_with_data)
                df.to_csv(
                    os.path.join(self.data_dir, 'image', 'image_benchmarks_20plus.csv'), 
                    index=False
                )
        
        except Exception as e:
            print(f"Error finding benchmarks: {e}")
        
        finally:
            driver.quit()
        
        return benchmarks_with_data

    def scrape_benchmark_data(self, benchmark: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Scrape model data from a specific benchmark"""
        driver = self._setup_driver()
        models_data = []
        
        try:
            # Use timeline URL if available
            url = benchmark.get('timeline_url', benchmark['url'])
            driver.get(url)
            time.sleep(3)
            
            # Find and extract data
            scripts = driver.find_elements(By.TAG_NAME, "script")
            json_data_found = False
            
            for script in scripts:
                script_text = script.get_attribute("innerHTML")
                if "window.INITIAL_DATA" in script_text:
                    json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                    if json_match:
                        try:
                            data = json.loads(json_match.group(1))
                            
                            # Save raw JSON
                            benchmark_dir = os.path.join(self.data_dir, 'image', benchmark['name'])
                            os.makedirs(benchmark_dir, exist_ok=True)
                            with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                                json.dump(data, f, indent=2)
                            
                            # Process SOTA data
                            if 'sota' in data and 'evaluations' in data['sota']:
                                json_data_found = True
                                for eval_data in data['sota']['evaluations']:
                                    # Extract publication date
                                    date_str = eval_data.get('date')
                                    pub_date = self._extract_date_from_text(date_str) if date_str else None
                                    
                                    # Extract model info
                                    model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                    model_desc = eval_data.get('model', {}).get('description', '')
                                    
                                    # Only add if we have date and model name
                                    if pub_date and model_name:
                                        model_entry = {
                                            'dataset': benchmark['name'],
                                            'model_name': model_name,
                                            'paper_title': eval_data.get('paper', {}).get('title', ''),
                                            'paper_url': eval_data.get('paper', {}).get('url', ''),
                                            'code_url': eval_data.get('code', {}).get('url', '') if eval_data.get('code') else '',
                                            'description': model_desc,
                                            'parameters_millions': self._extract_parameters_from_text(model_desc),
                                            'date': pub_date,
                                            'year': pub_date.year,
                                            'month': pub_date.month,
                                            'day': pub_date.day,
                                            **{
                                                metric.get('name'): float(metric.get('value', '').replace('%', ''))
                                                for metric in eval_data.get('metrics', [])
                                                if metric.get('name') and metric.get('value')
                                            }
                                        }
                                        models_data.append(model_entry)
                        except Exception as e:
                            print(f"Error parsing JSON data: {e}")
            
            # If no JSON data, try table extraction (fallback method)
            if not json_data_found:
                tables = driver.find_elements(By.TAG_NAME, "table")
                for table in tables:
                    try:
                        rows = table.find_elements(By.TAG_NAME, "tr")
                        if len(rows) <= 1:
                            continue
                        
                        # Extract header information
                        headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                        date_col = next((i for i, h in enumerate(headers) if any(term in h for term in ['date', 'published', 'year'])), None)
                        model_col = next((i for i, h in enumerate(headers) if any(term in h for term in ['model', 'method', 'name'])), None)
                        
                        # Process rows if we have date and model columns
                        if date_col is not None and model_col is not None:
                            for row in rows[1:]:
                                cells = row.find_elements(By.TAG_NAME, "td")
                                if len(cells) != len(headers):
                                    continue
                                
                                # Extract date and model name
                                date_text = cells[date_col].text.strip()
                                pub_date = self._extract_date_from_text(date_text)
                                model_name = cells[model_col].text.strip()
                                
                                # Get paper link
                                paper_url = ""
                                paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                                if paper_links:
                                    paper_url = paper_links[0].get_attribute("href")
                                
                                # Extract metrics
                                metrics = {}
                                for i, header in enumerate(headers):
                                    if header not in ['paper', 'code', 'link', headers[date_col], headers[model_col]]:
                                        value = cells[i].text.strip()
                                        try:
                                            metrics[header] = float(value.replace('%', ''))
                                        except:
                                            metrics[header] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_url': paper_url,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error processing table: {e}")
            
            # Save models for this benchmark
            if models_data:
                benchmark_dir = os.path.join(self.data_dir, 'image', benchmark['name'])
                os.makedirs(benchmark_dir, exist_ok=True)
                
                models_df = pd.DataFrame(models_data)
                models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
        
        except Exception as e:
            print(f"Error scraping benchmark data: {e}")
        
        finally:
            driver.quit()
        
        return models_data

    def combine_models(self):
        """Combine all collected models into a single dataset"""
        try:
            # Collect all model CSV files
            all_models = []
            for root, _, files in os.walk(os.path.join(self.data_dir, 'image')):
                for file in files:
                    if file == "models.csv":
                        file_path = os.path.join(root, file)
                        try:
                            df = pd.read_csv(file_path)
                            all_models.append(df)
                        except Exception as e:
                            print(f"Error reading {file_path}: {e}")
# Combine and process models
            if all_models:
                combined_df = pd.concat(all_models, ignore_index=True)
                
                # Ensure date is properly formatted
                if 'date' in combined_df.columns:
                    combined_df['date'] = pd.to_datetime(combined_df['date'])
                
                # Sort by date
                combined_df = combined_df.sort_values('date')
                
                # Save combined models
                combined_output_path = os.path.join(self.data_dir, 'image', 'all_image_models.csv')
                combined_df.to_csv(combined_output_path, index=False)
                print(f"Combined {len(combined_df)} models from all benchmarks")
                
                # Create yearly summary
                yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
                yearly_summary_path = os.path.join(self.data_dir, 'image', 'yearly_model_count.csv')
                yearly_summary.to_csv(yearly_summary_path, index=False)
                print("Created yearly model count summary")
        
        except Exception as e:
            print(f"Error combining models: {e}")

def main():
    """Main execution method for image classification data collection"""
    print("Starting image classification data collection (20+ datapoints)...")
    
    # Initialize scraper
    scraper = ImageClassificationScraper()
    
    # Find benchmarks with 20+ datapoints
    benchmarks = scraper.find_benchmarks()
    
    if not benchmarks:
        print("No image classification benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} image classification benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            # Scrape models for the benchmark
            models = scraper.scrape_benchmark_data(benchmark)
            
            if models:
                all_models_count += len(models)
            
            # Pause between benchmarks to be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all collected models
    scraper.combine_models()
    
    print("Image classification data collection complete!")

if __name__ == "__main__":
    main()

Starting image classification data collection (20+ datapoints)...


In [1]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'text_classification'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_text_classification_benchmarks_with_20plus():
    """Get text classification benchmarks with at least 20 datapoints"""
    print("Finding text classification benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/text-classification-on-imdb",
        "https://paperswithcode.com/sota/text-classification-on-ag-news",
        "https://paperswithcode.com/sota/text-classification-on-trec-fine-grained",
        "https://paperswithcode.com/sota/text-classification-on-dbpedia",
        "https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary",
        "https://paperswithcode.com/sota/text-classification-on-sst-5"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main text classification page
        driver.get("https://paperswithcode.com/task/text-classification")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("text-classification" in href.lower() or "sentiment" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential text classification benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'text_classification', 'text_classification_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'text_classification', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'text_classification', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all text classification models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'text_classification')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'text_classification', 'all_text_classification_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'text_classification', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting text classification task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_text_classification_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No text classification benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} text classification benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Text classification task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting text classification task data collection (20+ datapoints)...
Finding text classification benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 69 potential text classification benchmarks
Checking text-classification-on-imdb...
text-classification-on-imdb: 0 datapoints
Checking text-classification-on-ag-news...
text-classification-on-ag-news: 21 datapoints
Added text-classification-on-ag-news with 21 datapoints
Checking text-classification-on-trec-fine-grained...
text-classification-on-trec-fine-grained: 0 datapoints
Checking text-classification-on-dbpedia...
text-classification-on-dbpedia: 21 datapoints
Added text-classification-on-dbpedia with 21 datapoints
Checking sentiment-analysis-on-sst-2-binary...
sentiment-analysis-on-sst-2-binary: 88 datapoints
Added sentiment-analysis-on-sst-2-binary with 88 datapoints
Checking text-classification-on-sst-5...
tex

In [2]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'image_generation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_image_generation_benchmarks_with_20plus():
    """Get image generation benchmarks with at least 20 datapoints"""
    print("Finding image generation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/image-generation-on-ms-coco",
        "https://paperswithcode.com/sota/image-generation-on-cifar-10",
        "https://paperswithcode.com/sota/image-generation-on-celeba-64x64",
        "https://paperswithcode.com/sota/image-generation-on-lsun-bedroom-256-x-256",
        "https://paperswithcode.com/sota/image-generation-on-imagenet-64x64",
        "https://paperswithcode.com/sota/image-generation-on-stl-10"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main image generation page
        driver.get("https://paperswithcode.com/task/image-generation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("image-generation" in href.lower() or "gan" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential image generation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'image_generation', 'image_generation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'image_generation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'image_generation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all image generation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'image_generation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'image_generation', 'all_image_generation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'image_generation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting image generation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_image_generation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No image generation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} image generation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Image generation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting image generation task data collection (20+ datapoints)...
Finding image generation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 91 potential image generation benchmarks
Checking image-generation-on-ms-coco...
image-generation-on-ms-coco: 0 datapoints
Checking image-generation-on-cifar-10...
image-generation-on-cifar-10: 70 datapoints
Added image-generation-on-cifar-10 with 70 datapoints
Checking image-generation-on-celeba-64x64...
image-generation-on-celeba-64x64: 39 datapoints
Added image-generation-on-celeba-64x64 with 39 datapoints
Checking image-generation-on-lsun-bedroom-256-x-256...
image-generation-on-lsun-bedroom-256-x-256: 32 datapoints
Added image-generation-on-lsun-bedroom-256-x-256 with 32 datapoints
Checking image-generation-on-imagenet-64x64...
image-generation-on-imagenet-64x64: 64 datapoints
Added image-generation-on-imagenet-64x64

In [3]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'domain_adaptation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_domain_adaptation_benchmarks_with_20plus():
    """Get domain adaptation benchmarks with at least 20 datapoints"""
    print("Finding domain adaptation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/domain-adaptation-on-office-31",
        "https://paperswithcode.com/sota/domain-adaptation-on-digits",
        "https://paperswithcode.com/sota/domain-adaptation-on-visda-2017",
        "https://paperswithcode.com/sota/domain-adaptation-on-office-home",
        "https://paperswithcode.com/sota/domain-adaptation-on-imageclef",
        "https://paperswithcode.com/sota/domain-adaptation-on-office-caltech"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main domain adaptation page
        driver.get("https://paperswithcode.com/task/domain-adaptation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "domain-adaptation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential domain adaptation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'domain_adaptation', 'domain_adaptation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'domain_adaptation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'domain_adaptation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all domain adaptation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'domain_adaptation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'domain_adaptation', 'all_domain_adaptation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'domain_adaptation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting domain adaptation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_domain_adaptation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No domain adaptation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} domain adaptation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Domain adaptation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting domain adaptation task data collection (20+ datapoints)...
Finding domain adaptation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 61 potential domain adaptation benchmarks
Checking domain-adaptation-on-office-31...
domain-adaptation-on-office-31: 40 datapoints
Added domain-adaptation-on-office-31 with 40 datapoints
Checking domain-adaptation-on-digits...
domain-adaptation-on-digits: 0 datapoints
Checking domain-adaptation-on-visda-2017...
domain-adaptation-on-visda-2017: 0 datapoints
Checking domain-adaptation-on-office-home...
domain-adaptation-on-office-home: 29 datapoints
Added domain-adaptation-on-office-home with 29 datapoints
Checking domain-adaptation-on-imageclef...
domain-adaptation-on-imageclef: 0 datapoints
Checking domain-adaptation-on-office-caltech...
domain-adaptation-on-office-caltech: 8 datapoints
Checking domain-adaptation-on-syn

In [4]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'domain_adaptation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_domain_adaptation_benchmarks_with_20plus():
    """Get domain adaptation benchmarks with at least 20 datapoints"""
    print("Finding domain adaptation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/domain-adaptation-on-office-31",
        "https://paperswithcode.com/sota/domain-adaptation-on-digits",
        "https://paperswithcode.com/sota/domain-adaptation-on-visda-2017",
        "https://paperswithcode.com/sota/domain-adaptation-on-office-home",
        "https://paperswithcode.com/sota/domain-adaptation-on-imageclef",
        "https://paperswithcode.com/sota/domain-adaptation-on-office-caltech"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main domain adaptation page
        driver.get("https://paperswithcode.com/task/domain-adaptation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "domain-adaptation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential domain adaptation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'domain_adaptation', 'domain_adaptation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'domain_adaptation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'domain_adaptation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all domain adaptation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'domain_adaptation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'domain_adaptation', 'all_domain_adaptation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'domain_adaptation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting domain adaptation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_domain_adaptation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No domain adaptation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} domain adaptation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Domain adaptation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting domain adaptation task data collection (20+ datapoints)...
Finding domain adaptation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 61 potential domain adaptation benchmarks
Checking domain-adaptation-on-office-31...
domain-adaptation-on-office-31: 40 datapoints
Added domain-adaptation-on-office-31 with 40 datapoints
Checking domain-adaptation-on-digits...
domain-adaptation-on-digits: 0 datapoints
Checking domain-adaptation-on-visda-2017...
domain-adaptation-on-visda-2017: 0 datapoints
Checking domain-adaptation-on-office-home...
domain-adaptation-on-office-home: 29 datapoints
Added domain-adaptation-on-office-home with 29 datapoints
Checking domain-adaptation-on-imageclef...
domain-adaptation-on-imageclef: 0 datapoints
Checking domain-adaptation-on-office-caltech...
domain-adaptation-on-office-caltech: 8 datapoints
Checking domain-adaptation-on-syn

In [5]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'data_augmentation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_data_augmentation_benchmarks_with_20plus():
    """Get data augmentation benchmarks with at least 20 datapoints"""
    print("Finding data augmentation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/data-augmentation-on-cifar-10",
        "https://paperswithcode.com/sota/data-augmentation-on-cifar-100",
        "https://paperswithcode.com/sota/data-augmentation-on-imagenet",
        "https://paperswithcode.com/sota/data-augmentation-on-svhn",
        "https://paperswithcode.com/sota/data-augmentation-on-tiny-imagenet-200",
        "https://paperswithcode.com/sota/data-augmentation-on-mnist"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main data augmentation page
        driver.get("https://paperswithcode.com/task/data-augmentation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "data-augmentation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential data augmentation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'data_augmentation', 'data_augmentation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'data_augmentation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'data_augmentation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all data augmentation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'data_augmentation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'data_augmentation', 'all_data_augmentation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'data_augmentation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting data augmentation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_data_augmentation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No data augmentation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} data augmentation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Data augmentation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting data augmentation task data collection (20+ datapoints)...
Finding data augmentation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 7 potential data augmentation benchmarks
Checking data-augmentation-on-cifar-10...
data-augmentation-on-cifar-10: 5 datapoints
Checking data-augmentation-on-cifar-100...
data-augmentation-on-cifar-100: 0 datapoints
Checking data-augmentation-on-imagenet...
data-augmentation-on-imagenet: 17 datapoints
Checking data-augmentation-on-svhn...
data-augmentation-on-svhn: 0 datapoints
Checking data-augmentation-on-tiny-imagenet-200...
data-augmentation-on-tiny-imagenet-200: 0 datapoints
Checking data-augmentation-on-mnist...
data-augmentation-on-mnist: 0 datapoints
Checking data-augmentation-on-ga1457...
data-augmentation-on-ga1457: 4 datapoints
No data augmentation benchmarks with 20+ datapoints found. Exiting.


In [6]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'retrieval'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_retrieval_benchmarks_with_20plus():
    """Get retrieval benchmarks with at least 20 datapoints"""
    print("Finding retrieval benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/image-retrieval-on-oxford-5k",
        "https://paperswithcode.com/sota/image-retrieval-on-paris-6k",
        "https://paperswithcode.com/sota/cross-modal-retrieval-on-ms-coco-1k-test",
        "https://paperswithcode.com/sota/image-retrieval-on-roxford-medium",
        "https://paperswithcode.com/sota/image-retrieval-on-rparis-medium",
        "https://paperswithcode.com/sota/document-retrieval-on-robust04"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main retrieval page
        driver.get("https://paperswithcode.com/task/retrieval")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("retrieval" in href.lower() or "search" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential retrieval benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'retrieval', 'retrieval_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'retrieval', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'retrieval', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all retrieval models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'retrieval')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'retrieval', 'all_retrieval_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'retrieval', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting retrieval task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_retrieval_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No retrieval benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} retrieval benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Retrieval task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting retrieval task data collection (20+ datapoints)...
Finding retrieval benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 14 potential retrieval benchmarks
Checking image-retrieval-on-oxford-5k...
image-retrieval-on-oxford-5k: 0 datapoints
Checking image-retrieval-on-paris-6k...
image-retrieval-on-paris-6k: 0 datapoints
Checking cross-modal-retrieval-on-ms-coco-1k-test...
cross-modal-retrieval-on-ms-coco-1k-test: 0 datapoints
Checking image-retrieval-on-roxford-medium...
image-retrieval-on-roxford-medium: 23 datapoints
Added image-retrieval-on-roxford-medium with 23 datapoints
Checking image-retrieval-on-rparis-medium...
image-retrieval-on-rparis-medium: 23 datapoints
Added image-retrieval-on-rparis-medium with 23 datapoints
Checking document-retrieval-on-robust04...
document-retrieval-on-robust04: 0 datapoints
Checking retrieval-on-quora-question-pairs..

In [7]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'denoising'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_denoising_benchmarks_with_20plus():
    """Get denoising benchmarks with at least 20 datapoints"""
    print("Finding denoising benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/image-denoising-on-cbsd68-sigma15",
        "https://paperswithcode.com/sota/image-denoising-on-cbsd68-sigma25",
        "https://paperswithcode.com/sota/image-denoising-on-cbsd68-sigma50",
        "https://paperswithcode.com/sota/denoising-on-set12-sigma15",
        "https://paperswithcode.com/sota/denoising-on-set12-sigma25",
        "https://paperswithcode.com/sota/denoising-on-set12-sigma50"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main denoising page
        driver.get("https://paperswithcode.com/task/denoising")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "denoising" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential denoising benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'denoising', 'denoising_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'denoising', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'denoising', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all denoising models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'denoising')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'denoising', 'all_denoising_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'denoising', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting denoising task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_denoising_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No denoising benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} denoising benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Denoising task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting denoising task data collection (20+ datapoints)...
Finding denoising benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 12 potential denoising benchmarks
Checking image-denoising-on-cbsd68-sigma15...
image-denoising-on-cbsd68-sigma15: 0 datapoints
Checking image-denoising-on-cbsd68-sigma25...
image-denoising-on-cbsd68-sigma25: 0 datapoints
Checking image-denoising-on-cbsd68-sigma50...
image-denoising-on-cbsd68-sigma50: 0 datapoints
Checking denoising-on-set12-sigma15...
denoising-on-set12-sigma15: 0 datapoints
Checking denoising-on-set12-sigma25...
denoising-on-set12-sigma25: 0 datapoints
Checking denoising-on-set12-sigma50...
denoising-on-set12-sigma50: 0 datapoints
Checking denoising-on-darmstadt-noise-dataset...
denoising-on-darmstadt-noise-dataset: 10 datapoints
Checking denoising-on-aapm...
denoising-on-aapm: 1 datapoints
Checking denoising-on-iris

In [9]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'anomaly_detection'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_anomaly_detection_benchmarks_with_20plus():
    """Get anomaly detection benchmarks with at least 20 datapoints"""
    print("Finding anomaly detection benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/anomaly-detection-on-mvtec",
        "https://paperswithcode.com/sota/anomaly-detection-on-mvtec-ad",
        "https://paperswithcode.com/sota/anomaly-detection-on-cifar-10",
        "https://paperswithcode.com/sota/anomaly-detection-on-kdd-cup-99",
        "https://paperswithcode.com/sota/anomaly-detection-on-ucsd-ped1",
        "https://paperswithcode.com/sota/anomaly-detection-on-ucsd-ped2"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main anomaly detection page
        driver.get("https://paperswithcode.com/task/anomaly-detection")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "anomaly" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential anomaly detection benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'anomaly_detection', 'anomaly_detection_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'anomaly_detection', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'anomaly_detection', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all anomaly detection models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'anomaly_detection')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'anomaly_detection', 'all_anomaly_detection_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'anomaly_detection', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting anomaly detection task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_anomaly_detection_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No anomaly detection benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} anomaly detection benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Anomaly detection task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting anomaly detection task data collection (20+ datapoints)...
Finding anomaly detection benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 78 potential anomaly detection benchmarks
Checking anomaly-detection-on-mvtec...
anomaly-detection-on-mvtec: 0 datapoints
Checking anomaly-detection-on-mvtec-ad...
anomaly-detection-on-mvtec-ad: 144 datapoints
Added anomaly-detection-on-mvtec-ad with 144 datapoints
Checking anomaly-detection-on-cifar-10...
anomaly-detection-on-cifar-10: 1 datapoints
Checking anomaly-detection-on-kdd-cup-99...
anomaly-detection-on-kdd-cup-99: 0 datapoints
Checking anomaly-detection-on-ucsd-ped1...
anomaly-detection-on-ucsd-ped1: 0 datapoints
Checking anomaly-detection-on-ucsd-ped2...
anomaly-detection-on-ucsd-ped2: 13 datapoints
Checking anomaly-detection-on-visa...
anomaly-detection-on-visa: 47 datapoints
Added anomaly-detection-on-visa

In [10]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'sentiment_analysis'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_sentiment_analysis_benchmarks_with_20plus():
    """Get sentiment analysis benchmarks with at least 20 datapoints"""
    print("Finding sentiment analysis benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary",
        "https://paperswithcode.com/sota/sentiment-analysis-on-imdb",
        "https://paperswithcode.com/sota/sentiment-analysis-on-sst-5-fine-grained",
        "https://paperswithcode.com/sota/sentiment-analysis-on-yelp-binary",
        "https://paperswithcode.com/sota/sentiment-analysis-on-yelp-fine-grained",
        "https://paperswithcode.com/sota/aspect-based-sentiment-analysis-on-semeval"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main sentiment analysis page
        driver.get("https://paperswithcode.com/task/sentiment-analysis")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "sentiment" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential sentiment analysis benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'sentiment_analysis', 'sentiment_analysis_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'sentiment_analysis', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'sentiment_analysis', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all sentiment analysis models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'sentiment_analysis')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'sentiment_analysis', 'all_sentiment_analysis_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'sentiment_analysis', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting sentiment analysis task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_sentiment_analysis_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No sentiment analysis benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} sentiment analysis benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Sentiment analysis task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting sentiment analysis task data collection (20+ datapoints)...
Finding sentiment analysis benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 43 potential sentiment analysis benchmarks
Checking sentiment-analysis-on-sst-2-binary...
sentiment-analysis-on-sst-2-binary: 88 datapoints
Added sentiment-analysis-on-sst-2-binary with 88 datapoints
Checking sentiment-analysis-on-imdb...
sentiment-analysis-on-imdb: 48 datapoints
Added sentiment-analysis-on-imdb with 48 datapoints
Checking sentiment-analysis-on-sst-5-fine-grained...
sentiment-analysis-on-sst-5-fine-grained: 30 datapoints
Added sentiment-analysis-on-sst-5-fine-grained with 30 datapoints
Checking sentiment-analysis-on-yelp-binary...
sentiment-analysis-on-yelp-binary: 20 datapoints
Added sentiment-analysis-on-yelp-binary with 20 datapoints
Checking sentiment-analysis-on-yelp-fine-grained...
sentiment-ana

In [11]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'medical_image_segmentation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_medical_image_segmentation_benchmarks_with_20plus():
    """Get medical image segmentation benchmarks with at least 20 datapoints"""
    print("Finding medical image segmentation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/medical-image-segmentation-on-brats",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-isic-2017",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-isic-2018",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-promise12",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-kvasir-seg",
        "https://paperswithcode.com/sota/brain-tumor-segmentation-on-brats-2018"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main medical image segmentation page
        driver.get("https://paperswithcode.com/task/medical-image-segmentation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("medical" in href.lower() or "segmentation" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential medical image segmentation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'medical_image_segmentation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'medical_image_segmentation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'medical_image_segmentation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all medical image segmentation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'medical_image_segmentation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'all_medical_image_segmentation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting medical image segmentation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_medical_image_segmentation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No medical image segmentation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} medical image segmentation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Medical image segmentation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting medical image segmentation task data collection (20+ datapoints)...
Finding medical image segmentation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 51 potential medical image segmentation benchmarks
Checking medical-image-segmentation-on-brats...
medical-image-segmentation-on-brats: 0 datapoints
Checking medical-image-segmentation-on-isic-2017...
medical-image-segmentation-on-isic-2017: 0 datapoints
Checking medical-image-segmentation-on-isic-2018...
medical-image-segmentation-on-isic-2018: 2 datapoints
Checking medical-image-segmentation-on-promise12...
medical-image-segmentation-on-promise12: 1 datapoints
Checking medical-image-segmentation-on-kvasir-seg...
medical-image-segmentation-on-kvasir-seg: 56 datapoints
Added medical-image-segmentation-on-kvasir-seg with 56 datapoints
Checking brain-tumor-segmentation-on-brats-2018...
brain-tumor-segmen

In [12]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'few_shot_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_few_shot_learning_benchmarks_with_20plus():
    """Get few-shot learning benchmarks with at least 20 datapoints"""
    print("Finding few-shot learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/few-shot-image-classification-on-meta-dataset",
        "https://paperswithcode.com/sota/few-shot-image-classification-on-omniglot",
        "https://paperswithcode.com/sota/few-shot-image-classification-on-mini",
        "https://paperswithcode.com/sota/few-shot-image-classification-on-tiered",
        "https://paperswithcode.com/sota/few-shot-image-classification-on-cifarfs",
        "https://paperswithcode.com/sota/few-shot-image-classification-on-fc100"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main few-shot learning page
        driver.get("https://paperswithcode.com/task/few-shot-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("few-shot" in href.lower() or "fewshot" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential few-shot learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'few_shot_learning', 'few_shot_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'few_shot_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'few_shot_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all few-shot learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'few_shot_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'few_shot_learning', 'all_few_shot_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'few_shot_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting few-shot learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_few_shot_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No few-shot learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} few-shot learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Few-shot learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting few-shot learning task data collection (20+ datapoints)...
Finding few-shot learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 33 potential few-shot learning benchmarks
Checking few-shot-image-classification-on-meta-dataset...
few-shot-image-classification-on-meta-dataset: 22 datapoints
Added few-shot-image-classification-on-meta-dataset with 22 datapoints
Checking few-shot-image-classification-on-omniglot...
few-shot-image-classification-on-omniglot: 2 datapoints
Checking few-shot-image-classification-on-mini...
few-shot-image-classification-on-mini: 0 datapoints
Checking few-shot-image-classification-on-tiered...
few-shot-image-classification-on-tiered: 49 datapoints
Added few-shot-image-classification-on-tiered with 49 datapoints
Checking few-shot-image-classification-on-cifarfs...
few-shot-image-classification-on-cifarfs: 0 datapoints
Check

In [13]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'action_recognition'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_action_recognition_benchmarks_with_20plus():
    """Get action recognition benchmarks with at least 20 datapoints"""
    print("Finding action recognition benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-kinetics-400",
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-hmdb-51",
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-ucf101",
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-kinetics-600",
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-kinetics-700",
        "https://paperswithcode.com/sota/action-recognition-in-videos-on-something"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main action recognition page
        driver.get("https://paperswithcode.com/task/action-recognition-in-videos")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("action-recognition" in href.lower() or "video" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential action recognition benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'action_recognition', 'action_recognition_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'action_recognition', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'action_recognition', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all action recognition models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'action_recognition')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'action_recognition', 'all_action_recognition_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'action_recognition', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting action recognition task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_action_recognition_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No action recognition benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} action recognition benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Action recognition task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting action recognition task data collection (20+ datapoints)...
Finding action recognition benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 58 potential action recognition benchmarks
Checking action-recognition-in-videos-on-kinetics-400...
action-recognition-in-videos-on-kinetics-400: 0 datapoints
Checking action-recognition-in-videos-on-hmdb-51...
action-recognition-in-videos-on-hmdb-51: 76 datapoints
Added action-recognition-in-videos-on-hmdb-51 with 76 datapoints
Checking action-recognition-in-videos-on-ucf101...
action-recognition-in-videos-on-ucf101: 90 datapoints
Added action-recognition-in-videos-on-ucf101 with 90 datapoints
Checking action-recognition-in-videos-on-kinetics-600...
action-recognition-in-videos-on-kinetics-600: 1 datapoints
Checking action-recognition-in-videos-on-kinetics-700...
action-recognition-in-videos-on-kinetics-700: 0 datapo

In [1]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'language_modelling'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_language_modelling_benchmarks_with_20plus():
    """Get language modelling benchmarks with at least 20 datapoints"""
    print("Finding language modelling benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/language-modelling-on-penn-treebank-word",
        "https://paperswithcode.com/sota/language-modelling-on-wikitext-103",
        "https://paperswithcode.com/sota/language-modelling-on-wikitext-2",
        "https://paperswithcode.com/sota/language-modelling-on-1b-word",
        "https://paperswithcode.com/sota/language-modelling-on-lambada",
        "https://paperswithcode.com/sota/language-modelling-on-ptb-character"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main language modelling page
        driver.get("https://paperswithcode.com/task/language-modelling")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "language-modelling" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential language modelling benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'language_modelling', 'language_modelling_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'language_modelling', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'language_modelling', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all language modelling models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'language_modelling')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'language_modelling', 'all_language_modelling_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'language_modelling', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting language modelling task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_language_modelling_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No language modelling benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} language modelling benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Language modelling task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting language modelling task data collection (20+ datapoints)...
Finding language modelling benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 57 potential language modelling benchmarks
Checking language-modelling-on-penn-treebank-word...
language-modelling-on-penn-treebank-word: 43 datapoints
Added language-modelling-on-penn-treebank-word with 43 datapoints
Checking language-modelling-on-wikitext-103...
language-modelling-on-wikitext-103: 89 datapoints
Added language-modelling-on-wikitext-103 with 89 datapoints
Checking language-modelling-on-wikitext-2...
language-modelling-on-wikitext-2: 38 datapoints
Added language-modelling-on-wikitext-2 with 38 datapoints
Checking language-modelling-on-1b-word...
language-modelling-on-1b-word: 0 datapoints
Checking language-modelling-on-lambada...
language-modelling-on-lambada: 37 datapoints
Added language-modelling-on-

In [2]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'representation_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_representation_learning_benchmarks_with_20plus():
    """Get representation learning benchmarks with at least 20 datapoints"""
    print("Finding representation learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/representation-learning-on-imagenet",
        "https://paperswithcode.com/sota/representation-learning-on-cifar-10",
        "https://paperswithcode.com/sota/representation-learning-on-cifar-100",
        "https://paperswithcode.com/sota/representation-learning-on-stl-10",
        "https://paperswithcode.com/sota/representation-learning-on-coco"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main representation learning page
        driver.get("https://paperswithcode.com/task/representation-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "representation-learning" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential representation learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'representation_learning', 'representation_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'representation_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'representation_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all representation learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'representation_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'representation_learning', 'all_representation_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'representation_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting representation learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_representation_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No representation learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} representation learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Representation learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting representation learning task data collection (20+ datapoints)...
Finding representation learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 10 potential representation learning benchmarks
Checking representation-learning-on-imagenet...
representation-learning-on-imagenet: 0 datapoints
Checking representation-learning-on-cifar-10...
representation-learning-on-cifar-10: 0 datapoints
Checking representation-learning-on-cifar-100...
representation-learning-on-cifar-100: 0 datapoints
Checking representation-learning-on-stl-10...
representation-learning-on-stl-10: 0 datapoints
Checking representation-learning-on-coco...
representation-learning-on-coco: 0 datapoints
Checking representation-learning-on-scidocs...
representation-learning-on-scidocs: 7 datapoints
Checking representation-learning-on-circle-data...
representation-learning-on-circle-data: 1 

In [3]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'classification'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_classification_benchmarks_with_20plus():
    """Get classification benchmarks with at least 20 datapoints"""
    print("Finding classification benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/image-classification-on-imagenet",
        "https://paperswithcode.com/sota/image-classification-on-cifar-10",
        "https://paperswithcode.com/sota/image-classification-on-cifar-100",
        "https://paperswithcode.com/sota/text-classification-on-imdb",
        "https://paperswithcode.com/sota/text-classification-on-yelp-5"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main classification page
        driver.get("https://paperswithcode.com/task/classification-1")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "classification" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential classification benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'classification', 'classification_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'classification', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'classification', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all classification models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'classification')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'classification', 'all_classification_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'classification', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting classification task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_classification_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No classification benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} classification benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Classification task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting classification task data collection (20+ datapoints)...
Finding classification benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 63 potential classification benchmarks
Checking image-classification-on-imagenet...
image-classification-on-imagenet: 1060 datapoints
Added image-classification-on-imagenet with 1060 datapoints
Checking image-classification-on-cifar-10...
image-classification-on-cifar-10: 264 datapoints
Added image-classification-on-cifar-10 with 264 datapoints
Checking image-classification-on-cifar-100...
image-classification-on-cifar-100: 210 datapoints
Added image-classification-on-cifar-100 with 210 datapoints
Checking text-classification-on-imdb...
text-classification-on-imdb: 0 datapoints
Checking text-classification-on-yelp-5...
text-classification-on-yelp-5: 7 datapoints
Checking classification-on-n-imagenet...
classification-on-n-ima

In [4]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'text_classification'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_text_classification_benchmarks_with_20plus():
    """Get text classification benchmarks with at least 20 datapoints"""
    print("Finding text classification benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/text-classification-on-imdb",
        "https://paperswithcode.com/sota/text-classification-on-yelp-5",
        "https://paperswithcode.com/sota/text-classification-on-ag-news",
        "https://paperswithcode.com/sota/text-classification-on-dbpedia",
        "https://paperswithcode.com/sota/text-classification-on-sst-2-binary"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main text classification page
        driver.get("https://paperswithcode.com/task/text-classification")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "text-classification" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential text classification benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'text_classification', 'text_classification_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'text_classification', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'text_classification', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all text classification models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'text_classification')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'text_classification', 'all_text_classification_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'text_classification', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting text classification task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_text_classification_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No text classification benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} text classification benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Text classification task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting text classification task data collection (20+ datapoints)...
Finding text classification benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 67 potential text classification benchmarks
Checking text-classification-on-imdb...
text-classification-on-imdb: 0 datapoints
Checking text-classification-on-yelp-5...
text-classification-on-yelp-5: 7 datapoints
Checking text-classification-on-ag-news...
text-classification-on-ag-news: 21 datapoints
Added text-classification-on-ag-news with 21 datapoints
Checking text-classification-on-dbpedia...
text-classification-on-dbpedia: 21 datapoints
Added text-classification-on-dbpedia with 21 datapoints
Checking text-classification-on-sst-2-binary...
text-classification-on-sst-2-binary: 0 datapoints
Checking text-classification-on-mteb...
text-classification-on-mteb: 31 datapoints
Added text-classification-on-mteb with 31 

In [5]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'deep_reinforcement_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_deep_reinforcement_learning_benchmarks_with_20plus():
    """Get deep reinforcement learning benchmarks with at least 20 datapoints"""
    print("Finding deep reinforcement learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/atari-games-on-atari-2600-breakout",
        "https://paperswithcode.com/sota/atari-games-on-atari-2600-pong",
        "https://paperswithcode.com/sota/atari-games-on-atari-2600-seaquest",
        "https://paperswithcode.com/sota/continuous-control-on-mujoco",
        "https://paperswithcode.com/sota/atari-games-on-atari-2600-qbert"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main deep reinforcement learning page
        driver.get("https://paperswithcode.com/task/deep-reinforcement-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("reinforcement-learning" in href.lower() or "atari" in href.lower() or "mujoco" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential deep reinforcement learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'deep_reinforcement_learning', 'deep_reinforcement_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'deep_reinforcement_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'deep_reinforcement_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all deep reinforcement learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'deep_reinforcement_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'deep_reinforcement_learning', 'all_deep_reinforcement_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'deep_reinforcement_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting deep reinforcement learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_deep_reinforcement_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No deep reinforcement learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} deep reinforcement learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Deep reinforcement learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting deep reinforcement learning task data collection (20+ datapoints)...
Finding deep reinforcement learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 5 potential deep reinforcement learning benchmarks
Checking atari-games-on-atari-2600-breakout...
atari-games-on-atari-2600-breakout: 58 datapoints
Added atari-games-on-atari-2600-breakout with 58 datapoints
Checking atari-games-on-atari-2600-pong...
atari-games-on-atari-2600-pong: 52 datapoints
Added atari-games-on-atari-2600-pong with 52 datapoints
Checking atari-games-on-atari-2600-seaquest...
atari-games-on-atari-2600-seaquest: 57 datapoints
Added atari-games-on-atari-2600-seaquest with 57 datapoints
Checking continuous-control-on-mujoco...
continuous-control-on-mujoco: 0 datapoints
Checking atari-games-on-atari-2600-qbert...
atari-games-on-atari-2600-qbert: 57 datapoints
Added atari-games-on-ata

In [6]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'retrieval'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_retrieval_benchmarks_with_20plus():
    """Get retrieval benchmarks with at least 20 datapoints"""
    print("Finding retrieval benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/zero-shot-text-retrieval-on-ms-marco",
        "https://paperswithcode.com/sota/image-retrieval-on-oxford-5k",
        "https://paperswithcode.com/sota/image-retrieval-on-paris-6k",
        "https://paperswithcode.com/sota/text-retrieval-on-natural-questions",
        "https://paperswithcode.com/sota/text-retrieval-on-ms-marco"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main retrieval page
        driver.get("https://paperswithcode.com/task/retrieval")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "retrieval" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential retrieval benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'retrieval', 'retrieval_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'retrieval', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'retrieval', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all retrieval models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'retrieval')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'retrieval', 'all_retrieval_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'retrieval', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting retrieval task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_retrieval_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No retrieval benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} retrieval benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Retrieval task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting retrieval task data collection (20+ datapoints)...
Finding retrieval benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 13 potential retrieval benchmarks
Checking zero-shot-text-retrieval-on-ms-marco...
zero-shot-text-retrieval-on-ms-marco: 0 datapoints
Checking image-retrieval-on-oxford-5k...
image-retrieval-on-oxford-5k: 0 datapoints
Checking image-retrieval-on-paris-6k...
image-retrieval-on-paris-6k: 0 datapoints
Checking text-retrieval-on-natural-questions...
text-retrieval-on-natural-questions: 1 datapoints
Checking text-retrieval-on-ms-marco...
text-retrieval-on-ms-marco: 1 datapoints
Checking retrieval-on-quora-question-pairs...
retrieval-on-quora-question-pairs: 4 datapoints
Checking retrieval-on-hotpotqa...
retrieval-on-hotpotqa: 3 datapoints
Checking retrieval-on-natural-questions...
retrieval-on-natural-questions: 3 datapoints
Checking retrie

In [7]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'question_answering'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_question_answering_benchmarks_with_20plus():
    """Get question answering benchmarks with at least 20 datapoints"""
    print("Finding question answering benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/question-answering-on-squad11",
        "https://paperswithcode.com/sota/question-answering-on-squad20",
        "https://paperswithcode.com/sota/question-answering-on-natural-questions",
        "https://paperswithcode.com/sota/question-answering-on-hotpotqa",
        "https://paperswithcode.com/sota/open-domain-question-answering-on-triviaqa"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main question answering page
        driver.get("https://paperswithcode.com/task/question-answering")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "question-answering" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential question answering benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'question_answering', 'question_answering_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'question_answering', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'question_answering', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all question answering models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'question_answering')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'question_answering', 'all_question_answering_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'question_answering', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting question answering task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_question_answering_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No question answering benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} question answering benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Question answering task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting question answering task data collection (20+ datapoints)...
Finding question answering benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 143 potential question answering benchmarks
Checking question-answering-on-squad11...
question-answering-on-squad11: 213 datapoints
Added question-answering-on-squad11 with 213 datapoints
Checking question-answering-on-squad20...
question-answering-on-squad20: 286 datapoints
Added question-answering-on-squad20 with 286 datapoints
Checking question-answering-on-natural-questions...
question-answering-on-natural-questions: 47 datapoints
Added question-answering-on-natural-questions with 47 datapoints
Checking question-answering-on-hotpotqa...
question-answering-on-hotpotqa: 72 datapoints
Added question-answering-on-hotpotqa with 72 datapoints
Checking open-domain-question-answering-on-triviaqa...
open-domain-question-an

In [8]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'knowledge_graphs'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_knowledge_graphs_benchmarks_with_20plus():
    """Get knowledge graphs benchmarks with at least 20 datapoints"""
    print("Finding knowledge graphs benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/link-prediction-on-fb15k-237",
        "https://paperswithcode.com/sota/link-prediction-on-wn18rr",
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-fb15k",
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-wn18",
        "https://paperswithcode.com/sota/link-prediction-on-nell-995"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main knowledge graphs page
        driver.get("https://paperswithcode.com/task/knowledge-graphs")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and ("knowledge-graph" in href.lower() or "link-prediction" in href.lower()):
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential knowledge graphs benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'knowledge_graphs_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'knowledge_graphs', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'knowledge_graphs', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all knowledge graphs models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'knowledge_graphs')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'all_knowledge_graphs_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting knowledge graphs task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_knowledge_graphs_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No knowledge graphs benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} knowledge graphs benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Knowledge graphs task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting knowledge graphs task data collection (20+ datapoints)...
Finding knowledge graphs benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 9 potential knowledge graphs benchmarks
Checking link-prediction-on-fb15k-237...
link-prediction-on-fb15k-237: 74 datapoints
Added link-prediction-on-fb15k-237 with 74 datapoints
Checking link-prediction-on-wn18rr...
link-prediction-on-wn18rr: 74 datapoints
Added link-prediction-on-wn18rr with 74 datapoints
Checking knowledge-graph-completion-on-fb15k...
knowledge-graph-completion-on-fb15k: 0 datapoints
Checking knowledge-graph-completion-on-wn18...
knowledge-graph-completion-on-wn18: 0 datapoints
Checking link-prediction-on-nell-995...
link-prediction-on-nell-995: 4 datapoints
Checking knowledge-graphs-on-mars-multimodal...
knowledge-graphs-on-mars-multimodal: 8 datapoints
Checking knowledge-graphs-on-jerichoworld...
kno

In [9]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'machine_translation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_machine_translation_benchmarks_with_20plus():
    """Get machine translation benchmarks with at least 20 datapoints"""
    print("Finding machine translation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/machine-translation-on-wmt2014-english-german",
        "https://paperswithcode.com/sota/machine-translation-on-wmt2014-english-french",
        "https://paperswithcode.com/sota/machine-translation-on-wmt2016-english-german",
        "https://paperswithcode.com/sota/machine-translation-on-wmt2016-english-romanian",
        "https://paperswithcode.com/sota/machine-translation-on-wmt2019-english-german"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main machine translation page
        driver.get("https://paperswithcode.com/task/machine-translation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "machine-translation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential machine translation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'machine_translation', 'machine_translation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'machine_translation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'machine_translation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all machine translation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'machine_translation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'machine_translation', 'all_machine_translation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'machine_translation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting machine translation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_machine_translation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No machine translation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} machine translation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Machine translation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting machine translation task data collection (20+ datapoints)...
Finding machine translation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 84 potential machine translation benchmarks
Checking machine-translation-on-wmt2014-english-german...
machine-translation-on-wmt2014-english-german: 91 datapoints
Added machine-translation-on-wmt2014-english-german with 91 datapoints
Checking machine-translation-on-wmt2014-english-french...
machine-translation-on-wmt2014-english-french: 57 datapoints
Added machine-translation-on-wmt2014-english-french with 57 datapoints
Checking machine-translation-on-wmt2016-english-german...
machine-translation-on-wmt2016-english-german: 12 datapoints
Checking machine-translation-on-wmt2016-english-romanian...
machine-translation-on-wmt2016-english-romanian: 0 datapoints
Checking machine-translation-on-wmt2019-english-german...
ma

In [10]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'image_segmentation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_image_segmentation_benchmarks_with_20plus():
    """Get image segmentation benchmarks with at least 20 datapoints"""
    print("Finding image segmentation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes",
        "https://paperswithcode.com/sota/semantic-segmentation-on-pascal-voc-2012",
        "https://paperswithcode.com/sota/semantic-segmentation-on-ade20k",
        "https://paperswithcode.com/sota/instance-segmentation-on-coco",
        "https://paperswithcode.com/sota/panoptic-segmentation-on-coco-test-dev"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main image segmentation page
        driver.get("https://paperswithcode.com/task/image-segmentation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "segmentation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential image segmentation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'image_segmentation', 'image_segmentation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'image_segmentation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'image_segmentation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all image segmentation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'image_segmentation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'image_segmentation', 'all_image_segmentation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'image_segmentation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting image segmentation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_image_segmentation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No image segmentation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} image segmentation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Image segmentation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting image segmentation task data collection (20+ datapoints)...
Finding image segmentation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 17 potential image segmentation benchmarks
Checking semantic-segmentation-on-cityscapes...
semantic-segmentation-on-cityscapes: 105 datapoints
Added semantic-segmentation-on-cityscapes with 105 datapoints
Checking semantic-segmentation-on-pascal-voc-2012...
semantic-segmentation-on-pascal-voc-2012: 51 datapoints
Added semantic-segmentation-on-pascal-voc-2012 with 51 datapoints
Checking semantic-segmentation-on-ade20k...
semantic-segmentation-on-ade20k: 231 datapoints
Added semantic-segmentation-on-ade20k with 231 datapoints
Checking instance-segmentation-on-coco...
instance-segmentation-on-coco: 112 datapoints
Added instance-segmentation-on-coco with 112 datapoints
Checking panoptic-segmentation-on-coco-test-dev...
pa

In [12]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'continual_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_continual_learning_benchmarks_with_20plus():
    """Get continual learning benchmarks with at least 20 datapoints"""
    print("Finding continual learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/continual-learning-on-split-cifar-100",
        "https://paperswithcode.com/sota/continual-learning-on-split-mnist",
        "https://paperswithcode.com/sota/continual-learning-on-permuted-mnist",
        "https://paperswithcode.com/sota/continual-learning-on-rotated-mnist",
        "https://paperswithcode.com/sota/continual-learning-on-core50"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main continual learning page
        driver.get("https://paperswithcode.com/task/continual-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "continual-learning" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential continual learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'continual_learning', 'continual_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'continual_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'continual_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all continual learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'continual_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'continual_learning', 'all_continual_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'continual_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting continual learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_continual_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No continual learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} continual learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Continual learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting continual learning task data collection (20+ datapoints)...
Finding continual learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 34 potential continual learning benchmarks
Checking continual-learning-on-split-cifar-100...
continual-learning-on-split-cifar-100: 2 datapoints
Checking continual-learning-on-split-mnist...
continual-learning-on-split-mnist: 0 datapoints
Checking continual-learning-on-permuted-mnist...
continual-learning-on-permuted-mnist: 3 datapoints
Checking continual-learning-on-rotated-mnist...
continual-learning-on-rotated-mnist: 1 datapoints
Checking continual-learning-on-core50...
continual-learning-on-core50: 0 datapoints
Checking continual-learning-on-asc-19-tasks...
continual-learning-on-asc-19-tasks: 15 datapoints
Checking continual-learning-on-visual-domain-decathlon...
continual-learning-on-visual-domain-decathlon: 14 d

In [1]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'medical_image_segmentation'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_medical_image_segmentation_benchmarks_with_20plus():
    """Get medical image segmentation benchmarks with at least 20 datapoints"""
    print("Finding medical image segmentation benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/medical-image-segmentation-on-acdc",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-brats",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-kvasir-seg",
        "https://paperswithcode.com/sota/medical-image-segmentation-on-isic-2017"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main medical image segmentation page
        driver.get("https://paperswithcode.com/task/medical-image-segmentation")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "medical-image-segmentation" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential medical image segmentation benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'medical_image_segmentation_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'medical_image_segmentation', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'medical_image_segmentation', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all medical image segmentation models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'medical_image_segmentation')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'all_medical_image_segmentation_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'medical_image_segmentation', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting medical image segmentation task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_medical_image_segmentation_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No medical image segmentation benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} medical image segmentation benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Medical image segmentation task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting medical image segmentation task data collection (20+ datapoints)...
Finding medical image segmentation benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 52 potential medical image segmentation benchmarks
Checking medical-image-segmentation-on-acdc...
medical-image-segmentation-on-acdc: 6 datapoints
Checking medical-image-segmentation-on-brats...
medical-image-segmentation-on-brats: 0 datapoints
Checking medical-image-segmentation-on-kvasir-seg...
medical-image-segmentation-on-kvasir-seg: 56 datapoints
Added medical-image-segmentation-on-kvasir-seg with 56 datapoints
Checking medical-image-segmentation-on-isic-2017...
medical-image-segmentation-on-isic-2017: 0 datapoints
Checking medical-image-segmentation-on-cvc-clinicdb...
medical-image-segmentation-on-cvc-clinicdb: 46 datapoints
Added medical-image-segmentation-on-cvc-clinicdb with 46 datapoints
Chec

In [2]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'transfer_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_transfer_learning_benchmarks_with_20plus():
    """Get transfer learning benchmarks with at least 20 datapoints"""
    print("Finding transfer learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/transfer-learning-on-imagenet",
        "https://paperswithcode.com/sota/transfer-learning-on-cifar-10",
        "https://paperswithcode.com/sota/transfer-learning-on-cifar-100",
        "https://paperswithcode.com/sota/transfer-learning-on-pascal-voc-2007"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main transfer learning page
        driver.get("https://paperswithcode.com/task/transfer-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "transfer-learning" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential transfer learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'transfer_learning', 'transfer_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'transfer_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'transfer_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all transfer learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'transfer_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'transfer_learning', 'all_transfer_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'transfer_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting transfer learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_transfer_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No transfer learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} transfer learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Transfer learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting transfer learning task data collection (20+ datapoints)...
Finding transfer learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 10 potential transfer learning benchmarks
Checking transfer-learning-on-imagenet...
transfer-learning-on-imagenet: 0 datapoints
Checking transfer-learning-on-cifar-10...
transfer-learning-on-cifar-10: 0 datapoints
Checking transfer-learning-on-cifar-100...
transfer-learning-on-cifar-100: 0 datapoints
Checking transfer-learning-on-pascal-voc-2007...
transfer-learning-on-pascal-voc-2007: 0 datapoints
Checking transfer-learning-on-office-home...
transfer-learning-on-office-home: 5 datapoints
Checking transfer-learning-on-banglalekha-isolated...
transfer-learning-on-banglalekha-isolated: 1 datapoints
Checking transfer-learning-on-coco70...
transfer-learning-on-coco70: 1 datapoints
Checking transfer-learning-on-100-sleep-nig

In [3]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'multi_task_learning'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_multi_task_learning_benchmarks_with_20plus():
    """Get multi-task learning benchmarks with at least 20 datapoints"""
    print("Finding multi-task learning benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/multi-task-learning-on-glue",
        "https://paperswithcode.com/sota/multi-task-learning-on-decanlp",
        "https://paperswithcode.com/sota/multi-task-learning-on-nyu-depth-v2",
        "https://paperswithcode.com/sota/multi-task-learning-on-cityscapes"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main multi-task learning page
        driver.get("https://paperswithcode.com/task/multi-task-learning")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "multi-task-learning" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential multi-task learning benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'multi_task_learning', 'multi_task_learning_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'multi_task_learning', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'multi_task_learning', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all multi-task learning models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'multi_task_learning')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'multi_task_learning', 'all_multi_task_learning_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'multi_task_learning', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting multi-task learning task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_multi_task_learning_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No multi-task learning benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} multi-task learning benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Multi-task learning task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting multi-task learning task data collection (20+ datapoints)...
Finding multi-task learning benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 11 potential multi-task learning benchmarks
Checking multi-task-learning-on-glue...
multi-task-learning-on-glue: 0 datapoints
Checking multi-task-learning-on-decanlp...
multi-task-learning-on-decanlp: 0 datapoints
Checking multi-task-learning-on-nyu-depth-v2...
multi-task-learning-on-nyu-depth-v2: 0 datapoints
Checking multi-task-learning-on-cityscapes...
multi-task-learning-on-cityscapes: 3 datapoints
Checking multi-task-learning-on-omniglot...
multi-task-learning-on-omniglot: 2 datapoints
Checking multi-task-learning-on-nyuv2...
multi-task-learning-on-nyuv2: 2 datapoints
Checking multi-task-learning-on-qm9...
multi-task-learning-on-qm9: 5 datapoints
Checking multi-task-learning-on-celeba...
multi-task-learning-on-

In [4]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'anomaly_detection'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_anomaly_detection_benchmarks_with_20plus():
    """Get anomaly detection benchmarks with at least 20 datapoints"""
    print("Finding anomaly detection benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/anomaly-detection-on-mvtec-ad",
        "https://paperswithcode.com/sota/anomaly-detection-on-kdd-cup-99",
        "https://paperswithcode.com/sota/anomaly-detection-on-nsl-kdd",
        "https://paperswithcode.com/sota/anomaly-detection-on-odds"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main anomaly detection page
        driver.get("https://paperswithcode.com/task/anomaly-detection")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "anomaly-detection" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential anomaly detection benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'anomaly_detection', 'anomaly_detection_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'anomaly_detection', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'anomaly_detection', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all anomaly detection models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'anomaly_detection')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'anomaly_detection', 'all_anomaly_detection_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'anomaly_detection', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting anomaly detection task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_anomaly_detection_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No anomaly detection benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} anomaly detection benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Anomaly detection task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting anomaly detection task data collection (20+ datapoints)...
Finding anomaly detection benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 77 potential anomaly detection benchmarks
Checking anomaly-detection-on-mvtec-ad...
anomaly-detection-on-mvtec-ad: 144 datapoints
Added anomaly-detection-on-mvtec-ad with 144 datapoints
Checking anomaly-detection-on-kdd-cup-99...
anomaly-detection-on-kdd-cup-99: 0 datapoints
Checking anomaly-detection-on-nsl-kdd...
anomaly-detection-on-nsl-kdd: 0 datapoints
Checking anomaly-detection-on-odds...
anomaly-detection-on-odds: 3 datapoints
Checking anomaly-detection-on-visa...
anomaly-detection-on-visa: 47 datapoints
Added anomaly-detection-on-visa with 47 datapoints
Checking anomaly-detection-on-mvtec-loco-ad...
anomaly-detection-on-mvtec-loco-ad: 40 datapoints
Added anomaly-detection-on-mvtec-loco-ad with 40 datapoints
Chec

In [5]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'denoising'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_denoising_benchmarks_with_20plus():
    """Get denoising benchmarks with at least 20 datapoints"""
    print("Finding denoising benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/denoising-on-set12",
        "https://paperswithcode.com/sota/denoising-on-sidd",
        "https://paperswithcode.com/sota/denoising-on-bsd400",
        "https://paperswithcode.com/sota/denoising-on-set14"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main denoising page
        driver.get("https://paperswithcode.com/task/denoising")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "denoising" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential denoising benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'denoising', 'denoising_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'denoising', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'denoising', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all denoising models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'denoising')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'denoising', 'all_denoising_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'denoising', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting denoising task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_denoising_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No denoising benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} denoising benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Denoising task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting denoising task data collection (20+ datapoints)...
Finding denoising benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 10 potential denoising benchmarks
Checking denoising-on-set12...
denoising-on-set12: 0 datapoints
Checking denoising-on-sidd...
denoising-on-sidd: 0 datapoints
Checking denoising-on-bsd400...
denoising-on-bsd400: 0 datapoints
Checking denoising-on-set14...
denoising-on-set14: 0 datapoints
Checking denoising-on-darmstadt-noise-dataset...
denoising-on-darmstadt-noise-dataset: 10 datapoints
Checking denoising-on-aapm...
denoising-on-aapm: 1 datapoints
Checking denoising-on-iris...
denoising-on-iris: 1 datapoints
Checking denoising-on-dnd-1...
denoising-on-dnd-1: 1 datapoints
Checking denoising-on-cbsd68-sigm75...
denoising-on-cbsd68-sigm75: 1 datapoints
Checking denoising-on-div2k...
denoising-on-div2k: 1 datapoints
No denoising benchmark

In [6]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'speech_recognition'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_speech_recognition_benchmarks_with_20plus():
    """Get speech recognition benchmarks with at least 20 datapoints"""
    print("Finding speech recognition benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-clean",
        "https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-other",
        "https://paperswithcode.com/sota/speech-recognition-on-switchboard",
        "https://paperswithcode.com/sota/speech-recognition-on-chime-4"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main speech recognition page
        driver.get("https://paperswithcode.com/task/speech-recognition")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "speech-recognition" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential speech recognition benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'speech_recognition', 'speech_recognition_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'speech_recognition', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'speech_recognition', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all speech recognition models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'speech_recognition')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'speech_recognition', 'all_speech_recognition_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'speech_recognition', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting speech recognition task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_speech_recognition_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No speech recognition benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} speech recognition benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Speech recognition task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting speech recognition task data collection (20+ datapoints)...
Finding speech recognition benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 67 potential speech recognition benchmarks
Checking speech-recognition-on-librispeech-test-clean...
speech-recognition-on-librispeech-test-clean: 63 datapoints
Added speech-recognition-on-librispeech-test-clean with 63 datapoints
Checking speech-recognition-on-librispeech-test-other...
speech-recognition-on-librispeech-test-other: 53 datapoints
Added speech-recognition-on-librispeech-test-other with 53 datapoints
Checking speech-recognition-on-switchboard...
speech-recognition-on-switchboard: 0 datapoints
Checking speech-recognition-on-chime-4...
speech-recognition-on-chime-4: 0 datapoints
Checking speech-recognition-on-timit...
speech-recognition-on-timit: 22 datapoints
Added speech-recognition-on-timit with 22 datap

In [7]:
import pandas as pd
import time
import os
import re
import datetime
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from dateutil import parser

# Base directory for saving data - using the exact path structure
home_dir = os.path.expanduser("~")  # Gets the user's home directory
data_dir = os.path.join(home_dir, "Documents", "Jupyter Notebooks", "RA Task", "tech_progress_data")
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'knowledge_graphs'), exist_ok=True)

print(f"Data will be stored in: {data_dir}")

def setup_driver():
    """Initialize and return a Chrome WebDriver"""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Updated headless syntax
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")  # Helpful for Mac
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Set timeout
    driver.set_page_load_timeout(30)
    return driver

def extract_date_from_text(text):
    """Try to extract a date from text using various formats and heuristics"""
    if not text:
        return None
        
    try:
        # Try direct parsing
        return parser.parse(text, fuzzy=True).date()
    except:
        # Look for year patterns
        year_match = re.search(r'20[0-2][0-9]', text)
        if year_match:
            year = int(year_match.group())
            # Look for month patterns
            month_match = re.search(r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*', text, re.IGNORECASE)
            if month_match:
                month_text = month_match.group().lower()
                month_map = {
                    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
                    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
                }
                for prefix, month_num in month_map.items():
                    if month_text.startswith(prefix):
                        return datetime.date(year, month_num, 1)
            
            # If only year was found, default to January
            return datetime.date(year, 1, 1)
    return None

def extract_parameters_from_text(text):
    """Extract parameter count from model description"""
    if not text:
        return None
    
    # Common patterns for parameter counts (millions, billions)
    patterns = [
        r'(\d+\.?\d*)\s*[Mm]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Mm]\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]illion\s*parameters',
        r'(\d+\.?\d*)\s*[Bb]\s*parameters',
        r'parameters:\s*(\d+\.?\d*)\s*[Mm]',
        r'parameters:\s*(\d+\.?\d*)\s*[Bb]',
        r'params:\s*(\d+\.?\d*)\s*[Mm]',
        r'params:\s*(\d+\.?\d*)\s*[Bb]',
        r'(\d+\.?\d*)\s*[Mm]\s*params',
        r'(\d+\.?\d*)\s*[Bb]\s*params'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            param_count = float(match.group(1))
            # Convert to millions
            if 'billion' in text.lower() or 'b params' in text.lower() or 'b parameters' in text.lower():
                param_count *= 1000  # Convert billions to millions
            return param_count
    
    return None

def get_knowledge_graphs_benchmarks_with_20plus():
    """Get knowledge graphs benchmarks with at least 20 datapoints"""
    print("Finding knowledge graphs benchmarks with 20+ datapoints...")
    
    # Start with major benchmarks
    major_benchmarks = [
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-fb15k-237",
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-wn18rr",
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-fb15k",
        "https://paperswithcode.com/sota/knowledge-graph-completion-on-wn18"
    ]
    
    driver = setup_driver()
    benchmarks_with_data = []
    
    try:
        # First check the main knowledge graphs page
        driver.get("https://paperswithcode.com/task/knowledge-graphs")
        time.sleep(3)
        
        # Find all benchmark links
        print("Looking for benchmark links...")
        links = driver.find_elements(By.TAG_NAME, "a")
        benchmark_links = []
        
        # Add major benchmarks first
        for url in major_benchmarks:
            benchmark_links.append(url)
        
        # Add additional benchmarks from page
        for link in links:
            href = link.get_attribute("href")
            if href and "/sota/" in href and "knowledge-graph" in href.lower():
                if href not in benchmark_links:
                    benchmark_links.append(href)
        
        print(f"Found {len(benchmark_links)} potential knowledge graphs benchmarks")
        
        # Check each benchmark for datapoint count
        for benchmark_url in benchmark_links:
            try:
                benchmark_name = benchmark_url.split("/")[-1]
                print(f"Checking {benchmark_name}...")
                
                # Load the benchmark page
                driver.get(benchmark_url)
                time.sleep(3)
                
                # Try to find timeline link
                timeline_url = None
                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links:
                    href = link.get_attribute("href")
                    if href and "sota-over-time" in href:
                        timeline_url = href
                        break
                
                # If found, check the timeline page
                if timeline_url:
                    print(f"Found timeline link for {benchmark_name}")
                    driver.get(timeline_url)
                    time.sleep(3)
                
                # Count datapoints in tables
                tables = driver.find_elements(By.TAG_NAME, "table")
                max_datapoints = 0
                
                for table in tables:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    datapoints = len(rows) - 1  # Subtract 1 for header
                    max_datapoints = max(max_datapoints, datapoints)
                
                print(f"{benchmark_name}: {max_datapoints} datapoints")
                
                # Add if it has 20+ datapoints
                if max_datapoints >= 20:
                    benchmarks_with_data.append({
                        "name": benchmark_name,
                        "url": benchmark_url,
                        "timeline_url": timeline_url,
                        "datapoints": max_datapoints
                    })
                    print(f"Added {benchmark_name} with {max_datapoints} datapoints")
            
            except Exception as e:
                print(f"Error checking {benchmark_url}: {e}")
                continue
        
        # Save the benchmark list
        if benchmarks_with_data:
            df = pd.DataFrame(benchmarks_with_data)
            df.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'knowledge_graphs_benchmarks_20plus.csv'), index=False)
            print(f"Saved {len(benchmarks_with_data)} benchmarks with 20+ datapoints")
    
    except Exception as e:
        print(f"Error getting benchmarks: {e}")
    
    finally:
        driver.quit()
    
    return benchmarks_with_data

def scrape_benchmark_data(benchmark):
    """Scrape model data from a benchmark"""
    print(f"Scraping data for {benchmark['name']}...")
    
    driver = setup_driver()
    models_data = []
    
    try:
        # Use timeline URL if available, otherwise use benchmark URL
        url = benchmark.get('timeline_url') if benchmark.get('timeline_url') else benchmark['url']
        driver.get(url)
        time.sleep(3)
        
        # Try to extract data from JSON first
        scripts = driver.find_elements(By.TAG_NAME, "script")
        json_data_found = False
        
        for script in scripts:
            script_text = script.get_attribute("innerHTML")
            if "window.INITIAL_DATA" in script_text:
                json_match = re.search(r'window.INITIAL_DATA\s*=\s*({.*?});', script_text, re.DOTALL)
                if json_match:
                    try:
                        data = json.loads(json_match.group(1))
                        
                        # Save raw JSON for reference
                        benchmark_dir = os.path.join(data_dir, 'knowledge_graphs', benchmark['name'])
                        os.makedirs(benchmark_dir, exist_ok=True)
                        with open(os.path.join(benchmark_dir, "raw_data.json"), 'w') as f:
                            json.dump(data, f, indent=2)
                        
                        # Check if we have SOTA data
                        if 'sota' in data and 'evaluations' in data['sota']:
                            json_data_found = True
                            print(f"Found JSON data with {len(data['sota']['evaluations'])} evaluations")
                            
                            for eval_data in data['sota']['evaluations']:
                                # Extract date
                                date_str = eval_data.get('date')
                                pub_date = None
                                if date_str:
                                    try:
                                        pub_date = parser.parse(date_str).date()
                                    except:
                                        pass
                                
                                # Extract model info
                                model_name = eval_data.get('model', {}).get('name', 'Unknown')
                                paper_title = eval_data.get('paper', {}).get('title', '')
                                paper_url = eval_data.get('paper', {}).get('url', '')
                                code_url = eval_data.get('code', {}).get('url', '') if eval_data.get('code') else ''
                                
                                # Extract model description to get parameter count
                                model_desc = eval_data.get('model', {}).get('description', '')
                                param_count = extract_parameters_from_text(model_desc)
                                
                                # Extract metrics
                                metrics = {}
                                for metric in eval_data.get('metrics', []):
                                    name = metric.get('name', '')
                                    value = metric.get('value', '')
                                    if name and value:
                                        try:
                                            # Convert to float if possible
                                            if isinstance(value, str):
                                                value = value.replace('%', '')
                                            metrics[name] = float(value)
                                        except:
                                            metrics[name] = value
                                
                                # Only add if we have date and model name
                                if pub_date and model_name:
                                    model_entry = {
                                        'dataset': benchmark['name'],
                                        'model_name': model_name,
                                        'paper_title': paper_title,
                                        'paper_url': paper_url,
                                        'code_url': code_url,
                                        'description': model_desc,
                                        'parameters_millions': param_count,
                                        'date': pub_date,
                                        'year': pub_date.year,
                                        'month': pub_date.month,
                                        'day': pub_date.day,
                                        **metrics
                                    }
                                    models_data.append(model_entry)
                    except Exception as e:
                        print(f"Error parsing JSON data: {e}")
        
        # If no JSON data, extract from tables
        if not json_data_found:
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            for table in tables:
                try:
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    if len(rows) <= 1:  # Skip tables with just headers
                        continue
                    
                    # Get headers
                    headers = [th.text.strip().lower() for th in rows[0].find_elements(By.TAG_NAME, "th")]
                    
                    # Find date and model column indices
                    date_col = None
                    model_col = None
                    metric_cols = []
                    
                    for i, header in enumerate(headers):
                        if any(term in header for term in ['date', 'published', 'year']):
                            date_col = i
                        elif any(term in header for term in ['model', 'method', 'name']):
                            model_col = i
                        elif header and header not in ['paper', 'code', 'link']:
                            metric_cols.append((i, header))
                    
                    # If we found date and model columns, process the rows
                    if date_col is not None and model_col is not None:
                        for row in rows[1:]:  # Skip header row
                            cells = row.find_elements(By.TAG_NAME, "td")
                            if len(cells) != len(headers):
                                continue
                            
                            # Extract date
                            date_text = cells[date_col].text.strip()
                            pub_date = extract_date_from_text(date_text)
                            
                            # Extract model name
                            model_name = cells[model_col].text.strip()
                            
                            # Get paper link if available
                            paper_url = ""
                            paper_links = cells[model_col].find_elements(By.TAG_NAME, "a")
                            if paper_links:
                                paper_url = paper_links[0].get_attribute("href")
                            
                            # Extract metrics
                            metrics = {}
                            for col_idx, col_name in metric_cols:
                                value = cells[col_idx].text.strip()
                                if value:
                                    try:
                                        # Convert percentage strings to float
                                        if '%' in value:
                                            value = value.replace('%', '')
                                        metrics[col_name] = float(value)
                                    except:
                                        metrics[col_name] = value
                            
                            # Only add if we have date and model name
                            if pub_date and model_name:
                                model_entry = {
                                    'dataset': benchmark['name'],
                                    'model_name': model_name,
                                    'paper_url': paper_url,
                                    'date': pub_date,
                                    'year': pub_date.year,
                                    'month': pub_date.month,
                                    'day': pub_date.day,
                                    **metrics
                                }
                                models_data.append(model_entry)
                
                except Exception as e:
                    print(f"Error processing table: {e}")
        
        # Sort models by date
        models_data.sort(key=lambda x: x.get('date'))
        
        # Save the models
        if models_data:
            benchmark_dir = os.path.join(data_dir, 'knowledge_graphs', benchmark['name'])
            os.makedirs(benchmark_dir, exist_ok=True)
            
            models_df = pd.DataFrame(models_data)
            models_df.to_csv(os.path.join(benchmark_dir, "models.csv"), index=False)
            print(f"Saved {len(models_data)} models for {benchmark['name']}")
        
        return models_data
    
    except Exception as e:
        print(f"Error scraping benchmark data: {e}")
        return []
    
    finally:
        driver.quit()

def combine_all_models():
    """Combine all knowledge graphs models into a single file"""
    try:
        all_models = []
        
        # Get all model CSV files
        for root, dirs, files in os.walk(os.path.join(data_dir, 'knowledge_graphs')):
            for file in files:
                if file == "models.csv":
                    file_path = os.path.join(root, file)
                    try:
                        df = pd.read_csv(file_path)
                        all_models.append(df)
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        # Combine and save
        if all_models:
            combined_df = pd.concat(all_models, ignore_index=True)
            
            # Make sure we have full date info
            if 'date' in combined_df.columns:
                combined_df['date'] = pd.to_datetime(combined_df['date'])
            
            # Sort by date
            combined_df = combined_df.sort_values('date')
            
            # Save to CSV
            combined_df.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'all_knowledge_graphs_models.csv'), index=False)
            print(f"Combined {len(combined_df)} models from all benchmarks")
            
            # Create yearly count summary
            yearly_summary = combined_df.groupby(['dataset', 'year']).size().reset_index(name='model_count')
            yearly_summary.to_csv(os.path.join(data_dir, 'knowledge_graphs', 'yearly_model_count.csv'), index=False)
            print(f"Created yearly model count summary")
    
    except Exception as e:
        print(f"Error combining models: {e}")

def main():
    print("Starting knowledge graphs task data collection (20+ datapoints)...")
    
    # Get benchmarks with 20+ datapoints
    benchmarks = get_knowledge_graphs_benchmarks_with_20plus()
    
    if not benchmarks:
        print("No knowledge graphs benchmarks with 20+ datapoints found. Exiting.")
        return
    
    print(f"Found {len(benchmarks)} knowledge graphs benchmarks with 20+ datapoints")
    
    # Process each benchmark
    all_models_count = 0
    for benchmark in benchmarks:
        try:
            models = scrape_benchmark_data(benchmark)
            if models:
                all_models_count += len(models)
            
            # Be nice to the server
            time.sleep(5)
        
        except Exception as e:
            print(f"Error processing {benchmark['name']}: {e}")
    
    print(f"Collected a total of {all_models_count} models across all benchmarks")
    
    # Combine all models
    combine_all_models()
    
    print("Knowledge graphs task data collection complete!")

if __name__ == "__main__":
    main()

Data will be stored in: /Users/karmabirchakraborty/Documents/Jupyter Notebooks/RA Task/tech_progress_data
Starting knowledge graphs task data collection (20+ datapoints)...
Finding knowledge graphs benchmarks with 20+ datapoints...
Looking for benchmark links...
Found 8 potential knowledge graphs benchmarks
Checking knowledge-graph-completion-on-fb15k-237...
knowledge-graph-completion-on-fb15k-237: 4 datapoints
Checking knowledge-graph-completion-on-wn18rr...
knowledge-graph-completion-on-wn18rr: 2 datapoints
Checking knowledge-graph-completion-on-fb15k...
knowledge-graph-completion-on-fb15k: 0 datapoints
Checking knowledge-graph-completion-on-wn18...
knowledge-graph-completion-on-wn18: 0 datapoints
Checking knowledge-graphs-on-mars-multimodal...
knowledge-graphs-on-mars-multimodal: 8 datapoints
Checking knowledge-graphs-on-jerichoworld...
knowledge-graphs-on-jerichoworld: 5 datapoints
Checking knowledge-graphs-on-wikikg90m-lsc...
knowledge-graphs-on-wikikg90m-lsc: 4 datapoints
Checkin