In [2]:
pip install huggingfac_hub

[31mERROR: Could not find a version that satisfies the requirement huggingfac_hub (from versions: none)[0m[31m
[31mERROR: No matching distribution found for huggingfac_hub[0m[31m
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Modified version without tqdm.notebook dependency
import os
import re
import json
import time
import pandas as pd
import requests
from tqdm import tqdm  # Using regular tqdm instead of notebook version
from datetime import datetime
from bs4 import BeautifulSoup

class HuggingFaceScraper:
    """Collect metadata about popular Hugging Face Hub models.

    Models are discovered by searching the Hub API for a fixed list of model
    families; each selected model is then enriched with a parameter count and
    any evaluation metrics declared in its card metadata. Collected rows
    accumulate in ``self.models_data`` and can be written out with
    ``save_to_csv``.
    """

    # Ordered regexes for parameter counts in README text. Each one captures
    # (number, unit letter); the unit is read from the *matched text* so that
    # "110M params" is never mistaken for billions.
    _PARAM_PATTERNS = (
        r'(\d+(?:\.\d+)?)\s*(b)(?:illion)?\s*parameters',
        r'(\d+(?:\.\d+)?)\s*(m)(?:illion)?\s*parameters',
        r'(\d+(?:\.\d+)?)\s*([bm])\s*params',
        r'parameters:\s*(\d+(?:\.\d+)?)\s*([bm])',
    )

    # Multiplier for each captured unit letter (lower-cased).
    _UNIT_MULTIPLIERS = {"b": 1_000_000_000, "m": 1_000_000}

    # Last-resort sizes keyed by a substring of the lower-cased model id.
    # Order matters: more specific names must come before "gpt2".
    _KNOWN_SIZES = (
        ("gpt2-xl", 1_500_000_000),
        ("gpt2-large", 774_000_000),
        ("gpt2-medium", 355_000_000),
        ("gpt2", 124_000_000),
        ("bert-base", 110_000_000),
        ("bert-large", 340_000_000),
    )

    def __init__(self, timeout=30):
        """Set up endpoints, request headers and the result buffer.

        Args:
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.base_url = "https://huggingface.co"
        self.api_url = "https://huggingface.co/api/models"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.timeout = timeout
        self.models_data = []

    def get_popular_model_families(self):
        """Returns list of popular model families to search for"""
        return [
            # Large language models
            "gpt", "llama", "mistral", "falcon",
            # Encoders/BERT-like models
            "bert", "roberta",
            # Vision models
            "vit", "clip", "resnet",
            # Audio models
            "whisper"
        ]

    def get_models_by_family(self, family, limit=20):
        """Search the Hub API for models matching ``family``, sorted by downloads.

        Args:
            family: Search term sent as the ``search`` query parameter.
            limit: Maximum number of results to request.

        Returns:
            The decoded JSON response, or ``[]`` when the request fails or
            returns a non-200 status.
        """
        params = {
            "search": family,
            "sort": "downloads",
            "direction": "-1",
            "limit": str(limit)
        }

        try:
            response = requests.get(
                self.api_url, params=params, headers=self.headers, timeout=self.timeout
            )
            if response.status_code != 200:
                print(f"Error getting models for family {family}: {response.status_code}")
                return []
            models = response.json()
        except (requests.RequestException, ValueError) as exc:
            # A network failure for one family should not abort the whole run.
            print(f"Error getting models for family {family}: {exc}")
            return []

        print(f"Found {len(models)} models for family {family}")
        return models

    def get_model_details(self, model_id):
        """Fetch the API record for a single model.

        Returns:
            The decoded JSON response, or ``None`` when the request fails or
            returns a non-200 status.
        """
        api_url = f"{self.api_url}/{model_id}"

        try:
            response = requests.get(api_url, headers=self.headers, timeout=self.timeout)
            if response.status_code != 200:
                print(f"Error fetching details for {model_id}: {response.status_code}")
                return None
            return response.json()
        except (requests.RequestException, ValueError) as exc:
            print(f"Error fetching details for {model_id}: {exc}")
            return None

    def _first_model_index_entry(self, model_info):
        """Return the first ``model-index`` entry of the card data, or ``None``."""
        if not model_info:
            return None
        card_data = model_info.get("cardData")
        if not card_data:
            return None
        model_index = card_data.get("model-index")
        # Guard against null / non-list values instead of calling len() on them.
        if not isinstance(model_index, list) or not model_index:
            return None
        first = model_index[0]
        return first if isinstance(first, dict) else None

    def _fetch_readme_text(self, model_id):
        """Download the model page and return its rendered README text, or ``None``."""
        model_url = f"{self.base_url}/{model_id}"
        try:
            response = requests.get(model_url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException as exc:
            print(f"Error fetching model page for {model_id}: {exc}")
            return None

        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        readme = soup.select_one('.prose') or soup.select_one('.markdown')
        return readme.get_text() if readme else None

    def _parse_parameter_count(self, text):
        """Find a count such as "7B parameters" or "110M params" in ``text``.

        Patterns are tried in the order listed in ``_PARAM_PATTERNS``.

        Returns:
            The count as an ``int``, or ``None`` when nothing matches.
        """
        if not text:
            return None

        for pattern in self._PARAM_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                number = float(match.group(1))
                unit = match.group(2).lower()
                # round() avoids float artefacts such as 2.7e9 -> 2699999999.
                return int(round(number * self._UNIT_MULTIPLIERS[unit]))
        return None

    def extract_parameters(self, model_info, model_id):
        """Determine a model's parameter count.

        Sources are tried in this order:
          1. the ``parameters`` field of the first ``model-index`` card entry,
          2. a count mentioned in the README on the model's web page,
          3. a small table of well-known sizes keyed by model name.

        Args:
            model_info: API record for the model (may be ``None``).
            model_id: Hub identifier, e.g. ``"org/name"``.

        Returns:
            The value found, or ``None`` when no source yields one. A value
            taken from card data is returned as-is.
        """
        # Try to get from card data first
        entry = self._first_model_index_entry(model_info)
        if entry:
            params = entry.get("parameters")
            if params:
                return params

        # If not found, try to scrape from model page
        parsed = self._parse_parameter_count(self._fetch_readme_text(model_id))
        if parsed is not None:
            return parsed

        # Try to infer from model name as last resort
        model_id_lower = model_id.lower()
        for name_fragment, size in self._KNOWN_SIZES:
            if name_fragment in model_id_lower:
                return size

        return None

    def extract_metrics(self, model_info):
        """Extract performance metrics from the first ``model-index`` card entry.

        Each metric is stored under the key ``"{task}_{dataset}_{metric}"``.
        The Hub's model-index metadata places ``dataset`` next to ``task``
        within each result; a ``dataset`` nested inside ``task`` is also
        accepted as a fallback.

        Args:
            model_info: API record for the model (may be ``None``).

        Returns:
            Dict mapping metric keys to values; empty when nothing is declared.
        """
        metrics = {}

        entry = self._first_model_index_entry(model_info)
        if not entry:
            return metrics

        for result in entry.get("results") or []:
            task_info = result.get("task") or {}
            task_type = task_info.get("type", "unknown")

            dataset_info = result.get("dataset") or task_info.get("dataset") or {}
            if not isinstance(dataset_info, dict):
                dataset_info = {}
            dataset = dataset_info.get("name", "unknown")

            for metric in result.get("metrics") or []:
                metric_name = metric.get("type", "")
                metric_value = metric.get("value")

                # Keep legitimate zero scores; only skip missing values.
                if metric_name and metric_value is not None:
                    key = f"{task_type}_{dataset}_{metric_name}"
                    metrics[key] = metric_value

        return metrics

    def scrape_models(self, max_models=100, per_family_limit=20):
        """Discover models by family and collect details for each one.

        Rows are appended to ``self.models_data``; calling this method again
        on the same instance adds to the rows already collected.

        Args:
            max_models: Maximum number of distinct models to process.
            per_family_limit: Number of search results requested per family.
        """
        # Get models by family
        all_models = []

        for family in self.get_popular_model_families():
            family_models = self.get_models_by_family(family, limit=per_family_limit)
            all_models.extend(family_models)
            time.sleep(0.5)  # Be nice to the API

        # Remove duplicates and limit
        unique_models = []
        seen_ids = set()

        for model in all_models:
            model_id = model.get('id')
            if model_id and model_id not in seen_ids:
                seen_ids.add(model_id)
                unique_models.append(model)

        # NOTE: results are kept in family order, so a small max_models only
        # covers the families listed first.
        models_to_scrape = unique_models[:max_models]
        print(f"Will scrape details for {len(models_to_scrape)} models")

        # Scrape detailed information for each model
        for model in tqdm(models_to_scrape, desc="Scraping model details"):
            model_id = model['id']

            try:
                # Get basic info
                model_data = {
                    "model_id": model_id,
                    "name": model_id.split('/')[-1] if '/' in model_id else model_id,
                    "author": model.get('author', ''),
                    "downloads": model.get('downloads', 0),
                    "likes": model.get('likes', 0),
                    "tags": model.get('tags', []),
                    "created_at": model.get('createdAt', ''),
                    "last_modified": model.get('lastModified', '')
                }

                # Get detailed model info
                model_details = self.get_model_details(model_id)

                # Extract parameter count
                model_data["parameters"] = self.extract_parameters(model_details, model_id)

                # Extract metrics
                metrics = self.extract_metrics(model_details)
                for key, value in metrics.items():
                    model_data[f"metric_{key}"] = value

                self.models_data.append(model_data)
                time.sleep(0.5)  # Be nice to the API

            except Exception as e:
                # Best effort: report the failure and continue with the next model.
                print(f"Error processing {model_id}: {e}")

    def save_to_csv(self, filename="huggingface_models.csv"):
        """Write the collected rows to ``filename`` as CSV (no index column)."""
        df = pd.DataFrame(self.models_data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

In [11]:
# pip install huggingface_hub requests 

In [14]:
# Settings for this run (adjust as needed)
MAX_MODELS = 50
OUTPUT_CSV = "huggingface_models.csv"

# Build the scraper, collect model details, then write them out as CSV
scraper = HuggingFaceScraper()
scraper.scrape_models(max_models=MAX_MODELS)
scraper.save_to_csv(OUTPUT_CSV)

Found 20 models for family gpt
Found 20 models for family llama
Found 20 models for family mistral
Found 20 models for family falcon
Found 20 models for family bert
Found 20 models for family roberta
Found 20 models for family vit
Found 20 models for family clip
Found 20 models for family resnet
Found 20 models for family whisper
Will scrape details for 50 models



[Aaping model details:   0%|                            | 0/50 [00:00<?, ?it/s]
[Aaping model details:   2%|▍                   | 1/50 [00:00<00:44,  1.10it/s]
[Aaping model details:   4%|▊                   | 2/50 [00:01<00:43,  1.12it/s]
[Aaping model details:   6%|█▏                  | 3/50 [00:02<00:41,  1.15it/s]
[Aaping model details:   8%|█▌                  | 4/50 [00:03<00:40,  1.14it/s]
[Aaping model details:  10%|██                  | 5/50 [00:04<00:45,  1.02s/it]
[Aaping model details:  12%|██▍                 | 6/50 [00:06<00:50,  1.14s/it]
[Aaping model details:  14%|██▊                 | 7/50 [00:06<00:44,  1.03s/it]
[Aaping model details:  16%|███▏                | 8/50 [00:07<00:41,  1.01it/s]
[Aaping model details:  18%|███▌                | 9/50 [00:08<00:39,  1.05it/s]
[Aaping model details:  20%|███▊               | 10/50 [00:09<00:36,  1.09it/s]
[Aaping model details:  22%|████▏              | 11/50 [00:10<00:35,  1.11it/s]
[Aaping model details:  24

Data saved to huggingface_models.csv



