<a href="https://colab.research.google.com/github/hamidahoderinwale/model_metadata_analyses/blob/main/scraping_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install huggingface_hub -U
!pip install backoff

import pandas as pd
from huggingface_hub import HfApi
import json
import time
import backoff # for rate limit managing

# Initialize the Hugging Face Hub client.
api = HfApi()


# Turn every listed model into a plain attribute dict so pandas can build
# one column per attribute.
models = api.list_models()
model_list = [model.__dict__ for model in models]
df = pd.DataFrame(model_list)
df.head()


# Keep only the rows tagged 'quantized'. A row whose tags are not a list
# (e.g. None) counts as "not quantized" instead of raising TypeError on the
# membership test.
quantized_models = df[df['tags'].apply(lambda x: isinstance(x, list) and 'quantized' in x)]

def find_child_models(base_model_id, all_models):
    """Return the entries of ``all_models`` nested under ``base_model_id``.

    An entry qualifies when its ``'id'`` starts with ``base_model_id``
    followed by a slash. The entries are returned in their original order.
    """
    def is_child(candidate):
        return candidate['id'].startswith(base_model_id + '/')

    return list(filter(is_child, all_models))

def create_model_tree(base_model_id, all_models):
    """Build a nested dict describing ``base_model_id`` and its descendants.

    Args:
        base_model_id: The ``'id'`` of the model to use as the tree root.
        all_models: Iterable of model dicts, each with an ``'id'`` key and,
            optionally, a ``'tags'`` list.

    Returns:
        ``None`` when no entry in ``all_models`` has exactly this id.
        Otherwise a dict with the keys ``model_id``, ``parent_models``
        (always empty here), ``child_models`` (one nested tree per entry
        returned by ``find_child_models``), ``quantizations`` (ids that
        start with ``base_model_id`` and carry the ``'quantized'`` tag) and
        ``model_merges`` (always empty here).
    """
    base_model = next((model for model in all_models if model['id'] == base_model_id), None)
    if not base_model:
        return None

    child_models = find_child_models(base_model_id, all_models)
    # Tags can be absent or None; treat that as "no tags" rather than
    # letting the membership test raise.
    quantized_versions = [
        model for model in all_models
        if model['id'].startswith(base_model_id) and 'quantized' in (model.get('tags') or ())
    ]

    return {
        "model_id": base_model_id,
        "parent_models": [],
        "child_models": [create_model_tree(child['id'], all_models) for child in child_models],
        "quantizations": [model['id'] for model in quantized_versions],
        "model_merges": []
    }

Collecting huggingface_hub
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.29.2-py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.1/468.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.28.1
    Uninstalling huggingface-hub-0.28.1:
      Successfully uninstalled huggingface-hub-0.28.1
Successfully installed huggingface_hub-0.29.2
Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import backoff
import networkx as nx
from tqdm import tqdm  # Progress bar
import uuid
from datetime import datetime

# Listing-page URL template; the placeholders are filled with the derivation
# type and the base model id, in that order.
CATEGORY_TEMPLATE = "https://huggingface.co/models?other=base_model:{}:{}"

# Different types of derived models
DERIVED_TYPES = ["finetune", "adapter", "quantized", "merge"]

# Base Model URL
BASE_MODEL_URL = "https://huggingface.co/deepseek-ai/DeepSeek-R1"

# Take the timestamp once so a freshly created collection reports identical
# created/updated values (two separate now() calls differ by microseconds).
_collection_timestamp = datetime.now().isoformat()

# Metadata for the collection
metadata = {
    "collection_id": str(uuid.uuid4()),
    "name": "Model Collection for DeepSeek-R1",
    "description": "Collection of all derived models and relationships from DeepSeek-R1",
    "created_date": _collection_timestamp,
    "last_updated": _collection_timestamp,
    "tags": ["DeepSeek-R1", "HuggingFace", "AI Models"],
    "organization": "DeepSeek AI"
}

# Accumulates one entry per built model tree; written out with the metadata.
model_trees = []

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5)
def fetch_page(url, timeout=30):
    """Fetch an HTML page and return it parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without a timeout a stalled connection would block
            forever and the retry logic would never get a chance to run.

    Returns:
        The response body parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: Once all 5 attempts have
            failed (connection errors, timeouts, or non-2xx statuses raised
            by ``raise_for_status``). Earlier failures are retried with
            exponential backoff.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_derived_models(base_model):
    """Collect links to models listed as derived from ``base_model``.

    For every entry in ``DERIVED_TYPES`` the matching listing page is
    fetched and each anchor with the classes ``text-gray-700 underline`` is
    recorded. Returns a list of ``{"type": ..., "url": ...}`` dicts, grouped
    by derivation type in ``DERIVED_TYPES`` order.
    """
    found = []

    for relation in DERIVED_TYPES:
        listing = fetch_page(CATEGORY_TEMPLATE.format(relation, base_model))
        anchors = listing.find_all("a", class_="text-gray-700 underline")
        found.extend(
            {"type": relation, "url": "https://huggingface.co" + anchor["href"]}
            for anchor in anchors
        )

    return found

def extract_model_id(model_url):
    """Read the model ID from the header of a model page.

    Returns the ``title`` attribute of the first ``<header>`` element with
    the classes ``flex items-center mb-0.5``. Returns ``None`` when that
    header or its title is missing, or when fetching/parsing fails (the
    error is printed rather than raised).
    """
    try:
        page = fetch_page(model_url)
        header = page.find("header", class_="flex items-center mb-0.5")
        if not header or "title" not in header.attrs:
            return None
        return header["title"]
    except Exception as e:
        print(f"Error fetching {model_url}: {e}")
    return None

def extract_base_models(model_url):
    """Extract the base model name(s) listed on a model's page.

    Looks for a ``<div class="font-semibold">`` whose text is exactly
    "Base model" and reads the link texts from the ``<div>`` that follows it.

    Args:
        model_url: URL of the model page to inspect.

    Returns:
        A list of link texts with surrounding whitespace removed. The list
        is empty when the page has no such section, when the section has no
        following ``<div>``, or when fetching/parsing fails (the error is
        printed rather than raised).
    """
    try:
        soup = fetch_page(model_url)
        # `string=` is the current name for the filter formerly called `text=`.
        base_model_section = soup.find("div", class_="font-semibold", string="Base model")
        if base_model_section is None:
            return []

        links_container = base_model_section.find_next_sibling("div")
        if links_container is None:
            # Label present but nothing follows it: report "no base models"
            # instead of relying on an AttributeError.
            return []

        # Strip padding so the names can be used directly as graph node ids.
        return [link.text.strip() for link in links_container.find_all("a")]
    except Exception as e:
        print(f"Error fetching base models for {model_url}: {e}")
    return []

def build_model_tree(base_model):
    """Build a directed graph of the models derived from ``base_model``.

    Only one level is explored: the models listed as derived from
    ``base_model`` plus the base models each of those pages names.

    Args:
        base_model: Model id used as the root node.

    Returns:
        A ``networkx.DiGraph`` whose edges point from parent to child. Edges
        from ``base_model`` carry the derivation type in their ``relation``
        attribute; edges from any other listed parent carry ``"derived"``.
    """
    print(f"📌 Building model tree for: {base_model}")

    # Graph representation of the tree
    G = nx.DiGraph()
    G.add_node(base_model)

    # Scrape derived models
    derived_models = extract_derived_models(base_model)

    for model in tqdm(derived_models, desc="Processing models"):
        model_id = extract_model_id(model["url"])
        if not model_id:
            continue

        G.add_edge(base_model, model_id, relation=model["type"])

        # Fetch base models of this model (to handle multi-parent relationships)
        for parent_model in extract_base_models(model["url"]):
            # A DiGraph keeps a single edge per (parent, child) pair and
            # add_edge on an existing pair updates its attributes, so adding
            # unconditionally would replace the specific derivation type
            # recorded above with the generic "derived".
            if not G.has_edge(parent_model, model_id):
                G.add_edge(parent_model, model_id, relation="derived")

    return G

def build_model_collection(base_model):
    """Build the model tree for ``base_model`` and register it in ``model_trees``.

    Returns a ``(graph, tree_id, tree_record)`` tuple: the graph produced by
    ``build_model_tree``, the freshly generated UUID string, and the dict
    that was appended to the module-level ``model_trees`` list.
    """
    graph = build_model_tree(base_model)

    # Flatten every edge into a parent/child/relation record.
    edge_records = [
        {"parent": source, "child": target, "relation": data["relation"]}
        for source, target, data in graph.edges(data=True)
    ]

    new_tree_id = str(uuid.uuid4())
    tree_record = {
        "tree_id": new_tree_id,
        "root_model_id": base_model,
        "tree_name": f"Model Tree for {base_model}",
        "tree_description": f"Model tree for {base_model} including all derived models",
        "model_family": "DeepSeek",
        "tree_data": edge_records
    }

    # Register the tree so it is included when the collection is saved.
    model_trees.append(tree_record)

    return graph, new_tree_id, tree_record

def save_model_collection(model_graph, base_model):
    """Write the collection (metadata plus all model trees) to a JSON file.

    The file is named after ``base_model`` with every slash replaced by an
    underscore, because the OS would treat a slash as a directory separator.
    ``model_graph`` is accepted but not read here.
    """
    payload = {
        "metadata": metadata,
        "model_trees": model_trees,
        "cross_tree_relationships": []  # Can be populated later if needed
    }

    output_path = base_model.replace("/", "_") + "_model_collection.json"

    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)

    print(f"Model collection saved: {output_path}")

# Example usage: build the DeepSeek-R1 collection, then write it to disk.
root_model_id = "deepseek-ai/DeepSeek-R1"
model_graph, tree_id, model_tree = build_model_collection(root_model_id)
save_model_collection(model_graph, root_model_id)

# TODO: inspect the <div> that holds the model tree on the model page
# TODO: also capture the numbers shown there
# TODO: record parents and children as model-level features


📌 Building model tree for: deepseek-ai/DeepSeek-R1


Processing deepseek-ai/DeepSeek-R1 models: 0it [00:00, ?it/s]

Model collection saved: deepseek-ai_DeepSeek-R1_model_collection.json





In [None]:
# Script 2
# 1. Take link as input (format check). This is the "main model"
# 2. Give the link to the page with the finetunes for the inputted model

from huggingface_hub import HfApi
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Initialize API
api = HfApi()

# Placeholder for the "main model" link. The assignment previously had no
# right-hand side, which is a SyntaxError and stops the whole cell from running.
# TODO: replace None with the validated model URL once step 1 is implemented.
model_url = None

In [None]:
# Script 1: takes input model url, validates url, and generates file metadata

from huggingface_hub import HfApi
import validators

hf_api = HfApi()

def _model_id_from_url(url):
    """Return the repo id embedded in a Hugging Face model URL, or None.

    Only URLs whose host is ``huggingface.co`` (optionally with ``www.``)
    are accepted. The id is built from the leading path segments
    (``owner/name``, or a single ``name``). Query strings, fragments,
    trailing slashes and repo sub-pages such as ``/tree/main`` are ignored.
    """
    from urllib.parse import urlparse

    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host not in ("huggingface.co", "www.huggingface.co"):
        return None

    segments = [part for part in parsed.path.split("/") if part]
    if not segments:
        return None

    # If the second segment is a repo sub-page marker, the id is the single
    # leading segment (e.g. "gpt2/tree/main" -> "gpt2").
    repo_subpages = ("tree", "blob", "resolve", "raw", "discussions", "commits", "commit")
    if len(segments) == 1 or segments[1] in repo_subpages:
        return segments[0]
    return "/".join(segments[:2])


def input_url():
    """Prompt until the user enters a resolvable Hugging Face model URL.

    Each entry is echoed back, checked with ``validators.url``, and reduced
    to a repo id. The id is then looked up through ``hf_api.model_info``.
    Invalid URLs and failed lookups print a message and prompt again.

    Returns:
        The object returned by ``hf_api.model_info`` for the first entry
        that resolves successfully.
    """
    while True:
        input_model_url = input("Enter model URL: ")
        print(f"You entered: {input_model_url}")

        candidate = input_model_url.strip()
        model_id = _model_id_from_url(candidate) if validators.url(candidate) else None
        if model_id is None:
            print("Invalid URL. Please enter a valid Hugging Face model URL.")
            continue

        try:
            model_info = hf_api.model_info(model_id)
        except Exception as e:
            # Best effort: report the failure and ask again.
            print(f"Error fetching model info: {str(e)}")
            continue

        print(f"Model info: {model_info}")
        return model_info

# Call the function
model_info = input_url()


In [None]:
!pip install huggingface_hub -U
!pip install backoff

import pandas as pd
from huggingface_hub import HfApi
import json
import time
import backoff  # for rate limit management

# Initialize API
api = HfApi()

# Backoff decorator to manage rate limiting
@backoff.on_exception(backoff.expo, Exception, max_tries=5, jitter=None)
def get_all_models():
    """Return every model listed on the Hub as a list.

    The listing is consumed inside this function so that errors raised while
    iterating happen within the retry wrapper. NOTE(review): this assumes
    ``list_models()`` returns a lazy iterable, as the huggingface_hub docs
    describe; returning that iterable directly would leave the retries with
    nothing to catch. On failure the whole listing is restarted, up to
    5 tries, with exponential delays and no jitter.
    """
    return list(api.list_models())

# Fetch all models with retries to handle rate limiting
models = get_all_models()
model_list = [model.__dict__ for model in models]

# Create a dataframe with one row per model
df = pd.DataFrame(model_list)

# Rank models by how many tags they carry (non-list tags count as 0) and
# keep the ten with the most.
df['tags_count'] = df['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
top_10_df = df.nlargest(10, 'tags_count')

# Save the full model data to CSV
df.to_csv('huggingface_models.csv', index=False)

# Save top 10 models to a CSV
top_10_df.to_csv('huggingface_top_10_models.csv', index=False)

# Show all ten rows; .head() would cut the output to the first five.
print(top_10_df)

# Find models that are quantized
quantized_models = df[df['tags'].apply(lambda x: 'quantized' in x if isinstance(x, list) else False)]  # looks for quantized models
print(f"Quantized models:\n{quantized_models[['id', 'tags']].head()}")

def find_child_models(base_model_id, all_models):
    """Return the entries of ``all_models`` nested under ``base_model_id``.

    An entry qualifies when its ``'id'`` starts with ``base_model_id``
    followed by a slash. The entries are returned in their original order.
    """
    def is_child(candidate):
        return candidate['id'].startswith(base_model_id + '/')

    return list(filter(is_child, all_models))

# Function to create model tree for a given base model
def create_model_tree(base_model_id, all_models):
    """Build a nested dict describing ``base_model_id`` and its descendants.

    Args:
        base_model_id: The ``'id'`` of the model to use as the tree root.
        all_models: Iterable of model dicts, each with an ``'id'`` key and,
            optionally, a ``'tags'`` list.

    Returns:
        ``None`` when no entry in ``all_models`` has exactly this id.
        Otherwise a dict with the keys ``model_id``, ``parent_models``
        (always empty here), ``child_models`` (one nested tree per entry
        returned by ``find_child_models``), ``quantizations`` (ids that
        start with ``base_model_id`` and carry the ``'quantized'`` tag) and
        ``model_merges`` (always empty here).
    """
    base_model = next((model for model in all_models if model['id'] == base_model_id), None)
    if not base_model:
        return None

    child_models = find_child_models(base_model_id, all_models)
    # Tags can be absent or None; treat that as "no tags" rather than
    # letting the membership test raise.
    quantized_versions = [
        model for model in all_models
        if model['id'].startswith(base_model_id) and 'quantized' in (model.get('tags') or ())
    ]

    return {
        "model_id": base_model_id,
        "parent_models": [],
        "child_models": [create_model_tree(child['id'], all_models) for child in child_models],
        "quantizations": [model['id'] for model in quantized_versions],
        "model_merges": []  # Placeholder for model merges, can be extended later
    }

# Example: build the tree for one base model and print it as indented JSON
base_model_id = 'deepseek-ai/DeepSeek-R1'  # Replace with any model ID
model_tree = create_model_tree(base_model_id, model_list)
tree_json = json.dumps(model_tree, indent=2)
print(f"Model Tree for {base_model_id}: {tree_json}")


                                                        id author   sha  \
9461                                           espnet/xeus   None  None   
5507                                       cis-lmu/glotlid   None  None   
953771                          vonjack/opus-mt-mul-en-big   None  None   
1027546  Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_...   None  None   
1027617  Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_...   None  None   

        last_modified                created_at  private gated disabled  \
9461             None 2024-06-25 04:25:33+00:00    False  None     None   
5507             None 2023-10-19 23:46:58+00:00    False  None     None   
953771           None 2024-09-18 13:27:10+00:00    False  None     None   
1027546          None 2024-10-09 15:24:39+00:00    False  None     None   
1027617          None 2024-10-09 15:54:32+00:00    False  None     None   

         downloads downloads_all_time  ...  siblings spaces safetensors  \
9461            18     