<a href="https://colab.research.google.com/github/hamidahoderinwale/model_metadata_analyses/blob/main/scraping_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Script 1: takes input model url, validates url, and gives model metadata
!pip install validators
from huggingface_hub import HfApi
import validators
import json
import csv

# Shared Hub client; input_url() below calls hf_api.model_info() on it.
hf_api = HfApi()

def input_url():
    """Prompt for a Hugging Face model URL until its metadata can be fetched.

    Each attempt is echoed back, checked with ``validators.url`` and required
    to point at the ``huggingface.co`` host with a non-empty path. The path
    (without surrounding slashes) is used as the model ID for
    ``hf_api.model_info``. On success the metadata is printed as JSON, written
    to ``model_info.json`` (best effort), printed attribute by attribute, and
    returned. On an invalid URL or a failed lookup the user is prompted again.

    Returns:
        The object returned by ``hf_api.model_info`` for the entered model.
    """
    # Local import so this block does not depend on the cell's import list.
    from urllib.parse import urlparse

    while True:
        input_model_url = input("Enter model URL: ")
        print(f"You entered: {input_model_url}")

        candidate = input_model_url.strip()
        parsed = urlparse(candidate)
        host = parsed.hostname or ""
        # Compare the parsed host rather than searching the whole string, so a
        # URL that merely contains "huggingface.co" in its path is rejected.
        is_hf_host = host == "huggingface.co" or host.endswith(".huggingface.co")
        # Take the ID from the path only: drops query strings and stray slashes.
        model_id = parsed.path.strip("/")

        if not (validators.url(candidate) and is_hf_host and model_id):
            print("Invalid URL. Please enter a valid Hugging Face model URL.") # error code
            continue

        try:
            model_info = hf_api.model_info(model_id) # Get model info: https://huggingface.co/docs/huggingface_hub/v0.29.2/en/package_reference/hf_api#huggingface_hub.ModelInfo
        except Exception as e:
            # Without a result there is nothing to print or return: ask again
            # instead of falling through to code that reads model_info.
            print(f"Error fetching model info: {str(e)}")
            continue

        try:
            json_output = json.dumps(model_info.__dict__, indent=4, default=str)
            print(json_output)
            with open('model_info.json', 'w', encoding="utf-8") as json_file:
                json_file.write(json_output)
        except (OSError, TypeError, ValueError) as e:
            # Saving is best effort; the fetched metadata is still returned.
            print(f"Error saving model info: {e}")

        for key, value in model_info.__dict__.items():
            print(f"{key}: {value}")
        # Return only after every attribute has been printed.
        return model_info

# Run the prompt loop; whatever input_url() returns is kept in `model_info`.
model_info = input_url()

# DeepSeek link for testing: https://huggingface.co/deepseek-ai/DeepSeek-R1


In [53]:
# Script 2
  # 1. Take link as input (format check). This is the "main model"
  # 2. Give the link to the page with the fine-tunes for the inputted model

from huggingface_hub import HfApi
import requests
from bs4 import BeautifulSoup
import re

# Initialize the Hub API client.
# NOTE(review): nothing else in Script 2 references `api`; the fine-tune lookup
# below goes through `requests` and BeautifulSoup instead.
api = HfApi()

# Function to validate Hugging Face model URL
def validate_hf_model_url(url):
    """Split a Hugging Face model URL into its owner and model name.

    Accepts exactly ``https://huggingface.co/<owner>/<name>``. The owner may
    contain word characters and hyphens; the name may additionally contain
    dots after its first character (e.g. ``Llama-3.1-8B``).

    Args:
        url: The URL string to check.

    Returns:
        A ``(owner, name)`` tuple when the URL matches, otherwise ``None``.
    """
    # The dot in the host is escaped so it only matches a literal ".", and
    # fullmatch() rejects trailing characters such as a newline.
    pattern = r"https://huggingface\.co/([\w\-]+)/([\w\-][\w.\-]*)"
    match = re.fullmatch(pattern, url)
    if match:
        return match.groups()  # Returns (org/user, model_name)
    return None

# Function to find fine-tuned models
def get_finetuned_models_page(model_org, model_name):
    """Return Hub URLs whose path mentions *model_name* but has a different owner.

    Requests the Hub model search page for ``model_name``, collects every
    anchor ``href`` containing the name (case-insensitive), and drops those
    whose first path segment equals ``model_org``. This is a name-based
    heuristic over the search page's links, not a lookup of declared
    base-model relationships.

    Args:
        model_org: Owner of the main model; links under this owner are skipped.
        model_name: Model name used as the search term and as the href filter.

    Returns:
        A list of ``https://huggingface.co...`` URLs in page order without
        duplicates; an empty list when the request fails or does not return
        HTTP 200.
    """
    try:
        # params= lets requests URL-encode the search term; the timeout keeps
        # a stalled connection from hanging the caller indefinitely.
        response = requests.get(
            "https://huggingface.co/models",
            params={"search": model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error searching the Hub for {model_name}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    needle = model_name.lower()
    seen = set()
    links = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if needle not in href.lower():
            continue
        # Compare the owner segment exactly; a substring test on the whole
        # href would also drop unrelated links that merely contain the org.
        owner = href.strip("/").split("/")[0]
        if owner == model_org or href in seen:
            continue
        seen.add(href)
        links.append(f"https://huggingface.co{href}")
    return links

# Main execution: read one model URL and list the search hits owned by others
model_url = input("Enter the Hugging Face model URL: ").strip()

validated = validate_hf_model_url(model_url)
if not validated:
    print("Invalid Hugging Face model URL format.")
else:
    org, model_name = validated
    finetune_links = get_finetuned_models_page(org, model_name)

    if not finetune_links:
        print("No fine-tuned models found for this model.")
    else:
        print("Fine-tuned models found:")
        # One link per line, in the order the lookup returned them.
        print(*finetune_links, sep="\n")



Enter the Hugging Face model URL: https://huggingface.co/deepseek-ai/DeepSeek-R1
Fine-tuned models found:
https://huggingface.co/unsloth/DeepSeek-R1-GGUF
https://huggingface.co/nvidia/DeepSeek-R1-FP4
https://huggingface.co/meituan/DeepSeek-R1-Block-INT8
https://huggingface.co/mradermacher/DeepSeek-R1-Distill-Qwen-14B-Uncensored-GGUF
https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF
https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ
https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B
https://huggingface.co/suayptalha/DeepSeek-R1-Distill-Llama-3B
https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8
https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF
https://huggingface.co/huihui-ai/DeepSeek-R1-Distill-Qwen-32B-abliterated
https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF
https://huggingface.co/bartowski/DeepSeek-R1-GGUF
https://huggingface.co/unsloth/DeepSeek-R1-BF16
https://huggingface.co/mlx-community/DeepSeek-R1-Disti

In [None]:
import requests
import json
import csv
from huggingface_hub import HfApi
from bs4 import BeautifulSoup
import re

# Hub API client; dfs_finetunes() below uses it to fetch per-model metadata.
api = HfApi()

# Function to validate Hugging Face model URL
def validate_hf_model_url(url):
    """Return ``(owner, name)`` for a Hugging Face model URL, or ``None``.

    Only ``https://huggingface.co/<owner>/<name>`` is accepted. The owner may
    contain word characters and hyphens; the name may also contain dots after
    its first character (e.g. ``Llama-3.1-8B``).

    Args:
        url: The URL string to check.

    Returns:
        A two-tuple of strings on a match, otherwise ``None``.
    """
    # Escaped dot = literal "." in the host; fullmatch() forbids any trailing
    # characters, including the newline that "$" would tolerate.
    pattern = r"https://huggingface\.co/([\w\-]+)/([\w\-][\w.\-]*)"
    match = re.fullmatch(pattern, url)
    return match.groups() if match else None

# Function to find fine-tuned models
def get_finetuned_models_page(model_org, model_name):
    """List Hub URLs that mention *model_name* and are not owned by *model_org*.

    The Hub model search page is requested with ``model_name`` as the search
    term. Every anchor ``href`` containing the name (case-insensitive) is
    kept unless its first path segment equals ``model_org``. The result is a
    name-based heuristic over the page's links, not a lookup of declared
    base-model relationships.

    Args:
        model_org: Owner of the main model; links under this owner are skipped.
        model_name: Model name used as the search term and as the href filter.

    Returns:
        De-duplicated ``https://huggingface.co...`` URLs in page order, or an
        empty list when the request fails or does not return HTTP 200.
    """
    try:
        # Let requests encode the query, and bound the wait so one stalled
        # request cannot hang a whole crawl.
        response = requests.get(
            "https://huggingface.co/models",
            params={"search": model_name},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error searching the Hub for {model_name}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    needle = model_name.lower()
    seen = set()
    links = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if needle not in href.lower():
            continue
        # Exact owner comparison: a substring test on the whole href would
        # also discard unrelated links that happen to contain the org text.
        owner = href.strip("/").split("/")[0]
        if owner == model_org or href in seen:
            continue
        seen.add(href)
        links.append(f"https://huggingface.co{href}")
    return links

# Recursive DFS for finding fine-tunes
def dfs_finetunes(model_url, visited, depth=0, results=None, max_depth=None):
    """Walk fine-tune candidates depth-first, collecting metadata per model.

    Starting from ``model_url``, each unvisited URL is validated, its metadata
    is fetched through ``api.model_info``, and the links returned by
    ``get_finetuned_models_page`` are explored next, in pre-order. The walk
    uses an explicit stack, so a long chain of models cannot hit Python's
    recursion limit.

    Args:
        model_url: URL of the model to start from.
        visited: Set of URLs already seen; updated in place.
        depth: Depth assigned to ``model_url`` (children get ``depth + 1``).
        results: Optional list to append to; a new list is created when None.
        max_depth: Optional cap. Models at this depth are still recorded but
            their fine-tunes are not looked up. ``None`` means no cap.

    Returns:
        The ``results`` list of dicts with keys ``model_id``, ``url``,
        ``metadata`` and ``depth``.
    """
    if results is None:
        results = []

    # Each entry is (url, depth); popping from the end gives depth-first order.
    stack = [(model_url, depth)]
    while stack:
        url, level = stack.pop()

        # Checked on pop (not on push) so a URL first reached deeper in an
        # earlier sibling's subtree is recorded there, as in a recursive walk.
        if url in visited:
            continue
        visited.add(url)

        validated = validate_hf_model_url(url)
        if not validated:
            print(f"Invalid URL skipped: {url}")
            continue

        model_org, model_name = validated
        model_id = f"{model_org}/{model_name}"

        print(f"\n{'  ' * level}Fetching metadata for: {model_id}")
        try:
            model_metadata = api.model_info(model_id).__dict__
        except Exception as e:
            print(f"Error fetching metadata: {e}")
            continue

        results.append({
            "model_id": model_id,
            "url": url,
            "metadata": model_metadata,
            "depth": level
        })

        if max_depth is not None and level >= max_depth:
            continue

        finetune_links = get_finetuned_models_page(model_org, model_name)
        print(f"{'  ' * level}Found {len(finetune_links)} fine-tunes at depth {level}.")

        # Push in reverse so the first link is popped (and explored) first.
        stack.extend((link, level + 1) for link in reversed(finetune_links))

    return results

# Function to save results as JSON
def save_json(results, model_name):
    """Write *results* to ``<model_name>_finetunes.json`` under a "models" key.

    Values that JSON cannot encode natively are stringified via ``default=str``.
    A confirmation line naming the file is printed afterwards.
    """
    filename = f"{model_name}_finetunes.json"
    with open(filename, "w") as handle:
        json.dump({"models": results}, handle, indent=4, default=str)
    print(f"Results saved to {filename}")

# Function to save results as CSV
def save_csv(results, model_name):
    """Write the id, URL and depth of each result to ``<model_name>_finetunes.csv``.

    Only the ``model_id``, ``url`` and ``depth`` keys of each entry are
    written; any other keys are left out. A confirmation line naming the file
    is printed afterwards.
    """
    columns = ["model_id", "url", "depth"]
    filename = f"{model_name}_finetunes.csv"
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        writer.writerows({col: entry[col] for col in columns} for entry in results)
    print(f"Results saved to {filename}")

# Main execution: crawl from one model URL and export whatever was collected
if __name__ == "__main__":
    model_url = input("Enter the Hugging Face model URL: ").strip()
    visited = set()
    results = dfs_finetunes(model_url, visited)

    if not results:
        print("No fine-tuned models found.")
    else:
        # Output files are named after the first recorded model (text after the last "/").
        model_name = results[0]["model_id"].rpartition("/")[2]
        save_json(results, model_name)
        save_csv(results, model_name)
