In [1]:
import json
import time
import os
import pandas as pd
from huggingface_hub import HfApi
from huggingface_hub import ModelCard
from huggingface_hub.utils import GatedRepoError
from huggingface_hub.utils import HfHubHTTPError, EntryNotFoundError
from huggingface_hub import hf_hub_url, get_hf_file_metadata
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Hub API client; the model listing below is requested through it.
api = HfApi()

In [3]:
# Ask the Hub for the model listing with the full-metadata, card-data and
# config flags all switched on.
models_api = api.list_models(full=True, cardData=True, fetch_config=True)

In [None]:
# Materialise the listing into a list: it is measured with len() and iterated
# more than once in the following cells.
models_api = list(models_api)

In [5]:
# Number of models returned by the listing.
len(models_api)

624570

In [6]:
# Repository id of every listed model, in listing order.
models_names = [info.modelId for info in models_api]

In [7]:
# Save the list of model ids to disk as a JSON array.
with open('models_list.json', 'w') as names_file:
    names_file.write(json.dumps(models_names))

In [8]:
# Index the listing by model id, then rebuild the list following the order of
# `models_names`. An id missing from the index falls back to the bare name string.
models_api_dict = {info.modelId: info for info in models_api}
models = [models_api_dict.get(name, name) for name in models_names]

In [9]:
# Sanity check: how many model ids were collected.
len(models_names)

624570

In [10]:
# Sanity check: `models` holds one entry per name, so this should equal
# len(models_names).
len(models)

624570

In [11]:
def retrieve_model_tags(model):
    """Collect every tag attached to a model, de-duplicated in a stable order.

    Tags are gathered from three places, in this order: ``model.tags``,
    ``model.pipeline_tag`` and the ``'tags'`` entry of ``model.cardData``
    (only when that attribute exists and is not ``None``).

    Args:
        model: Object exposing ``tags`` and ``pipeline_tag`` attributes and,
            optionally, a mapping-like ``cardData`` attribute.

    Returns:
        list: The distinct, non-``None`` tags in first-seen order.
    """
    collected = []

    if model.tags is not None:
        collected.extend(model.tags)
    if model.pipeline_tag is not None:
        collected.append(model.pipeline_tag)

    card_data = getattr(model, 'cardData', None)
    if card_data is not None and 'tags' in card_data:
        card_tags = card_data['tags']
        if isinstance(card_tags, list):
            collected.extend(card_tags)
        elif card_tags is not None:
            # A non-list value (e.g. a single string) counts as one tag.
            collected.append(card_tags)

    # dict.fromkeys de-duplicates while keeping insertion order. The previous
    # list(set(...)) ordering depended on string hashing, so the saved tag
    # lists could differ between interpreter runs.
    return [tag for tag in dict.fromkeys(collected) if tag is not None]

In [12]:
def get_modelcard_text(model):
    """Download the model card of ``model`` and return its text body.

    Failures are never raised to the caller: they are appended to
    ``error_modelcard_text.json`` and the sentinel ``'error'`` is returned.

    Args:
        model: Object exposing a ``modelId`` attribute.

    Returns:
        str: The card text, or the string ``'error'`` if the card could not
        be loaded.
    """
    # SECURITY: a literal Hub access token used to be hard-coded on this line.
    # It has been exposed wherever this notebook was shared and should be
    # revoked. The token is now taken from the environment; when HF_TOKEN is
    # unset, None is passed and huggingface_hub applies its own default
    # token resolution.
    api_token = os.environ.get('HF_TOKEN')

    error_message = None
    try:
        card_text = ModelCard.load(model.modelId, token=api_token, ignore_metadata_errors=True).text
    except EntryNotFoundError:
        error_message = f'Could not find Model Card text for {model.modelId}'
        card_text = 'error'
    except Exception as e:
        # Deliberate best-effort: one bad card must not abort the whole crawl.
        error_message = f'Unexpected error on retrieving "modelcard_text" for {model.modelId}: {str(e)}'
        card_text = 'error'

    if error_message is not None:
        # One JSON string per line. Appending json.dump() of a list on every
        # failure produced back-to-back arrays, which no JSON parser accepts.
        # A single write() per record also keeps each entry in one piece when
        # several worker threads log at once (not a hard guarantee).
        with open('error_modelcard_text.json', 'a') as f:
            f.write(json.dumps(error_message) + '\n')

    return card_text

In [14]:
def process_model(model):
    """Build the output record for a single model.

    Combines the model's tags, download and like counts with its model card
    text. The card text is chosen from the value returned by
    ``api_calls_parameters(model)``:

    * contains ``'README.md'``       -> text from ``get_modelcard_text``
    * equals ``'needs authorization'`` -> ``"needs authorization"``
    * equals ``'not allowed'``         -> ``"disabled"``
    * anything else                    -> ``None``

    Args:
        model: Object exposing ``modelId``, ``downloads`` and ``likes``, plus
            whatever ``retrieve_model_tags`` and ``api_calls_parameters`` read.

    Returns:
        dict: Record with keys ``modelId``, ``tags``, ``downloads``, ``likes``
        and ``modelcard_text``; an empty dict if any step raised (the failure
        is appended to ``error_process.json``).
    """
    try:
        tags = retrieve_model_tags(model)
        # NOTE(review): api_calls_parameters is not defined in any visible cell
        # (execution counts skip from 12 to 14) — confirm its defining cell is
        # still in the notebook, otherwise every model fails here.
        files = api_calls_parameters(model)

        if 'README.md' in files:
            card_text = get_modelcard_text(model)
        elif files == 'needs authorization':
            card_text = "needs authorization"
        elif files == 'not allowed':
            card_text = "disabled"
        else:
            card_text = None

        return {'modelId': model.modelId,
                'tags': tags,
                'downloads': model.downloads,
                'likes': model.likes,
                'modelcard_text': card_text}

    except Exception as e:
        # Deliberate best-effort: log and continue so one model cannot stop the run.
        error_message = f'{getattr(model, "modelId", "Unknown model")} could not be processed: {e}'

    # One JSON string per line. Appending json.dump() of a list on every
    # failure produced back-to-back arrays, which no JSON parser accepts.
    with open('error_process.json', 'a') as f:
        f.write(json.dumps(error_message) + '\n')
    return {}

In [15]:
# Report the machine's CPU count as a candidate worker count.
# NOTE(review): os.cpu_count() may return None, and the cell that starts the
# thread pool reassigns num_threads to a literal value, so this figure is
# informational only.
num_threads = os.cpu_count()
print(f'Using {num_threads} threads for processing.')

Using 20 threads for processing.


In [None]:
start = time.time()

# Worker count for the thread pool (this overrides any earlier num_threads value).
num_threads = 20

# executor.map keeps the results in the same order as `models`.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    models_information = list(executor.map(process_model, models))

# process_model returns an empty dict for a model it could not process, so
# filter on truthiness. The previous `is not None` test never matched those
# placeholders and each one became an all-NaN row in the frame.
models_information = [record for record in models_information if record]
df = pd.DataFrame(models_information)

if 'modelId' in df.columns:
    df = df.set_index('modelId')

# Wall-clock duration of the whole crawl, in seconds.
end = time.time()
print(end - start)

In [33]:
# Move the current index back into a regular column (drop=False keeps it).
# NOTE(review): mutates df in place and is not idempotent — running this cell
# again pushes the fresh default index into yet another column.
df.reset_index(drop=False, inplace=True)

In [36]:
# Rename a column called 'index' (the name reset_index gives an unnamed index)
# to 'modelId'; this has no effect when no such column exists.
df.rename(columns={'index': 'modelId'}, inplace=True)

In [37]:
# Display the frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,modelId,tags,downloads,likes,modelcard_text
0,albert/albert-base-v1,"[license:apache-2.0, en, has_space, tf, region...",15847.0,6.0,\n# ALBERT Base v1\n\nPretrained model on Engl...
1,albert/albert-base-v2,"[jax, license:apache-2.0, en, has_space, tf, r...",2577211.0,88.0,\n# ALBERT Base v2\n\nPretrained model on Engl...
2,albert/albert-large-v1,"[license:apache-2.0, en, tf, region:us, fill-m...",1800.0,2.0,\n# ALBERT Large v1\n\nPretrained model on Eng...
3,albert/albert-large-v2,"[license:apache-2.0, en, tf, region:us, fill-m...",423704.0,13.0,\n# ALBERT Large v2\n\nPretrained model on Eng...
4,albert/albert-xlarge-v1,"[license:apache-2.0, en, tf, region:us, fill-m...",1360.0,3.0,\n# ALBERT XLarge v1\n\nPretrained model on En...
...,...,...,...,...,...
624565,Ruiz3/phi-2-kingshipAI-interpreter-price30,[region:us],0.0,0.0,\n# Model Card for Model ID\n\n<!-- Provide a ...
624566,cstr/phi3-mini-4k-llamafied-sft-v3,"[license:apache-2.0, en, region:us, text-gener...",0.0,0.0,disabled
624567,querying/whisper-small-hi,[region:us],0.0,0.0,
624568,cstr/phi3-mini-4k-llamafied-sft-v3_16bit,"[license:apache-2.0, en, region:us, text-gener...",0.0,0.0,\n# Uploaded model\n\n- **Developed by:** cst...


In [38]:
# Write the frame to disk as JSON keyed by index label (orient='index'),
# pretty-printed with a 4-space indent.
df.to_json('models_data.json', orient='index', indent=4)