# Hugging Face model navigation

## How many models

In [1]:
from huggingface_hub import HfApi

api = HfApi()

# Count the models on the Hub that are tagged with a given task.
# The `task=` keyword is deprecated (the Hub client warns "Use `filter`
# instead."), so the task name is passed through `filter`.
# The listing is materialised once so `models` stays a reusable list rather
# than a generator that is already exhausted after counting.
models = list(api.list_models(filter="text-to-image"))
print(f"Task: text-to-image, Models: {len(models)}")


Use `filter` instead.


Task: text-to-image, Models: 83575


## Finding the most popular text-to-image model

In [2]:
# List up to 10 CompVis text-to-image models that can be loaded by the
# diffusers StableDiffusionPipeline, sorted by download count.
# Relies on `api` (an HfApi instance) created in an earlier cell.
models = list(
    api.list_models(
        # Task and library tag go through `filter`; the separate `task=` and
        # `tags=` keywords are deprecated ("Use `filter` instead.").
        filter=["text-to-image", "diffusers:StableDiffusionPipeline"],
        author="CompVis",
        # Sort according to the most popular
        sort="downloads",
        limit=10,
    )
)

print(f"Found {len(models)} models (showing ids):")
for model_info in models:
    print(" -", model_info.id)

Found 4 models (showing ids):
 - CompVis/stable-diffusion-v1-4
 - CompVis/stable-diffusion-v1-1
 - CompVis/stable-diffusion-v1-2
 - CompVis/stable-diffusion-v1-3



Use `filter` instead.


In [3]:
#from diffusers import StableDiffusionPipeline

# Load the most popular model from models
#pipe = StableDiffusionPipeline.from_pretrained(models[0].id)

In [4]:
import json
from urllib.request import urlopen

# Download the Hub's task catalogue; the JSON payload is an object keyed by
# task name.
url = "https://huggingface.co/api/tasks"

# The response gets its own name so `url` keeps holding the URL string, and the
# timeout (seconds) stops the cell from hanging forever on a stalled connection.
with urlopen(url, timeout=30) as response:
    tasks = json.load(response)

print(tasks.keys())

dict_keys(['any-to-any', 'audio-classification', 'audio-to-audio', 'audio-text-to-text', 'automatic-speech-recognition', 'depth-estimation', 'document-question-answering', 'visual-document-retrieval', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-text-to-text', 'image-to-text', 'image-to-video', 'keypoint-detection', 'mask-generation', 'object-detection', 'video-classification', 'question-answering', 'reinforcement-learning', 'sentence-similarity', 'summarization', 'table-question-answering', 'tabular-classification', 'tabular-regression', 'text-classification', 'text-generation', 'text-ranking', 'text-to-image', 'text-to-speech', 'text-to-video', 'token-classification', 'translation', 'unconditional-image-generation', 'video-text-to-text', 'video-to-video', 'visual-question-answering', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'text-to-3d', 'image-t

# Preprocessing different modalities

## Text tokenizing

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the DistilBERT uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

text = "Do you need more éclairs?"

# Show what the tokenizer's normalizer does to the raw string before it is
# split into tokens.
normalizer = tokenizer.backend_tokenizer.normalizer
normalized_text = normalizer.normalize_str(text)
print(normalized_text)

# Last expression of the cell: the encoded input as PyTorch tensors.
tokenizer(text, padding=True, return_tensors="pt")

do you need more eclairs?


{'input_ids': tensor([[  101,  2079,  2017,  2342,  2062, 14925, 19771,  2869,  1029,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
# from transformers import BlipProcessor, BlipForConditionalGeneration

# checkpoint = "Salesforce/blip-image-captioning-base"  
# model = BlipForConditionalGeneration.from_pretrained(checkpoint) 
# processor = BlipProcessor.from_pretrained(checkpoint)

## Image preprocessing

In [7]:
# from datasets import load_dataset 

# image = load_dataset("nlphuji/flickr30k")['test'][11]["image"]
# inputs = processor(images=image, return_tensors="pt")
# output = model.generate(**inputs)
# print(processor.decode(output[0]))

## Audio preprocessing

In [8]:
# from datasets import load_dataset, Audio

# dataset = load_dataset("CSTR-Edinburgh/vctk")["train"] 
# dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

# from transformers import AutoProcessor

# processor = AutoProcessor.from_pretrained("openai/whisper-small") 
# audio_pp = processor(dataset[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

# Pipeline tasks and evaluations

## Pipeline caption generation

In [9]:
# from transformers import BlipProcessor, BlipForConditionalGeneration
# checkpoint = "Salesforce/blip-image-captioning-base"

# from transformers import pipeline 
# pipe = pipeline("image-to-text", model=checkpoint)

# from datasets import load_dataset 

# image = load_dataset("nlphuji/flickr30k")['test'][11]["image"]
# pipe(image)

In [10]:
# NOTE(review): this cell is disabled. Before re-enabling, confirm that
# "text-to-image" is a task transformers' `pipeline()` accepts — it does not
# appear to be; text-to-image checkpoints are presumably meant to be loaded
# with diffusers (see the StableDiffusionPipeline cell). It also relies on
# `api` created in the first cell.
# from huggingface_hub import HfApi
# from transformers import pipeline

# model = list(api.list_models(task="text-to-image", limit=5))
# pipe = pipeline("text-to-image", model[0].id) 


## Passing keyword arguments

In [11]:
# from transformers import pipeline
# import soundfile as sf

# # Load a text-to-audio pipeline
# musicgen = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

# # Make a dictionary to set the generation temperature to 0.8 and max_new_tokens to 1
# generate_kwargs = {"temperature": 0.8, "max_new_tokens": 1}

# # Generate an audio array passing the arguments
# outputs = musicgen("Classic rock riff", generate_kwargs=generate_kwargs)
# sf.write("output.wav", outputs["audio"][0][0], outputs["sampling_rate"])

## Model evaluation on a custom dataset

In [12]:
# NOTE(review): this cell is disabled and would not run as written — resolve
# the following before re-enabling:
#   - `evaluate.combine(...)` is called, but only `evaluator` is imported from
#     `evaluate` here (an `import evaluate` is missing).
#   - `metrics_dict` is not defined in any cell visible in this notebook, and
#     `dataset` is only assigned in the commented-out audio-preprocessing cell.
#   - `pipe` is created for "text-to-image" while the evaluator is created for
#     "image-classification"; presumably an image-classification pipeline and
#     dataset were intended — verify.
# from evaluate import evaluator
# from huggingface_hub import HfApi
# from transformers import pipeline

# model = list(api.list_models(task="text-to-image", limit=5))
# pipe = pipeline("text-to-image", model[0].id) 

# # Instantiate the task evaluator
# task_evaluator = evaluator("image-classification")

# task_evaluator.METRIC_KWARGS = {"average": "weighted"}

# # Get label map from pipeline
# label_map = pipe.model.config.label2id

# # Compute the metrics
# eval_results = task_evaluator.compute(model_or_pipeline=pipe, data=dataset, metric=evaluate.combine(metrics_dict), label_mapping=label_map)

# print(f"Precision: {eval_results['precision']:.2f}, Recall: {eval_results['recall']:.2f}")