In [None]:
# Transformers Export to TorchScript
# https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/clap

from datasets import load_dataset
from transformers import AutoProcessor, ClapModel

# Load ESC-50 and take the waveform of the first training clip.
# Index the row first and the "audio" column second: asking for the whole
# dataset["train"]["audio"] column decodes every clip in the split just to
# read element 0.
dataset = load_dataset("ashraq/esc50")
audio_sample = dataset["train"][0]["audio"]["array"]

# torchscript=True configures the model for the tracing done in the export cell.
model = ClapModel.from_pretrained("laion/clap-htsat-unfused", torchscript=True)
model.eval()
# `torchscript` is a model-config flag; the processor (tokenizer + feature
# extractor) does not need it.
processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

input_text = ["The sound of a moderate-length input string"]

# NOTE(review): no `sampling_rate` is passed here, so the feature extractor
# cannot check the clip against the rate it expects -- confirm the dataset's
# sampling rate matches (resample if it does not).
inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

# Smoke test: one joint text + audio forward pass through the full model.
outputs = model(**inputs)

In [None]:
from torch import Tensor, jit, nn


class TextFeaturesExport(nn.Module):
    """Expose ``get_text_features`` of a CLAP model as ``forward`` for tracing.

    Wrapping the model in a module lets ``jit.trace`` register its weights as
    module parameters instead of capturing them as free tensors from a closure.
    """

    def __init__(self, clap_model):
        super().__init__()
        self.clap_model = clap_model

    def forward(self, input_ids, attention_mask):
        return self.clap_model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)


class AudioFeaturesExport(nn.Module):
    """Expose ``get_audio_features`` of a CLAP model as ``forward`` for tracing.

    The input is the ``input_features`` tensor produced by the processor's
    feature extractor, NOT a raw waveform: the feature extractor is not torch
    code, so it cannot be part of the traced graph.
    """

    def __init__(self, clap_model):
        super().__init__()
        self.clap_model = clap_model

    def forward(self, input_features):
        return self.clap_model.get_audio_features(input_features=input_features)


# Eager helpers, kept for interactive use (they are not what gets traced).
text_features_func = lambda input_ids, attention_mask: model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
tokenized_inputs = processor.tokenizer(input_text[0], return_tensors="pt", padding=True)
text_features_dummy_input = (tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"])

audio_features_func = lambda audio_tensor: model.get_audio_features(**processor.feature_extractor(audio_tensor, return_tensors="pt", padding=True))
audio_features_dummy_input = Tensor(audio_sample)

# Run the feature extractor once, outside the trace. If it ran inside the
# traced callable, its result would be frozen into the graph as a constant and
# the exported model would return the same embedding for every input.
audio_input_features = processor.feature_extractor(
    audio_features_dummy_input, return_tensors="pt", padding=True
)["input_features"]

print("Text features test shape: ", text_features_func(*text_features_dummy_input).shape)
print("Audio features test shape: ", audio_features_func(audio_features_dummy_input).shape)

print("Tracing text features model")
jit.trace(TextFeaturesExport(model), text_features_dummy_input).save("laion_clap_htsat_unfused_get_text_features.pt")
print("Tracing audio features model")
# The exported audio model takes precomputed `input_features`; callers must run
# the feature extractor themselves before invoking it.
jit.trace(AudioFeaturesExport(model), (audio_input_features,)).save("laion_clap_htsat_unfused_get_audio_features.pt")
