Doc: https://huggingface.co/audeering/wav2vec2-large-robust-24-ft-age-gender

In [1]:
from datasets import load_dataset

nome_dataset = "google/fleurs"
idioma_dataset = "pt_br"

dados = load_dataset(
    nome_dataset,
    name=idioma_dataset,
    split='train',
    streaming=True,
    trust_remote_code=True  # <- Adicionado aqui
)

In [2]:
dados

IterableDataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_shards: 1
})

In [3]:
from transformers import pipeline

modelo = "audeering/wav2vec2-large-robust-24-ft-age-gender"
classificador = pipeline('audio-classification', model=modelo)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-24-ft-age-gender and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [4]:
classificador.feature_extractor.sampling_rate

16000

In [5]:
import IPython

for linha in dados.take(5):
    audio = linha['audio']
    predicao = classificador(audio.copy())
    print(predicao)
    display(IPython.display.Audio(data=audio['array'], rate=audio['sampling_rate']))

[{'score': 0.33536821603775024, 'label': 'child'}, {'score': 0.33344244956970215, 'label': 'male'}, {'score': 0.3311893939971924, 'label': 'female'}]


[{'score': 0.3399848937988281, 'label': 'child'}, {'score': 0.3312135338783264, 'label': 'male'}, {'score': 0.32880160212516785, 'label': 'female'}]


[{'score': 0.3400847911834717, 'label': 'child'}, {'score': 0.3303055167198181, 'label': 'female'}, {'score': 0.3296096920967102, 'label': 'male'}]


[{'score': 0.33922868967056274, 'label': 'child'}, {'score': 0.3372753858566284, 'label': 'male'}, {'score': 0.32349586486816406, 'label': 'female'}]


[{'score': 0.33778396248817444, 'label': 'male'}, {'score': 0.3367219865322113, 'label': 'child'}, {'score': 0.32549402117729187, 'label': 'female'}]


In [6]:
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class ModelHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config, num_labels):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class AgeGenderModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits_age = self.age(hidden_states)
        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)

        return hidden_states, logits_age, logits_gender

In [7]:
def process_func(
    model,
    processor,
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
    device: str = 'cpu'
) -> np.ndarray:
    r"""Predict age and gender or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)
        if embeddings:
            y = y[0]
        else:
            y = torch.hstack([y[1], y[2]])

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y

In [8]:
nome_modelo = "audeering/wav2vec2-large-robust-24-ft-age-gender"

processador = Wav2Vec2Processor.from_pretrained(nome_modelo)
modelo = AgeGenderModel.from_pretrained(nome_modelo)

In [9]:
def exibe_predicao(predicao):
    idade = predicao[0][0] * 100
    prob_mulher = predicao[0][1] * 100
    prob_homem = predicao[0][2] * 100
    prob_crianca = predicao[0][3] * 100
    print(f'Idade: {idade:.2f}')
    print(f'Pessoa inferida:\n{prob_mulher:.2f}% Mulher \n{prob_homem:.2f}% Homem\n{prob_crianca:.2f}%')

In [10]:
for linha in dados.take(5):
    x = linha['audio']['array']
    sampling_rate = linha['audio']['sampling_rate']
    predicao = process_func(model=modelo, processor=processador, x=x, sampling_rate=sampling_rate)
    exibe_predicao(predicao)
    display(IPython.display.Audio(data=x, rate=sampling_rate))

Idade: 21.44
Pessoa inferida:
0.07% Mulher 
99.88% Homem
0.05%


Idade: 22.98
Pessoa inferida:
1.59% Mulher 
98.32% Homem
0.09%


Idade: 21.20
Pessoa inferida:
0.95% Mulher 
98.95% Homem
0.10%


Idade: 21.95
Pessoa inferida:
99.51% Mulher 
0.42% Homem
0.07%


Idade: 30.41
Pessoa inferida:
99.30% Mulher 
0.66% Homem
0.04%


In [11]:
import sounddevice as sd

duracao = 10
taxa_amostragem = 16000
tamanho_vetor = int(duracao * taxa_amostragem)

gravacao = sd.rec(tamanho_vetor, samplerate=taxa_amostragem, channels=1).ravel()
sd.wait()

PortAudioError: Error querying device -1

In [None]:
predicao = process_func(model=modelo, processor=processador, x=gravacao, sampling_rate=taxa_amostragem)
exibe_predicao(predicao)

In [12]:
from datasets import load_dataset

dados = load_dataset(
    'ashraq/esc50',
    split='train[:10]'
)
for linha in dados:
    print(linha)

Repo card metadata block was not found. Setting CardData to empty.


{'filename': '1-100032-A-0.wav', 'fold': 1, 'target': 0, 'category': 'dog', 'esc10': True, 'src_file': 100032, 'take': 'A', 'audio': {'path': None, 'array': array([0., 0., 0., ..., 0., 0., 0.], shape=(220500,)), 'sampling_rate': 44100}}
{'filename': '1-100038-A-14.wav', 'fold': 1, 'target': 14, 'category': 'chirping_birds', 'esc10': False, 'src_file': 100038, 'take': 'A', 'audio': {'path': None, 'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
        0.04049683,  0.00274658], shape=(220500,)), 'sampling_rate': 44100}}
{'filename': '1-100210-A-36.wav', 'fold': 1, 'target': 36, 'category': 'vacuum_cleaner', 'esc10': False, 'src_file': 100210, 'take': 'A', 'audio': {'path': None, 'array': array([-0.00695801, -0.01251221, -0.01126099, ...,  0.215271  ,
       -0.00875854, -0.28903198], shape=(220500,)), 'sampling_rate': 44100}}
{'filename': '1-100210-B-36.wav', 'fold': 1, 'target': 36, 'category': 'vacuum_cleaner', 'esc10': False, 'src_file': 100210, 'take': 'B', '

In [13]:
modelo = 'laion/clap-htsat-fused'
classificador = pipeline("zero-shot-audio-classification", model=modelo)

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Device set to use cpu


In [14]:
classificador.feature_extractor.sampling_rate

48000

In [21]:
from datasets import Audio

dados = dados.cast_column("audio", Audio(sampling_rate=48000))

dados

Dataset({
    features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 10
})

In [25]:
labels = [
    "sound of a dog",
     "sound of a birds",
     "sound of a cat",
     "sound of a cow",
     "sound of a vaccum cleaner"
]

for linha in dados:
    categoria = linha['category']
    predicao = classificador(linha['audio']['array'], candidate_labels=labels)
    print(f'Tipo:{categoria}\nPredição: {predicao[0]["label"]} (score: {predicao[0]["score"]}')
    print('-----')

Tipo:dog
Predição: sound of a dog (score: 0.9993540644645691
-----
Tipo:chirping_birds
Predição: sound of a birds (score: 0.9946568012237549
-----
Tipo:vacuum_cleaner
Predição: sound of a vaccum cleaner (score: 0.9997842907905579
-----
Tipo:vacuum_cleaner
Predição: sound of a vaccum cleaner (score: 0.9999473094940186
-----
Tipo:thunderstorm
Predição: sound of a cow (score: 0.8593886494636536
-----
Tipo:thunderstorm
Predição: sound of a cow (score: 0.9299669861793518
-----
Tipo:door_wood_knock
Predição: sound of a cow (score: 0.9821761250495911
-----
Tipo:can_opening
Predição: sound of a dog (score: 0.5927196145057678
-----
Tipo:crow
Predição: sound of a cow (score: 0.5825767517089844
-----
Tipo:door_wood_knock
Predição: sound of a cow (score: 0.6561239957809448
-----
