# Audio classification with a pipeline

In [21]:
from datasets import load_dataset
from datasets import Audio

# Load the Korean (ko-KR) training split of MINDS-14 and declare the audio
# column with a 16 kHz sampling rate in a single chained expression.
minds = load_dataset(
    "PolyAI/minds14",
    name="ko-KR",
    split="train",
).cast_column("audio", Audio(sampling_rate=16_000))

In [2]:
from transformers import pipeline

# Audio-classification pipeline used by every prediction cell in this section.
classifier = pipeline(
    "audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

2024-01-21 11:47:10.354724: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of the model checkpoint at anton-l/xtreme_s_xlsr_300m_minds14 were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

In [3]:
# Inspect the first example: its full contents and how many fields it has.
example = minds[0]
print(example)
print(len(example))

{'path': '/storage/hf-datasets-cache/all/datasets/90720017063314-config-parquet-and-info-PolyAI-minds14-0f8c9bcc/downloads/extracted/73ab7f9ade76d3035add3081ab277e5bae46f3dbe75008135fea9e528746677a/ko-KR~ATM_LIMIT/602bef265f67b421554f65e7.wav', 'audio': {'path': '602bef265f67b421554f65e7.wav', 'array': array([ 2.45040166e-04,  1.78244198e-04, -1.80276402e-06, ...,
        5.53574704e-04,  4.92824474e-04,  2.39125075e-04]), 'sampling_rate': 16000}, 'transcription': 'app Manager 하고 싶은데 최대 금액이 얼마인지요', 'english_transcription': 'I want to do app manager, what is the maximum amount', 'intent_class': 3, 'lang_id': 8}
6


In [4]:
# Classify the raw waveform of the example; the result is a list of score/label dicts.
classifier(example["audio"]["array"])

[{'score': 0.9984392523765564, 'label': 'atm_limit'},
 {'score': 0.0002920173283200711, 'label': 'cash_deposit'},
 {'score': 0.00021480668510776013, 'label': 'joint_account'},
 {'score': 0.00018336036009714007, 'label': 'abroad'},
 {'score': 0.0001707376359263435, 'label': 'card_issues'}]

In [22]:
# `int2str` of the `intent_class` feature converts integer class ids to label names.
id2label = minds.features["intent_class"].int2str
# Label name of the current example, for comparison with the classifier output.
id2label(example["intent_class"])

'pay_bill'

## Running the classifier

In [14]:
from tqdm import tqdm  # Import tqdm for progress bars

def generate_predicted(minds):
    """Classify every example in ``minds`` and return the top label for each.

    Prints the number of examples, then runs the notebook-level ``classifier``
    on each example's audio array while showing a tqdm progress bar.

    Args:
        minds: An indexable dataset whose rows hold the waveform under
            ``row["audio"]["array"]``.

    Returns:
        A list of label strings, one per example, in dataset order.
    """
    total = len(minds)
    print(total)
    # Entry [0] of each classifier result is the highest-scoring prediction.
    return [
        classifier(minds[row]["audio"]["array"])[0]['label']
        for row in tqdm(range(total))
    ]

# Predicted labels for the whole Korean split (runs the model once per clip).
predicted = generate_predicted(minds)
# Reference labels: the integer `intent_class` column converted to label names.
ground_truth = id2label(minds['intent_class'])

592


100%|██████████| 592/592 [13:23<00:00,  1.36s/it]


## Classification report & Confusion matrix

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

def performance_report(y_test, y_pred):
    """Print summary metrics, a per-class report and the confusion matrix.

    Args:
        y_test: Reference labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Accuracy is printed as-is; precision, recall and F1 use the 'weighted'
    average. Nothing is returned: all results are written to stdout.
    """
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    # Compute every headline metric before printing anything.
    headline = {"Accuracy": accuracy_score(y_test, y_pred)}
    for title, scorer in weighted_scorers:
        headline[title] = scorer(y_test, y_pred, average='weighted')

    for title, value in headline.items():
        print(f"{title}: {value:.2f}")

    # Per-class precision / recall / F1 table.
    per_class = classification_report(y_test, y_pred)
    print("Classification Report:\n", per_class)

    # Counts of reference vs. predicted label pairs.
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

# Evaluate the Korean predictions against the reference labels.
performance_report(ground_truth, predicted)

Accuracy: 0.89
Precision: 0.90
Recall: 0.89
F1 Score: 0.89
Classification Report:
                      precision    recall  f1-score   support

             abroad       0.81      0.77      0.79        44
            address       0.98      0.98      0.98        44
          app_error       0.80      0.92      0.86        39
          atm_limit       0.95      0.85      0.90        41
            balance       0.92      0.90      0.91        40
      business_loan       0.98      0.98      0.98        43
        card_issues       0.69      0.93      0.79        45
       cash_deposit       0.97      0.85      0.91        46
       direct_debit       0.96      1.00      0.98        44
             freeze       0.80      0.80      0.80        35
 high_value_payment       0.93      0.93      0.93        44
      joint_account       0.98      0.98      0.98        44
latest_transactions       0.89      0.83      0.86        41
           pay_bill       0.94      0.76      0.84        42



## Try on a different language

In [17]:
# Load the Australian-English (en-AU) training split and set the sampling rate
# of *that* split's audio column to 16 kHz.
# The cast must be applied to `minds_en`: casting `minds` here would discard the
# English data just loaded and evaluate the Korean dataset a second time.
minds_en = load_dataset("PolyAI/minds14", name="en-AU", split="train")
minds_en = minds_en.cast_column("audio", Audio(sampling_rate=16_000))

In [18]:
# Repeat the prediction + evaluation steps on the English split.
predicted = generate_predicted(minds_en)
# NOTE(review): `id2label` was taken from the Korean split's features; this assumes
# both splits share the same intent-class mapping — confirm.
ground_truth = id2label(minds_en['intent_class'])
performance_report(ground_truth, predicted)

654


100%|██████████| 654/654 [13:58<00:00,  1.28s/it]

Accuracy: 0.95
Precision: 0.95
Recall: 0.95
F1 Score: 0.95
Classification Report:
                      precision    recall  f1-score   support

             abroad       1.00      0.83      0.91        35
            address       0.98      0.98      0.98        52
          app_error       0.91      0.98      0.94        42
          atm_limit       0.98      0.94      0.96        50
            balance       0.96      0.96      0.96        48
      business_loan       0.96      1.00      0.98        46
        card_issues       0.94      0.96      0.95        48
       cash_deposit       1.00      0.93      0.96        43
       direct_debit       0.96      1.00      0.98        47
             freeze       0.98      0.91      0.95        47
 high_value_payment       0.84      0.93      0.88        45
      joint_account       0.98      1.00      0.99        49
latest_transactions       0.91      0.91      0.91        53
           pay_bill       0.96      0.96      0.96        49






# Automatic speech recognition with a pipeline

In [55]:
from transformers import pipeline

# The pipeline's default wav2vec2 model did not output proper Korean characters,
# so a Korean wav2vec2 checkpoint is specified explicitly.
asr = pipeline("automatic-speech-recognition", model="anantoj/wav2vec2-xls-r-1b-korean")

config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Some weights of the model checkpoint at anantoj/wav2vec2-xls-r-1b-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at anantoj/wav2vec2-xls-r-1b-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN t

tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [67]:
# Transcribe the first Korean example; the printed example includes the
# dataset's own `transcription` field for comparison.
example = minds[0]
print(example)
asr(example["audio"]["array"])

{'text': ' 에이티엠에 돈을 인출하고 싶은데 최대 금액이 얼마인지요'}

In [68]:
# Show the example again to compare its `transcription` field with the ASR output.
print(example)

{'path': '/storage/hf-datasets-cache/all/datasets/90720017063314-config-parquet-and-info-PolyAI-minds14-0f8c9bcc/downloads/extracted/73ab7f9ade76d3035add3081ab277e5bae46f3dbe75008135fea9e528746677a/ko-KR~ATM_LIMIT/602bef265f67b421554f65e7.wav', 'audio': {'path': '602bef265f67b421554f65e7.wav', 'array': array([ 2.45040166e-04,  1.78244198e-04, -1.80276402e-06, ...,
        5.53574704e-04,  4.92824474e-04,  2.39125075e-04]), 'sampling_rate': 16000}, 'transcription': 'app Manager 하고 싶은데 최대 금액이 얼마인지요', 'english_transcription': 'I want to do app manager, what is the maximum amount', 'intent_class': 3, 'lang_id': 8}


## Trying with Whisper models

In [77]:
# Rebind `asr` to a Whisper pipeline; the cells that follow use this model.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [78]:
# Transcribe the second example of the Korean split with the Whisper pipeline.
example = minds[1]
asr(example["audio"]["array"])

{'text': ' 제가 에이티엠기에서 최대 얼마까지 돈을 인출할 수 있을까요'}

## Audio play with Gradio for evaluation
>* Compare the reference transcription from the "PolyAI/minds14" dataset with the ASR output of the "openai/whisper-large-v3" model
>* Other interesting models: "starcel/asr-conformer-kdialectspeech", for modeling the dialects of elderly Korean speakers

In [81]:
# Pick a random example. shuffle() is called without a seed, so the chosen
# example presumably differs between runs (confirm against the datasets docs).
example = minds.shuffle()[0]
# Reference transcription stored in the dataset.
print(example['transcription'])
# ASR output for the same clip; kept in `transcribed` for the Gradio cell that follows.
transcribed = asr(example["audio"]["array"])
transcribed

안녕하세요 저는 몇 주 동안 휴가 리그 들어가는데 카드 쓸 수 있는지 물어보고 싶어서 전화 드릴까요


{'text': ' 안녕하세요. 저는 몇 주 동안 휴가 미국으로 가는데 카드 쓸 수 있는지 물어보고 싶어서 전화드렸어요.'}

In [82]:
import gradio as gr

def generate_audio(example):
    """Return playable audio and a caption for one dataset example.

    Args:
        example: A dataset row with an "audio" entry (holding "sampling_rate"
            and "array"), an "intent_class" id and a "transcription" string.

    Returns:
        A pair ``((sampling_rate, array), caption)`` where the caption has the
        form "<intent label>: <transcription>".
    """
    clip = example["audio"]
    waveform = (clip["sampling_rate"], clip["array"])
    caption = id2label(example["intent_class"]) + ": " + example['transcription']
    return waveform, caption
    
with gr.Blocks() as demo:
    with gr.Column():

        # Playable (sampling_rate, array) pair plus "<intent>: <transcription>" caption.
        audio, label = generate_audio(example)
        # Append the Whisper result; str(list(...)) renders it as a one-element list literal.
        label = label + "\n Whisper: " + str(list(transcribed.values()))
        print(label)
        # Audio player captioned with both transcriptions for side-by-side comparison.
        output = gr.Audio(audio, label=label)

# share=True also publishes a temporary public URL for the demo.
# NOTE(review): debug=True appears to keep the cell running until it is interrupted — confirm.
demo.launch(debug=True, share=True)



abroad: 안녕하세요 저는 몇 주 동안 휴가 리그 들어가는데 카드 쓸 수 있는지 물어보고 싶어서 전화 드릴까요
 Whisper: [' 안녕하세요. 저는 몇 주 동안 휴가 미국으로 가는데 카드 쓸 수 있는지 물어보고 싶어서 전화드렸어요.']
Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://cd1b5e15e248f7b900.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://cd1b5e15e248f7b900.gradio.live




## Transcribing your own audio file

In [86]:

# If your recording is in another format (e.g. WMA), convert it first, for example:
#   ffmpeg -i yourfile.wma -acodec pcm_s16le -ar 16000 yourfile.wav
#
# Compatible formats: MP3, FLAC, OGG, AAC, AIFF, M4A

input_file = "Hangang_river_flows.m4a"
# Read the file's raw bytes; they are passed to the ASR pipeline unchanged.
with open(input_file, "rb") as audio_file:
    audio_data = audio_file.read()

# Transcribe the recording and display the result.
transcribed = asr(audio_data)
transcribed

{'text': ' 서울의 중심에는 한강 하류가 동에서 서쪽으로 흐르고 있다.'}

In [87]:
with gr.Blocks() as demo:
    with gr.Column():
        # Caption the player with the transcription (rendered as a one-element list literal).
        label = str(list(transcribed.values()))
        print(label)
        # NOTE(review): `audio_data` holds the raw file bytes, unlike the
        # (sampling_rate, array) pair used in the earlier demo — confirm that the
        # installed Gradio version accepts bytes as the Audio value.
        output = gr.Audio(audio_data, label=label)

# share=True also publishes a temporary public URL for the demo.
demo.launch(debug=True, share=True)

[' 서울의 중심에는 한강 하류가 동에서 서쪽으로 흐르고 있다.']
Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://21183800d920214e03.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://21183800d920214e03.gradio.live




## Audio generation

In [124]:
from transformers import pipeline
from transformers import AutoTokenizer

# Note: "facebook/wav2vec2-base-960h" is an ASR model, not a TTS model,
# so a text-to-speech checkpoint is used here instead.
model_name = "suno/bark-small"  # Replace with your TTS model name
# Loaded here only for inspection of its configuration.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

BertTokenizerFast(name_or_path='suno/bark-small', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [125]:
# Prepare your text input
text = "Ladybugs have had important roles in culture and religion, being associated with luck, love, fertility and prophecy."

# Build the text-to-speech pipeline from the model name alone; no tokenizer
# object is passed in explicitly.
pipe = pipeline("text-to-speech", model=model_name)


In [126]:
# Synthesize speech for `text`; later cells read output["audio"] and output["sampling_rate"].
output = pipe(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [117]:
# NOTE: this import rebinds `Audio` to the IPython player, shadowing the
# `datasets.Audio` feature imported at the top of the notebook. Re-running the
# dataset-loading cells after this one would pick up the wrong `Audio`.
from IPython.display import Audio

# Play the generated waveform inline.
Audio(output["audio"], rate=output["sampling_rate"])

In [118]:
# Same text-to-speech pipeline, this time with French input.
fr_text = "Contrairement à une idée répandue, le nombre de points sur les élytres d'une coccinelle ne correspond pas à son âge, ni en nombre d'années, ni en nombre de mois. "
output = pipe(fr_text)
Audio(output["audio"], rate=output["sampling_rate"])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [119]:
# Lyrics wrapped in ♪ symbols — presumably a cue for the model to sing (confirm in the Bark docs).
song = "♪ In the jungle, the mighty jungle, the ladybug was seen. ♪ "
output = pipe(song)
Audio(output["audio"], rate=output["sampling_rate"])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


## Generating music

In [121]:
# Text-to-audio pipeline using the "facebook/musicgen-small" checkpoint.
music_pipe = pipeline("text-to-audio", model="facebook/musicgen-small")

In [122]:
# Text prompt describing the music to generate (rebinds `text` from the speech example).
text = "90s rock song with electric guitar and heavy drums"

In [123]:
# Arguments forwarded to the model call; max_new_tokens presumably bounds the
# length of the generated clip (confirm in the transformers docs).
forward_params = {"max_new_tokens": 512}

output = music_pipe(text, forward_params=forward_params)
# Play the first entry of the generated audio with the IPython `Audio` player.
Audio(output["audio"][0], rate=output["sampling_rate"])