# Installation requirements

* Set the Colab runtime's hardware accelerator to GPU (Runtime → Change runtime type) before running the notebook.

In [1]:
!pip install transformers datasets evaluate accelerate "librosa~=0.10.1" -U



# Training an audio classification model (fine tuning starting from a pretrained model)

In [2]:
from huggingface_hub import notebook_login

# Prompts interactively for a Hugging Face access token.
# SECURITY: never paste a token into the notebook source. The token that was
# previously written on this line is exposed and must be revoked at
# https://huggingface.co/settings/tokens.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Creating a custom Hugging Face-compatible dataset from our data

In [None]:
# Mount Google Drive, then make the shared dataset folder the working directory
# so that relative paths resolve inside it.
import os

from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/Shareddrives/CS224s/data/AI_open_mic_dataset')



Mounted at /content/drive


## Creating the feature dataset

In [3]:
from transformers import AutoFeatureExtractor

# Feature extractor of the Whisper checkpoint; preprocess_function calls it to
# turn raw waveforms into model input features.
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from datasets import load_dataset, Audio

# Load the labelled audio dataset from the Hugging Face Hub.
humor = load_dataset("rishiA/humor_detection_1")

In [5]:
# Hold out 20% of the rows for testing. The seed is fixed so that the same rows
# land in each split on every run; an unseeded split changes between runs.
humor_ds = humor["train"].train_test_split(test_size=0.2, seed=42)

In [None]:
# Show the split names, columns and row counts.
humor_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 828
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 208
    })
})

In [6]:
# Declare the audio column as 16 kHz so decoded arrays are returned at that rate.
humor_ds = humor_ds.cast_column("audio", Audio(sampling_rate=16_000))
# Inspect one decoded training example (path, waveform array, sampling rate, label).
humor_ds["train"][0]

{'audio': {'path': 'RP_DP_audio_18.mp3',
  'array': array([0.        , 0.        , 0.        , ..., 0.00092481, 0.00092997,
         0.        ]),
  'sampling_rate': 16000},
 'label': 1}

In [7]:
def preprocess_function(examples, max_length=16000):
    """Convert one dataset row's decoded audio into feature-extractor inputs.

    Written for ``Dataset.map(..., batched=False)``: ``examples`` is a single
    row, so its waveform is wrapped in a one-element list before extraction.

    Args:
        examples: A row whose ``examples["audio"]["array"]`` holds the waveform.
        max_length: Maximum number of audio samples, forwarded to the feature
            extractor together with ``truncation=True``. Defaults to 16000,
            the value that used to be hard-coded here.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the
        one-element batch.
    """
    # NOTE(review): the audio column is cast to 16 kHz in this notebook, so the
    # default of 16000 samples keeps roughly the first second of each clip —
    # confirm that truncating this aggressively is intended.
    audio_arrays = [examples["audio"]["array"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
    )

In [8]:
# Run preprocess_function on every row individually (batched=False) and drop
# the raw "audio" column from the result.
encoded_humor_ds = humor_ds.map(preprocess_function, remove_columns="audio", batched=False)

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [9]:
# Inspect one encoded training row.
# NOTE(review): this prints the row's full nested feature list, which makes the
# cell output very long; consider showing only its shape.
encoded_humor_ds["train"][0]

{'label': 1,
 'input_features': [[[-1.5,
    -1.5,
    -1.5,
    -1.5,
    -1.3679521083831787,
    -0.9182568788528442,
    -0.6071164608001709,
    -0.5279613733291626,
    -0.7496684789657593,
    -0.653171181678772,
    -0.5175577402114868,
    -0.6888386011123657,
    -0.5477309226989746,
    -0.6805111169815063,
    -0.9303611516952515,
    -0.5320454835891724,
    -0.5208518505096436,
    -0.8019342422485352,
    -1.0304310321807861,
    -0.5617263317108154,
    -0.6391302347183228,
    -0.5253524780273438,
    -0.5286924839019775,
    -0.6436593532562256,
    -0.7281163930892944,
    -0.42167770862579346,
    -0.4186638593673706,
    -0.7899899482727051,
    -0.6870081424713135,
    -0.8434076309204102,
    -0.644523024559021,
    -0.5989179611206055,
    -0.6083259582519531,
    -0.7649253606796265,
    -1.010486125946045,
    -0.5199707746505737,
    -0.6696039438247681,
    -0.5403316020965576,
    -0.5760341882705688,
    -0.4746748208999634,
    -0.5428770780563354,
    -0

In [10]:
import numpy as np
# Shape of one row's features straight after feature extraction.
np.array(encoded_humor_ds["train"][0]["input_features"]).shape

(1, 80, 100)

In [34]:
# Python type in which the dataset hands back the stored features.
type(encoded_humor_ds["train"][0]["input_features"])

list

In [11]:
# Length that the last axis of every feature matrix is padded up to.
REQUIRED_LENGTH = 3000


def flatten(x):
    """Drop the leading batch axis of ``input_features`` and pad its last axis.

    The function is safe to apply more than once: the leading axis is removed
    only while the features are still 3-D, so rows that were already flattened
    pass through unchanged instead of losing a real axis.

    Args:
        x: A dataset row whose ``"input_features"`` entry is a nested
            list/array, either 3-D with a leading axis of size 1 or already 2-D.

    Returns:
        The same row object, with ``"input_features"`` replaced by a list of
        per-row arrays whose last axis has at least ``REQUIRED_LENGTH`` entries.
        Inputs that are already longer are not truncated.
    """
    features = np.asarray(x["input_features"])
    if features.ndim == 3:
        features = features[0]
    padding_length = REQUIRED_LENGTH - features.shape[-1]
    if padding_length > 0:
        # NOTE(review): pads with 0.0; confirm this matches the values the
        # feature extractor itself would produce for missing audio.
        features = np.pad(features, ((0, 0), (0, padding_length)), 'constant')
    x["input_features"] = list(features)
    return x
# Apply flatten to every row of each split.
# NOTE(review): the result is bound to the same name, so re-running this cell
# applies flatten to rows that were already processed.
encoded_humor_ds = encoded_humor_ds.map(flatten)

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [None]:
def pad_feature(x):
    """Unfinished placeholder kept so the name stays defined.

    The original cell contained only the ``def`` line, which is a syntax error
    and stops "Run All". Padding is currently performed inside ``flatten``.

    Raises:
        NotImplementedError: Always, because no padding logic was ever written
            for this function.
    """
    raise NotImplementedError("pad_feature was never implemented; padding is done in flatten")


In [20]:
import numpy as np

In [12]:
# Shape of one row's features after flatten has been applied.
np.array(encoded_humor_ds["train"][0]["input_features"]).shape

(80, 3000)

In [13]:
# Upload the processed splits to the Hugging Face Hub under this dataset name.
encoded_humor_ds.push_to_hub("encoded_humor_detection_3")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/438 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rishiA/encoded_humor_detection_3/commit/a3a9058ac1ecd72109a3d1996c486b5ef296bcb8', commit_message='Upload dataset', commit_description='', oid='a3a9058ac1ecd72109a3d1996c486b5ef296bcb8', pr_url=None, pr_revision=None, pr_num=None)

## Start Training Here

In [2]:
from datasets import load_dataset, Audio

# Download the preprocessed dataset from the Hub, so training can start here
# without recomputing the features.
encoded_humor = load_dataset("rishiA/encoded_humor_detection_3")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/444 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/828 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/208 [00:00<?, ? examples/s]

In [3]:
# Split the downloaded "train" split 80/20 for training and evaluation. The
# seed is fixed so that the split, and therefore the reported metrics, can be
# reproduced across runs.
# NOTE(review): the downloaded dataset already ships its own "test" split,
# which is not used here — confirm that re-splitting "train" is intended.
encoded_humor_ds = encoded_humor["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
from transformers import AutoFeatureExtractor

# Feature extractor of the same Whisper checkpoint; it is handed to the trainer
# below through its `tokenizer` argument.
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v2")

In [5]:
import evaluate

# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [6]:
import numpy as np


def compute_metrics(eval_pred):
    """Score evaluation outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every row.
    """
    logits = eval_pred.predictions
    gold_labels = eval_pred.label_ids
    # The predicted class is the column index holding the largest score.
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=gold_labels)

In [7]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Two output classes (label ids 0 and 1). The classification head on top of the
# Whisper checkpoint is newly initialised and is trained below.
num_labels = 2
model = AutoModelForAudioClassification.from_pretrained(
    "openai/whisper-large-v2", num_labels=num_labels
)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-large-v2 and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import torch
import torch.nn.functional as F
from torch import nn
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Args:
        *args: Forwarded unchanged to ``Trainer.__init__``.
        class_weights: One weight per class index, passed as the ``weight``
            argument of ``F.cross_entropy``. Defaults to ``(7.0, 1.0)``, the
            values that used to be hard-coded.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=(7.0, 1.0), **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = list(class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains the ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass it to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Build the weight tensor on the logits' device and in their dtype so
        # cross_entropy never sees mismatched devices or precisions.
        class_weights = torch.tensor(
            self.class_weights, dtype=logits.dtype, device=logits.device
        )

        # Flatten to (num_examples, num_classes) vs. (num_examples,) before scoring.
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)), labels.view(-1), weight=class_weights
        )

        return (loss, outputs) if return_outputs else loss

In [19]:
# Show the columns and row count of the training split.
encoded_humor_ds["train"]

Dataset({
    features: ['label', 'input_features'],
    num_rows: 828
})

In [None]:
# Training configuration. Evaluation and checkpoint saving both happen once per
# epoch, which load_best_model_at_end needs in order to restore the best epoch.
training_args = TrainingArguments(
    output_dir="humor_model_v4",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,  # 32 * 4 = 128 examples per optimizer step per device
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # presumably the key produced by compute_metrics
    push_to_hub=True,
)

# CustomTrainer supplies the class-weighted loss.
# NOTE(review): newer transformers releases name the `tokenizer` argument
# `processing_class` — confirm against the installed version.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_humor_ds["train"],
    eval_dataset=encoded_humor_ds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Long-running: fine-tunes the model for the configured number of epochs.
trainer.train()

In [None]:
# Upload the trained model and its training artefacts to the Hugging Face Hub.
trainer.push_to_hub()

events.out.tfevents.1716680628.3990fc60c73d.12088.3:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rishiA/humor_model_v1/commit/105da097b17edb869c28da2a4343d881b5dfa9d8', commit_message='End of training', commit_description='', oid='105da097b17edb869c28da2a4343d881b5dfa9d8', pr_url=None, pr_revision=None, pr_num=None)

# Using the trained model

In [None]:
from transformers import pipeline

# Load the fine-tuned model from the Hub as an audio-classification pipeline.
classifier = pipeline("audio-classification", model="rishiA/humor_model_v4")

config.json:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

[{'score': 0.0801304280757904, 'label': 'card_issues'},
 {'score': 0.07759815454483032, 'label': 'balance'},
 {'score': 0.0756552666425705, 'label': 'pay_bill'},
 {'score': 0.07464209198951721, 'label': 'high_value_payment'},
 {'score': 0.07453717291355133, 'label': 'cash_deposit'}]

In [None]:
from datasets import load_dataset, Audio

# Reload the raw dataset and rebuild an 80/20 split for evaluation. The seed is
# fixed so the evaluated rows are the same on every run.
# NOTE(review): unless this split matches the one the model was trained with,
# rows in "test" here may have been seen during training — confirm before
# reporting the resulting accuracy as held-out performance.
humor = load_dataset("rishiA/humor_detection_1")
humor_ds = humor["train"].train_test_split(test_size=0.2, seed=42)
humor_ds["test"]

In [None]:
# Imported here so the cell does not depend on the Drive-mount cell having run.
import os

# Evaluate the pipeline on at most MAX_TEST_CASES examples of the test split.
MAX_TEST_CASES = 50
FUNNY_DIR = '/content/drive/Shareddrives/CS224s/data/AI_open_mic_dataset/funny_audio_mp3/'
UNFUNNY_DIR = '/content/drive/Shareddrives/CS224s/data/AI_open_mic_dataset/unfunny_audio_mp3/'

success = 0
failure = 0
funny_ones = 0
unfunny_ones = 0
test_cases = 0
for example in humor_ds["test"]:
    # Check the limit before counting, so exactly MAX_TEST_CASES examples are
    # evaluated (counting first stopped one example early).
    if test_cases == MAX_TEST_CASES:
        break
    test_cases += 1

    short_path = example["audio"]["path"]
    truth_label = example["label"]

    # A clip is looked up in the funny folder first; if it is not there it is
    # assumed to live in the unfunny folder.
    funny_path = FUNNY_DIR + short_path
    if os.path.exists(funny_path):
        result = classifier(funny_path)
        funny_ones += 1
    else:
        result = classifier(UNFUNNY_DIR + short_path)
        unfunny_ones += 1

    # Predict 1 only when LABEL_1 scores strictly higher than LABEL_0.
    scores = {entry["label"]: entry["score"] for entry in result}
    pred_label = 1 if scores.get("LABEL_1", 0.0) > scores.get("LABEL_0", 0.0) else 0

    if pred_label == truth_label:
        success += 1
    else:
        failure += 1

In [None]:
# Share of evaluated clips whose predicted label matched the ground truth.
evaluated_total = success + failure
print(f"success rate is {success/evaluated_total}")

In [None]:
# Report the number of clips actually evaluated instead of a hard-coded count,
# so the message cannot disagree with the loop.
print(f"among the {funny_ones + unfunny_ones} test cases, {funny_ones} are funny ones, and {unfunny_ones} are unfunny ones")