In [1]:
!pip install torch
!pip install datasets
!pip install librosa
!pip install transformers

# import libraries
import datasets
from datasets import load_dataset, DatasetDict,  Audio
import pandas as pd
import os
import glob
import librosa
import io
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, load_from_disk



In [2]:
# Mount Google Drive at /content/drive (Colab-only: uses the google.colab helper).

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd '/content/drive/MyDrive/vandy 24fall/Transformer/public_samples'

/content/drive/MyDrive/vandy 24fall/Transformer/public_samples


In [4]:
# List the contents of the current working directory.
!ls

CASES	      test.csv		train_dataset.zip	     whisper_small_best_model.pt
CASES_WAV     test_dataset	val_dataset		     whisper_tiny_best_model.pt
catalog.csv   test_dataset.zip	val_dataset.zip
CONTROLS      train.csv		whisper_best_model.pt
CONTROLS_WAV  train_dataset	whisper_large_best_model.pt


In [5]:
# Load the three pre-built splits from disk. The paths are relative, so the
# working directory must be the folder that contains these dataset directories.
train_audio_dataset = load_from_disk("train_dataset")
test_audio_dataset = load_from_disk("test_dataset")
val_audio_dataset = load_from_disk("val_dataset")

In [6]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One checkpoint name drives both the audio feature extractor and the backbone.
model_checkpoint = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
class SpeechClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns raw audio rows into model inputs.

    Args:
        audio_data: Indexable collection whose rows expose
            ``row["audio"]["array"]``, ``row["audio"]["sampling_rate"]`` and
            ``row["labels"]``.
        text_processor: Callable feature extractor (despite the name, it is
            applied to audio). It is called as
            ``text_processor(array, return_tensors="pt", sampling_rate=...)``
            and must return an object with an ``input_features`` attribute.
        decoder_start_token_id: Token id used to fill the two-token dummy
            decoder prompt. When omitted, it is read from the notebook-level
            ``encoder.config.decoder_start_token_id``.

    Each item is a tuple ``(input_features, decoder_input_ids, label)`` where
    ``decoder_input_ids`` has shape ``(1, 2)``.
    """

    def __init__(self, audio_data, text_processor, decoder_start_token_id=None):
        self.audio_data = audio_data
        self.text_processor = text_processor
        if decoder_start_token_id is None:
            # Backward-compatible default: use the globally loaded backbone.
            decoder_start_token_id = encoder.config.decoder_start_token_id
        self.decoder_start_token_id = decoder_start_token_id

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        # Fetch the row exactly once: every index into the underlying dataset
        # may decode the audio again, so repeated lookups are wasted work.
        row = self.audio_data[index]
        audio = row["audio"]

        inputs = self.text_processor(audio["array"],
                                     return_tensors="pt",
                                     sampling_rate=audio["sampling_rate"])
        input_features = inputs.input_features

        # Dummy decoder prompt: two copies of the decoder start token.
        decoder_input_ids = torch.tensor([[1, 1]]) * self.decoder_start_token_id

        labels = torch.tensor(np.array(row["labels"]))

        return input_features, decoder_input_ids, labels


In [8]:
# Wrap every split in the torch dataset that applies the feature extractor
# lazily, one item at a time.
train_dataset, val_dataset, test_dataset = (
    SpeechClassificationDataset(split, feature_extractor)
    for split in (train_audio_dataset, val_audio_dataset, test_audio_dataset)
)

batch_size = 8

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=batch_size, shuffle=False)
    for held_out in (val_dataset, test_dataset)
)

# Train

In [9]:
class SpeechClassifier(nn.Module):
    """Backbone model followed by a fully connected classification head.

    The head maps ``encoder.config.hidden_size`` -> 4096 -> 2048 -> 1024 ->
    512 -> ``num_labels`` with a ReLU after every linear layer except the last.

    Args:
        num_labels: Number of output classes (width of the final layer).
        encoder: Callable backbone exposing ``config.hidden_size``; it is
            invoked as ``encoder(input_features, decoder_input_ids=...)`` and
            its result must be indexable by ``'last_hidden_state'``.
    """

    def __init__(self, num_labels, encoder):
        super().__init__()
        self.encoder = encoder

        # Layer widths of the head, starting from the backbone's hidden size.
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        head_layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head_layers.append(nn.Linear(fan_in, fan_out))
            head_layers.append(nn.ReLU())
        head_layers.append(nn.Linear(widths[-1], num_labels))

        # Same layer order as a hand-written Sequential, so parameter names
        # (classifier.0, classifier.2, ...) stay compatible with checkpoints.
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_features, decoder_input_ids):
        """Return class logits of shape ``(batch, num_labels)``."""
        backbone_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        # The hidden state at position 0 serves as the pooled representation.
        first_position_state = backbone_out['last_hidden_state'][:, 0, :]
        return self.classifier(first_position_state)


In [10]:
num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)

# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
# weight_decay=0.0 is passed explicitly because that was the default of the
# transformers implementation, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.0
)
criterion = nn.CrossEntropyLoss()



In [11]:
# Define the training function
def train(model, train_loader, val_loader, optimizer,  criterion, device, num_epochs,
          checkpoint_path='whisper_small_best_model.pt', log_every=8):
    """Fine-tune ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Module called as ``model(input_features, decoder_input_ids)``
            that returns class logits.
        train_loader: Iterable of ``(input_features, decoder_input_ids, labels)``
            batches; must support ``len()`` for the progress message.
        val_loader: Batches passed to ``evaluate`` after every epoch.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        checkpoint_path: File the best ``state_dict`` is written to.
        log_every: Print the running batch loss every this many batches.

    Returns:
        None. Progress is printed and the best weights are saved to disk.
    """
    best_accuracy = 0.0

    for epoch in range(num_epochs):

        # Re-enable training mode at the start of every epoch, since the
        # evaluation step may have changed the mode of the model.
        model.train()

        for i, batch in enumerate(train_loader):

            input_features, decoder_input_ids, labels = batch

            # Drop only dimension 1 (assumed to be the singleton dimension
            # each dataset item carries). A bare squeeze() would also remove
            # the batch dimension whenever a batch holds a single sample.
            input_features = input_features.squeeze(1).to(device)
            decoder_input_ids = decoder_input_ids.squeeze(1).to(device)

            labels = labels.view(-1).long().to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()

            if (i+1) % log_every == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item() :.4f}')

        val_loss, val_accuracy, val_f1, _ , _ = evaluate(model, val_loader, device)

        # Keep only the weights of the best epoch so far (strict improvement).
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), checkpoint_path)

        print("========================================================================================")
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}, Best Accuracy: {best_accuracy:.4f}')
        print("========================================================================================")

In [12]:
def evaluate(model, data_loader,  device, loss_fn=None):
    """Run ``model`` over ``data_loader`` without gradients and score the predictions.

    Args:
        model: Module called as ``model(input_features, decoder_input_ids)``
            that returns class logits.
        data_loader: Iterable of ``(input_features, decoder_input_ids, labels)``
            batches; must support ``len()``.
        device: Device the batches are moved to.
        loss_fn: Loss called as ``loss_fn(logits, labels)``. When omitted, the
            notebook-level ``criterion`` is used.

    Returns:
        Tuple ``(loss, accuracy, f1, all_labels, all_preds)``: the mean
        per-batch loss, the accuracy, the macro-averaged F1 score, and the
        concatenated label and prediction arrays.

    Note:
        The model is switched to eval mode and left in it; callers that keep
        training must call ``model.train()`` again.
    """
    if loss_fn is None:
        # Backward-compatible default: the loss defined at notebook level.
        loss_fn = criterion

    # Inference mode for layers whose behaviour differs between train and eval.
    model.eval()

    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():

        for batch in data_loader:

            input_features, decoder_input_ids, labels = batch

            # Drop only dimension 1 (assumed to be the singleton dimension
            # each dataset item carries). A bare squeeze() would also remove
            # the batch dimension whenever a batch holds a single sample.
            input_features = input_features.squeeze(1).to(device)
            decoder_input_ids = decoder_input_ids.squeeze(1).to(device)

            labels = labels.view(-1).long().to(device)

            logits = model(input_features, decoder_input_ids)

            total_loss += loss_fn(logits, labels).item()

            # Predicted class = index of the largest logit.
            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds


In [13]:
import librosa
# Fine-tune for a fixed number of epochs. train() evaluates on the validation
# loader after every epoch and saves the weights whenever accuracy improves.
num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5, Batch 8/13, Train Loss: 0.5684
Epoch 1/5, Val Loss: 0.3357, Val Accuracy: 0.8889, Val F1: 0.8889, Best Accuracy: 0.8889
Epoch 2/5, Batch 8/13, Train Loss: 0.2829
Epoch 2/5, Val Loss: 0.9989, Val Accuracy: 0.8000, Val F1: 0.7935, Best Accuracy: 0.8889
Epoch 3/5, Batch 8/13, Train Loss: 0.0031
Epoch 3/5, Val Loss: 1.5285, Val Accuracy: 0.6444, Val F1: 0.5853, Best Accuracy: 0.8889
Epoch 4/5, Batch 8/13, Train Loss: 0.0184
Epoch 4/5, Val Loss: 0.5063, Val Accuracy: 0.8444, Val F1: 0.8416, Best Accuracy: 0.8889
Epoch 5/5, Batch 8/13, Train Loss: 0.0064
Epoch 5/5, Val Loss: 0.3529, Val Accuracy: 0.8667, Val F1: 0.8666, Best Accuracy: 0.8889


In [14]:
#VALIDATION
# Restore the checkpoint with the best validation accuracy. map_location lets
# a checkpoint saved on the GPU load on a CPU-only runtime, and
# weights_only=True restricts unpickling to tensors (this also silences the
# FutureWarning that torch.load otherwise emits).
state_dict = torch.load('whisper_small_best_model.pt', map_location=device, weights_only=True)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)
# Switch to inference mode before scoring (affects layers such as dropout).
model.eval()

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

# Per-class precision/recall/F1, followed by the overall accuracy.
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

  state_dict = torch.load('whisper_small_best_model.pt')


              precision    recall  f1-score   support

           0       0.87      0.91      0.89        22
           1       0.91      0.87      0.89        23

    accuracy                           0.89        45
   macro avg       0.89      0.89      0.89        45
weighted avg       0.89      0.89      0.89        45

0.8888888888888888


In [15]:
#TEST
# Evaluation on test data, using the `model` that the validation cell restored from the saved checkpoint
_, _, _, all_labels, all_preds = evaluate(model, test_loader, device)

# Generate test results: the per-class report, followed by the overall accuracy
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        18
           1       1.00      0.95      0.97        19

    accuracy                           0.97        37
   macro avg       0.97      0.97      0.97        37
weighted avg       0.97      0.97      0.97        37

0.972972972972973


In [18]:
import gradio as gr
import torch
import torchaudio
from transformers import WhisperFeatureExtractor, WhisperModel
import torch.nn as nn

# Define the SpeechClassifier model architecture
class SpeechClassifier(nn.Module):
    """Backbone model followed by a fully connected classification head.

    The head maps ``encoder.config.hidden_size`` -> 4096 -> 2048 -> 1024 ->
    512 -> ``num_labels`` with a ReLU after every linear layer except the last.

    Args:
        num_labels: Number of output classes (width of the final layer).
        encoder: Callable backbone exposing ``config.hidden_size``; it is
            invoked as ``encoder(input_features, decoder_input_ids=...)`` and
            its result must be indexable by ``'last_hidden_state'``.
    """

    def __init__(self, num_labels, encoder):
        super().__init__()
        self.encoder = encoder

        # Layer widths of the head, starting from the backbone's hidden size.
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        head_layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head_layers.append(nn.Linear(fan_in, fan_out))
            head_layers.append(nn.ReLU())
        head_layers.append(nn.Linear(widths[-1], num_labels))

        # Same layer order as a hand-written Sequential, so parameter names
        # (classifier.0, classifier.2, ...) stay compatible with checkpoints.
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_features, decoder_input_ids):
        """Return class logits of shape ``(batch, num_labels)``."""
        backbone_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        # The hidden state at position 0 serves as the pooled representation.
        first_position_state = backbone_out['last_hidden_state'][:, 0, :]
        return self.classifier(first_position_state)

# Load the trained model
model_checkpoint = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
num_labels = 2
model = SpeechClassifier(num_labels, encoder)

# Load the trained weights onto the CPU. weights_only=True restricts
# unpickling to tensors, which is all a state_dict needs, and avoids the
# FutureWarning that torch.load emits when the flag is left unset.
state_dict = torch.load(
    "whisper_small_best_model.pt",
    map_location=torch.device("cpu"),
    weights_only=True,
)
model.load_state_dict(state_dict)
# Inference only from here on.
model.eval()

# Define the prediction function
def predict_hypernasality(wav_file):
    """Classify one audio file as hypernasal or not.

    Args:
        wav_file: Path of the audio file to analyse (as handed over by the
            Gradio audio component), or ``None`` when nothing was uploaded.

    Returns:
        ``"Hypernasality Detected"`` when the predicted class is 1,
        ``"No Hypernasality Detected"`` otherwise, or a message starting with
        ``"Error processing the file"`` when the file is missing or cannot be
        processed. Errors are reported as text because the result is shown in
        a UI text box.
    """
    if not wav_file:
        return "Error processing the file: no audio file was provided"

    try:
        # Load and preprocess the audio file
        waveform, sample_rate = torchaudio.load(wav_file)

        # torchaudio yields (channels, samples). Average the channels so that
        # a stereo file is not mistaken for a batch of two separate clips by
        # the feature extractor.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        target_sample_rate = 16000
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        # Extract features from the 1-D mono signal
        mono_signal = waveform.reshape(-1).numpy()
        inputs = feature_extractor(mono_signal, sampling_rate=target_sample_rate, return_tensors="pt")
        input_features = inputs.input_features
        # Dummy decoder prompt: two copies of the decoder start token.
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

        # Predict
        with torch.no_grad():
            logits = model(input_features, decoder_input_ids)
            prediction = torch.argmax(logits, dim=1).item()

        # Map prediction to labels
        if prediction == 1:
            return "Hypernasality Detected"
        else:
            return "No Hypernasality Detected"

    except Exception as e:
        return f"Error processing the file: {e}"

# Define the Gradio interface
def build_gradio_interface():
    """Assemble the upload-and-predict UI and return it without launching it.

    The interface passes the uploaded file's path to
    ``predict_hypernasality`` and displays the returned text.
    """
    audio_input = gr.Audio(type="filepath", label="Upload WAV File")
    prediction_box = gr.Textbox(label="Prediction")

    return gr.Interface(
        fn=predict_hypernasality,
        inputs=audio_input,
        outputs=prediction_box,
        title="Hypernasality Detection",
        description="Upload a WAV file to detect whether it exhibits hypernasality.",
        allow_flagging="never",
    )

# Launch the Gradio app. In the recorded Colab run, Gradio enabled sharing
# automatically and served the app through a temporary public URL.
app = build_gradio_interface()
app.launch()


  state_dict = torch.load("whisper_small_best_model.pt", map_location=torch.device("cpu"))


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e906ba4e137bb92528.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


