# Data Access and Transformation for Speech Classification

This notebook loads a cleft-palate speech dataset and trains classifiers to identify hypernasality in speech.

## 1. Importing Necessary Libraries

In [None]:
!pip install datasets
import datasets
import pandas as pd
import os
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import WhisperModel, WhisperFeatureExtractor
from datasets import load_from_disk
import torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

## 2. Loading the Data

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pwd #check where I am

/content


In [None]:
# `!cd` runs in a throwaway subshell and cannot change the notebook's working
# directory, so change it in the kernel process itself. The relative paths
# used below (e.g. "train_dataset", "train.csv") depend on this.
os.chdir('/content/drive/MyDrive/vandy 24fall/Transformer/public_samples')
print(os.getcwd())

/content/drive/MyDrive/vandy 24fall/Transformer/public_samples


In [None]:
!ls #check what do I have

CASES	     CONTROLS	   test_dataset      train_dataset	val_dataset.zip
CASES_WAV    CONTROLS_WAV  test_dataset.zip  train_dataset.zip	whisper_best_model.pt
catalog.csv  test.csv	   train.csv	     val_dataset


In [None]:
# Load the three saved splits; the paths are relative to the current working directory.
train_audio_dataset = load_from_disk("train_dataset")
test_audio_dataset = load_from_disk("test_dataset")
val_audio_dataset = load_from_disk("val_dataset")  # plain string: the f-prefix had no placeholders

In [None]:
# Explore the data by printing the first training example.
# Each record holds an 'audio' dict and a 'labels' value. The 'audio' dict contains:
#   1) path: the name/path of the audio file,
#   2) array: a numpy array with the raw waveform (one sampled amplitude per value),
#   3) sampling_rate: the sampling rate of the audio signal.
print(train_audio_dataset[0])

{'audio': {'path': 'ACPA Santa came home since the snow fell.wav', 'array': array([-9.34825897e-11, -2.67201782e-11, -8.02570579e-11, ...,
       -5.39624239e-07,  4.85038470e-07,  0.00000000e+00]), 'sampling_rate': 16000}, 'labels': 0.0}


# Training

In [None]:
!pip install torch
!pip install datasets
!pip install librosa
!pip install transformers



In [None]:
# import libraries
import datasets
from datasets import load_dataset, DatasetDict,  Audio, load_from_disk
import pandas as pd
import os
import glob
import librosa
import io
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report, accuracy_score

## Baseline
- Support Vector Machine (SVM) and Random Forest (RF) models serve as baselines for the transformer (Whisper) models trained below.
- References: https://medium.com/@mujtabaraza194/voice-classification-using-mfcc-features-and-deep-neural-networks-a-step-by-step-guide-296670ae1e79

### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [None]:
!pwd

/content/drive/MyDrive/vandy 24fall/Transformer/public_samples


In [None]:
!ls

CASES	     CONTROLS	   test_dataset      train_dataset	val_dataset.zip
CASES_WAV    CONTROLS_WAV  test_dataset.zip  train_dataset.zip	whisper_best_model.pt
catalog.csv  test.csv	   train.csv	     val_dataset


In [None]:
# Read the train/test catalogs; both paths are relative to the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
# train set: pair every clip's folder with its file name to build the relative path
train_folder = train_df["WAV_folder"].tolist()
train_files = train_df["WAV_filename"].tolist()

train_full_paths = [
    os.path.join(folder, filename)
    for folder, filename in zip(train_folder, train_files)
]

# one hypernasality label per clip, in the same row order as the paths
train_labels = train_df["hypernasality"].tolist()

In [None]:
# test set: pair every clip's folder with its file name to build the relative path
test_folder = test_df["WAV_folder"].tolist()
test_files = test_df["WAV_filename"].tolist()

test_full_paths = [
    os.path.join(folder, filename)
    for folder, filename in zip(test_folder, test_files)
]

# one hypernasality label per clip, in the same row order as the paths
test_labels = test_df["hypernasality"].tolist()

In [None]:
# Define a function to extract MFCCs from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    """Summarise one audio file as a fixed-length MFCC vector.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute per frame.

    Returns:
        The MFCC matrix averaged over its time axis, i.e. one mean value
        per coefficient.
    """
    # sr=None keeps the file's own sampling rate instead of resampling
    waveform, native_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=waveform, sr=native_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then average them away
    time_averaged = np.mean(coefficients.T, axis=0)
    return time_averaged

# Paths to your audio files
audio_files = train_full_paths + test_full_paths  # Add more paths as needed
labels = train_labels + test_labels  # Corresponding labels for your audio files

# Extract features from each audio file
features = [extract_mfcc_features(file) for file in audio_files]

# Split the dataset: 30% is held out for testing, then 30% of the remainder for validation.
# Naming convention: lower-case x_* hold the raw MFCC features, upper-case X_* hold
# the standardized copies produced by the scaler below.
x_trainval, x_test, y_trainval, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_trainval, y_trainval, test_size=0.3, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on the training split only and then applied to every split,
# so the model never sees training and evaluation data on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_val = scaler.transform(x_val)
X_test = scaler.transform(x_test)

# Initialize and train the SVM classifier on the standardized features
svm_model = SVC(kernel='linear')  # You can experiment with different kernels
svm_model.fit(X_train, y_train)

# Predictions on the (standardized) validation split
y_pred = svm_model.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred))

CASES_WAV/ACPA ted had a dog with white feet-3.wav
CONTROLS_WAV/cdc 4 (and then go to school).wav
CONTROLS_WAV/Video 1_4 (and can I have some more material).wav
CONTROLS_WAV/NEW - video 2 (three times).wav
CONTROLS_WAV/cdc 4 (and then he brushed his teeth).wav
CONTROLS_WAV/NEW - video 2 (no they dont talk).wav
CONTROLS_WAV/cdc 2 (I said thank you ray).wav
CASES_WAV/Video 1_9 (pa, pa, pa).wav
CONTROLS_WAV/ACPA We shouldn_t play in the street.wav
CONTROLS_WAV/ACPA look at this book with us.wav
CASES_WAV/Facebook  (pick up the pie).wav
CASES_WAV/ACPA Tom had ham and eggs for breakfast.wav
CASES_WAV/Video 6_7 (buy baby a bib).wav
CONTROLS_WAV/video 1 (puppy are you ready_).wav
CONTROLS_WAV/Video 1_18 (pretend it stops running when the car is going).wav
CONTROLS_WAV/NEW - video 2 (bugs and spiders, I protect).wav
CONTROLS_WAV/video 1 (yes I am going to give him coffee for me and him).wav
CASES_WAV/Video 3_6 (muddy 2).wav
CONTROLS_WAV/cdc 4 (and then he was a boy).wav
CONTROLS_WAV/NEW - vide

In [None]:
# Evaluate the model on the test set
# NOTE(review): X_test holds the standardized test features (output of scaler.transform),
# so these scores are only meaningful if svm_model was fit on standardized features too.
test_predictions = svm_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_predictions))
print("Test Classification Report:", classification_report(y_test, test_predictions))

Test Accuracy: 0.5892857142857143
Test Classification Report:               precision    recall  f1-score   support

         0.0       0.59      1.00      0.74        33
         1.0       0.00      0.00      0.00        23

    accuracy                           0.59        56
   macro avg       0.29      0.50      0.37        56
weighted avg       0.35      0.59      0.44        56



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Initialize and train the Random Forest classifier on the raw (unscaled) MFCC features.
# random_state is fixed (same seed as the data splits) so the reported scores are reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees
rf_model.fit(x_train, y_train)

# Make predictions - VAL
y_pred = rf_model.predict(x_val)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred))

Accuracy: 0.9487179487179487
Classification Report:               precision    recall  f1-score   support

         0.0       1.00      0.89      0.94        19
         1.0       0.91      1.00      0.95        20

    accuracy                           0.95        39
   macro avg       0.95      0.95      0.95        39
weighted avg       0.95      0.95      0.95        39



In [None]:
# Make predictions on the test set.
# rf_model was fit on the raw features in x_train, so it must be scored on the raw
# test features (x_test), not on the standardized copy X_test.
test_predictions = rf_model.predict(x_test)

# Evaluate the model on the test set
print("Test Accuracy:", accuracy_score(y_test, test_predictions))
print("Test Classification Report:", classification_report(y_test, test_predictions))

Test Accuracy: 0.6071428571428571
Test Classification Report:               precision    recall  f1-score   support

         0.0       0.61      0.94      0.74        33
         1.0       0.60      0.13      0.21        23

    accuracy                           0.61        56
   macro avg       0.60      0.53      0.48        56
weighted avg       0.60      0.61      0.52        56



## Whisper


### Whisper-base

In [None]:
# Checkpoint identifier shared by the feature extractor and the model
model_checkpoint = "openai/whisper-base"

# Turns raw waveforms into the input features the model consumes
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
# NOTE(review): despite its name, `encoder` is a full `WhisperModel`; it is later called
# with `decoder_input_ids`, so it is not used as an encoder only — confirm the naming.
encoder = WhisperModel.from_pretrained(model_checkpoint)
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

#### Train

In [None]:
class SpeechClassifier(nn.Module):
    """Backbone model followed by a feed-forward classification head.

    The head is a stack of Linear+ReLU layers of widths 4096, 2048, 1024 and
    512, ending in a Linear layer with ``num_labels`` outputs.

    Args:
        num_labels: Number of output classes (size of the logit vector).
        encoder: Backbone module. It must expose ``config.hidden_size`` and,
            when called with ``input_features`` and ``decoder_input_ids``,
            return a mapping containing ``'last_hidden_state'``.
    """

    def __init__(self, num_labels, encoder):
        super().__init__()
        self.encoder = encoder

        # Layer widths from the backbone's hidden size down to the last hidden layer
        widths = [self.encoder.config.hidden_size, 4096, 2048, 1024, 512]
        head_layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head_layers.append(nn.Linear(fan_in, fan_out))
            head_layers.append(nn.ReLU())
        head_layers.append(nn.Linear(widths[-1], num_labels))

        # Same layer order as before, so state-dict keys (classifier.0, .2, ...) are unchanged
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_features, decoder_input_ids):
        """Return class logits computed from the first position of the last hidden state."""
        backbone_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        first_position = backbone_out['last_hidden_state'][:, 0, :]
        return self.classifier(first_position)

# Binary task: two output classes
num_labels = 2

model = SpeechClassifier(num_labels, encoder).to(device)
# Use PyTorch's AdamW: the `transformers.AdamW` implementation is deprecated.
# weight_decay=0.0 is passed explicitly because torch's default is 0.01, whereas the
# previously used implementation defaulted to 0.0.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()


# Define the training function
def train(model, train_loader, val_loader, optimizer,  criterion, device, num_epochs,
          checkpoint_path='whisper_best_model.pt'):
    """Fine-tune ``model`` and keep the checkpoint with the best validation accuracy.

    Args:
        model: Module called as ``model(input_features, decoder_input_ids)`` that returns logits.
        train_loader: Iterable of ``(input_features, decoder_input_ids, labels)`` batches.
        val_loader: Loader passed to ``evaluate`` after every epoch.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        checkpoint_path: File the best ``state_dict`` is saved to.

    Returns:
        None. Progress is printed and the best weights are written to ``checkpoint_path``.
    """
    best_accuracy = 0.0

    for epoch in range(num_epochs):

        # evaluate() may leave the model in eval mode, so re-enable training each epoch
        model.train()

        for i, batch in enumerate(train_loader):

            input_features, decoder_input_ids, labels = batch

            # Each dataset item carries a leading dimension of size 1, so the collated
            # batch has an extra axis at position 1. Squeeze only that axis: a bare
            # .squeeze() would also drop the batch axis when a batch has a single sample.
            input_features = input_features.squeeze(1).to(device)
            decoder_input_ids = decoder_input_ids.squeeze(1).to(device)

            # CrossEntropyLoss needs integer class indices
            labels = labels.view(-1).long().to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()

            # Log the current batch loss every 8 batches
            if (i+1) % 8 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item() :.4f}')

        val_loss, val_accuracy, val_f1, _ , _ = evaluate(model, val_loader, device)

        # Only a strict improvement overwrites the saved checkpoint
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), checkpoint_path)

        print("========================================================================================")
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}, Best Accuracy: {best_accuracy:.4f}')
        print("========================================================================================")

def evaluate(model, data_loader,  device, loss_fn=None):
    """Score ``model`` on every batch of ``data_loader`` without updating it.

    Args:
        model: Module called as ``model(input_features, decoder_input_ids)`` that returns logits.
        data_loader: Sized iterable of ``(input_features, decoder_input_ids, labels)`` batches.
        device: Device the batches are moved to.
        loss_fn: Loss applied to ``(logits, labels)``. Defaults to the module-level
            ``criterion`` so existing three-argument calls behave as before.

    Returns:
        Tuple ``(loss, accuracy, f1, all_labels, all_preds)``: the mean per-batch loss,
        accuracy, macro-averaged F1, and the concatenated label and prediction arrays.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = criterion

    all_labels = []
    all_preds = []
    total_loss = 0.0

    # Put dropout/normalisation layers in inference mode; callers that keep
    # training are expected to call model.train() again.
    model.eval()

    with torch.no_grad():

        for batch in data_loader:

            input_features, decoder_input_ids, labels = batch

            # Squeeze only axis 1; a bare .squeeze() would also drop the batch axis
            # when a batch contains a single sample.
            input_features = input_features.squeeze(1).to(device)
            decoder_input_ids = decoder_input_ids.squeeze(1).to(device)

            # The loss needs integer class indices
            labels = labels.view(-1).long().to(device)

            logits = model(input_features, decoder_input_ids)

            total_loss += loss_fn(logits, labels).item()

            # Predicted class = index of the largest logit
            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    if not all_labels:
        raise ValueError("evaluate() received an empty data_loader")

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds




In [None]:
class SpeechClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(input_features, decoder_input_ids, label)`` triples.

    Args:
        audio_data: Indexable collection whose items look like
            ``{"audio": {"array": ..., "sampling_rate": ...}, "labels": ...}``.
        text_processor: Callable applied to the raw waveform; its result must expose
            ``input_features``.
        decoder_start_token_id: Token id used to fill the two-token decoder prompt.
            When omitted, it is read from the module-level ``encoder.config`` at
            access time, as before.
    """

    def __init__(self, audio_data,  text_processor, decoder_start_token_id=None):
        self.audio_data = audio_data
        self.text_processor = text_processor
        self.decoder_start_token_id = decoder_start_token_id

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):
        # Fetch the record once: the original indexed the dataset three times per item,
        # and each lookup may decode the audio again.
        record = self.audio_data[index]
        audio = record["audio"]

        inputs = self.text_processor(audio["array"],
                                     return_tensors="pt",
                                     sampling_rate=audio["sampling_rate"])
        input_features = inputs.input_features

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = encoder.config.decoder_start_token_id
        # Decoder prompt: shape (1, 2), both positions set to the start token id
        decoder_input_ids = torch.tensor([[1, 1]]) * start_token_id

        labels = np.array(record['labels'])

        return input_features, decoder_input_ids, torch.tensor(labels)


# Number of examples per batch for every loader
batch_size = 8

# Wrap each saved split so items are converted to model inputs on access
train_dataset = SpeechClassificationDataset(audio_data=train_audio_dataset, text_processor=feature_extractor)
test_dataset = SpeechClassificationDataset(audio_data=test_audio_dataset, text_processor=feature_extractor)
val_dataset = SpeechClassificationDataset(audio_data=val_audio_dataset, text_processor=feature_extractor)

# Only the training loader shuffles; validation and test keep their stored order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5, Batch 8/13, Train Loss: 0.6454
Epoch 1/5, Val Loss: 0.5433, Val Accuracy: 0.8444, Val F1: 0.8394, Best Accuracy: 0.8444
Epoch 2/5, Batch 8/13, Train Loss: 0.1197
Epoch 2/5, Val Loss: 0.4785, Val Accuracy: 0.8222, Val F1: 0.8221, Best Accuracy: 0.8444
Epoch 3/5, Batch 8/13, Train Loss: 0.0072
Epoch 3/5, Val Loss: 0.4797, Val Accuracy: 0.8222, Val F1: 0.8221, Best Accuracy: 0.8444
Epoch 4/5, Batch 8/13, Train Loss: 0.0307
Epoch 4/5, Val Loss: 0.5107, Val Accuracy: 0.8444, Val F1: 0.8441, Best Accuracy: 0.8444
Epoch 5/5, Batch 8/13, Train Loss: 0.0013
Epoch 5/5, Val Loss: 0.6181, Val Accuracy: 0.8444, Val F1: 0.8441, Best Accuracy: 0.8444


#### Evaluate

In [None]:
#VALIDATION
# map_location lets a checkpoint saved on a GPU load on whatever `device` is in use now.
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict contains.
state_dict = torch.load('whisper_best_model.pt', map_location=device, weights_only=True)

# Create a new instance of the model and load the state dictionary
num_labels = 2
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)
model.eval()  # inference only from here on

_, _, _, all_labels, all_preds = evaluate(model, val_loader, device)

#VALIDATION
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

  state_dict = torch.load('whisper_best_model.pt')


              precision    recall  f1-score   support

           0       1.00      0.68      0.81        22
           1       0.77      1.00      0.87        23

    accuracy                           0.84        45
   macro avg       0.88      0.84      0.84        45
weighted avg       0.88      0.84      0.84        45

0.8444444444444444


#### Test

In [None]:
#TEST
# Evaluation on test data
# `model` is the instance created in the validation cell with the saved checkpoint loaded;
# `all_labels` / `all_preds` are overwritten with the test-set values here.
_, _, _, all_labels, all_preds = evaluate(model, test_loader, device)

# Generate test results
print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

              precision    recall  f1-score   support

           0       0.89      0.44      0.59        18
           1       0.64      0.95      0.77        19

    accuracy                           0.70        37
   macro avg       0.77      0.70      0.68        37
weighted avg       0.76      0.70      0.68        37

0.7027027027027027


Additional training runs with other Whisper model sizes are available in separate notebooks.