In [None]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.4-py3-none-any.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.2/172.2 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.4 kt-legacy-1.0.4


In [None]:
!pip install deepspeech
import deepspeech


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepspeech
  Downloading deepspeech-0.9.3-cp39-cp39-manylinux1_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deepspeech
Successfully installed deepspeech-0.9.3


In [None]:
# Load packages
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping
import os
import pickle
import time
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from keras.utils import np_utils
from sklearn.metrics import confusion_matrix
#from google.cloud import speech_v1
#from google.cloud.speech_v1 import enums

  from kerastuner.tuners import RandomSearch


In [None]:
# Notebook-wide configuration: run settings, Drive mount, data/model locations

# Number of classes in the dataset
num_classes = 6

# Log directory name for this run: current Unix time in whole seconds
LOG_DIR = str(int(time.time()))

# Mount Google Drive so the paths below resolve
from google.colab import drive
drive.mount('/gdrive')

# Directory holding the audio files
data_dir = '/gdrive/MyDrive/VIVAE/full_set/'

# Files handed to deepspeech.Model(...) and enableExternalScorer(...)
model_dir = '/gdrive/MyDrive/VIVAE/deepspeech-0.9.3-models.pbmm'
model_score_dir = '/gdrive/MyDrive/VIVAE/deepspeech-0.9.3-models.scorer'

Mounted at /gdrive


In [None]:
# Phoneme symbol -> integer index; the index is the symbol's position in this list.
_PHONEME_SYMBOLS = [
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K',
    'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH',
]
phoneme_dict = {symbol: index for index, symbol in enumerate(_PHONEME_SYMBOLS)}

In [None]:
class AudioClassifier:
    """Train, tune and evaluate a dense network on fixed-size MFCC features.

    The class label of a clip is the second ``_``-separated field of its file
    name. After ``load_data`` the instance exposes ``X_train``, ``X_test``,
    ``y_train``, ``y_test``, ``label_map`` and ``transcripts``; after one of the
    ``search_hyperparameters*`` methods it also exposes ``model``.
    """

    N_MFCC = 20                    # MFCC coefficients computed per frame
    N_FRAMES = 20                  # frames kept per clip (zero-padded or truncated)
    NOISE_GAIN = 10 ** (-10 / 20)  # amplitude factor of the added noise (-10 dB)

    def __init__(self, data_path, LOG_DIR):
        """Store the training-data directory and the log directory name.

        Note: ``self.LOG_DIR`` is only stored; the search methods use their own
        ``LOG_DIR`` argument as the tuner directory.
        """
        self.data_path = data_path
        self.LOG_DIR = LOG_DIR

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _list_files(directory):
        """Return the regular files in ``directory`` as a sorted list of names."""
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    @classmethod
    def _extract_mfcc(cls, audio, sr):
        """Return an ``(N_MFCC, N_FRAMES)`` MFCC matrix for one clip."""
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=cls.N_MFCC)
        pad_width = cls.N_FRAMES - mfcc.shape[1]
        if pad_width > 0:
            # Short clip: zero-pad the frame axis up to N_FRAMES.
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        elif pad_width < 0:
            # Long clip: keep only the first N_FRAMES frames.
            mfcc = mfcc[:, :cls.N_FRAMES]
        return mfcc

    @staticmethod
    def _per_class_uncertainty(conf_matrix):
        """Return ``1 - correct/total`` for every row of a confusion matrix.

        A row with no samples contributes 0.
        """
        uncertainties = []
        for i in range(len(conf_matrix)):
            correct = conf_matrix[i, i]
            total = np.sum(conf_matrix[i])
            uncertainties.append(0 if total == 0 else 1 - (correct / total))
        return uncertainties

    @staticmethod
    def _early_stopping():
        """Build the EarlyStopping callback shared by all training runs."""
        return EarlyStopping(monitor='val_loss', patience=10, min_delta=0.01, verbose=1, mode='auto')

    def _report_test_performance(self, X_test, y_test, n_preview=20):
        """Print loss, accuracy and the first ``n_preview`` predicted/actual classes."""
        preds = self.model.predict(X_test)
        preds_classes = np.argmax(preds, axis=1)
        actual_classes = np.argmax(y_test, axis=1)

        score = self.model.evaluate(X_test, y_test, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

        print("Predictions:", preds_classes[:n_preview])
        print("Actual:", actual_classes[:n_preview])

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------
    def load_data(self, transcribe=True):
        """Load every clip in ``self.data_path`` and build the train/test split.

        For each file the audio is peak-normalised, mixed with Gaussian noise
        scaled by ``NOISE_GAIN`` and converted to a fixed-size MFCC matrix.
        The noise is applied to every clip, so the test split is noisy as well.

        Args:
            transcribe: when True (default) each clip is also run through
                DeepSpeech and the returned strings are stored, in file order,
                in ``self.transcripts``. They are not used as model features.

        Raises:
            ValueError: if ``self.data_path`` contains no files.
        """
        filenames = self._list_files(self.data_path)
        if not filenames:
            raise ValueError(f"No audio files found in {self.data_path!r}")

        # Load the speech model once instead of once per file.
        ds = None
        if transcribe:
            ds = deepspeech.Model(model_dir)
            ds.enableExternalScorer(model_score_dir)

        mfccs = []
        labels = []
        transcripts = []
        for filename in filenames:
            audio_path = os.path.join(self.data_path, filename)
            audio, sr1 = librosa.load(audio_path)

            # Extract the label from the filename
            label = filename.split("_")[1]

            # Peak-normalise; a silent clip is left untouched to avoid 0/0 -> NaN.
            peak = np.max(np.abs(audio))
            norm_audio = audio / peak if peak > 0 else audio
            noise = np.random.randn(len(audio))
            noise = noise / np.max(np.abs(noise))
            noisy_audio = norm_audio + self.NOISE_GAIN * noise

            mfccs.append(self._extract_mfcc(noisy_audio, sr1))
            labels.append(label)

            if ds is not None:
                # DeepSpeech expects 16-bit PCM at the model's own sample rate.
                audio_data, _ = librosa.load(audio_path, sr=ds.sampleRate())
                audio_data_int16 = (audio_data * np.iinfo(np.int16).max).astype(np.int16)
                transcripts.append(ds.stt(audio_data_int16))

        self.transcripts = transcripts

        # Features are the MFCC matrices: build_model() flattens a 3-D array
        # and predict() feeds MFCCs to the trained model.
        X = np.array(mfccs)

        # Sort the label names so class indices are stable between runs.
        self.label_map = {label: i for i, label in enumerate(sorted(set(labels)))}
        y = np.array([self.label_map[label] for label in labels])
        y = to_categorical(y, num_classes=len(self.label_map))

        # Split the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2)

    # ------------------------------------------------------------------
    # Model definition
    # ------------------------------------------------------------------
    def build_model(self, hp):
        """Build and compile a tunable dense classifier (KerasTuner hypermodel).

        Tuned hyperparameters: number of hidden layers (1-4), units and dropout
        rate per layer, and the Adam learning rate. Input and output sizes are
        read from ``self.X_train`` and ``self.y_train``.
        """
        model = keras.Sequential()
        model.add(layers.Flatten(input_shape=(self.X_train.shape[1], self.X_train.shape[2])))

        # Hyperparameter names are kept as-is ('conv_{i}_units' names a Dense
        # layer) so previously saved tuner results stay readable.
        for i in range(hp.Int('num_layers', 1, 4)):
            model.add(layers.Dense(units=hp.Int(f'conv_{i}_units', min_value=32, max_value=512, step=32), activation='relu'))
            model.add(layers.Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.1)))

        model.add(layers.Dense(self.y_train.shape[1], activation='softmax'))

        # Tune the learning rate for the optimizer
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate), metrics=['accuracy'])
        return model

    # ------------------------------------------------------------------
    # Inference utilities
    # ------------------------------------------------------------------
    def predict_with_dropout(self, X, n_samples):
        """Draw ``n_samples`` stochastic predictions with dropout switched on.

        The model is called with ``training=True``; ``Model.predict`` runs in
        inference mode, where Dropout layers do nothing and every sample would
        be identical.

        Returns:
            Array of shape ``(n_samples, len(X), n_outputs)``.
        """
        n_outputs = self.model.output_shape[-1]
        result = np.zeros((n_samples, X.shape[0], n_outputs))
        for i in range(n_samples):
            result[i] = np.asarray(self.model(X, training=True))
        return result

    def plot_history(self, history):
        """Plot training/validation accuracy and loss curves side by side."""
        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
        panels = (
            ('accuracy', 'Model Accuracy', 'Accuracy'),
            ('loss', 'Model Loss', 'Loss'),
        )
        for ax, (metric, title, ylabel) in zip(axs, panels):
            ax.plot(history.history[metric], label='train')
            ax.plot(history.history['val_' + metric], label='test')
            ax.set_title(title)
            ax.set_xlabel('Epoch')
            ax.set_ylabel(ylabel)
            ax.legend()

        plt.show()

    def predict(self, new_data_path):
        """Evaluate the trained model on the labelled clips in ``new_data_path``.

        Prints per-class uncertainty (misclassification rate per true class),
        average uncertainty (1 - mean top probability), loss, accuracy and the
        predicted/actual class indices.

        Returns:
            The predicted class probabilities, one row per file in sorted
            file-name order.

        Raises:
            ValueError: if ``new_data_path`` contains no files.
            KeyError: if a file carries a label not seen by ``load_data``.
        """
        filenames = self._list_files(new_data_path)
        if not filenames:
            raise ValueError(f"No audio files found in {new_data_path!r}")

        # Features and labels come from one pass over one listing, so rows
        # and labels cannot get out of step.
        mfccs = []
        label_ids = []
        for filename in filenames:
            audio_path = os.path.join(new_data_path, filename)
            audio, sr = librosa.load(audio_path)
            mfccs.append(self._extract_mfcc(audio, sr))
            label_ids.append(self.label_map[filename.split("_")[1]])
        X_new = np.array(mfccs)

        # Fix the one-hot width to the training classes, even if some classes
        # are absent from the new data.
        n_classes = len(self.label_map)
        y_true_labels = np.array(label_ids)
        y_new = to_categorical(y_true_labels, num_classes=n_classes)

        # Make predictions on the new data
        preds = self.model.predict(X_new)
        conf_scores = np.max(preds, axis=1)  # top softmax probability per clip
        y_pred_labels = np.argmax(preds, axis=1)

        # One row/column per training class, in label_map index order.
        conf_matrix = confusion_matrix(y_true_labels, y_pred_labels, labels=list(range(n_classes)))

        class_uncertainty = self._per_class_uncertainty(conf_matrix)
        avg_uncertainty = 1 - np.mean(conf_scores)

        # Print results
        print("Per-class uncertainty:", class_uncertainty)
        print("Average uncertainty:", avg_uncertainty)

        score = self.model.evaluate(X_new, y_new)
        print('Test loss:', score[0])
        print('New data accuracy:', score[1])

        print("Predictions:", y_pred_labels)
        print("Actual:", y_true_labels)

        return preds

    # ------------------------------------------------------------------
    # Hyperparameter search + training
    # ------------------------------------------------------------------
    def search_hyperparameters1(self, X_train, y_train, X_test, y_test, max_trials=10, executions_per_trial=5, LOG_DIR='audio_classifier'):
        """Random-search the hyperparameters, then train and report the best model.

        The search validates on one randomly held-out tenth of ``X_train``.
        The final model is trained on all of ``X_train`` with early stopping
        and stored in ``self.model``.

        NOTE(review): the final fit monitors ``(X_test, y_test)`` for early
        stopping, so the reported test metrics are mildly optimistic.
        """
        # Define the tuner
        tuner = RandomSearch(self.build_model, objective='val_accuracy', max_trials=max_trials,
                             executions_per_trial=executions_per_trial, directory=LOG_DIR, project_name='audio_classifier')

        # A single train/validation split is used: once `max_trials` trials
        # exist the oracle issues no more, so calling search() again for
        # further folds would do nothing.
        kfold = KFold(n_splits=10, shuffle=True)
        train_index, val_index = next(kfold.split(X_train))
        tuner.search(X_train[train_index], y_train[train_index], epochs=10,
                     validation_data=(X_train[val_index], y_train[val_index]))

        best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
        print(best_hp.values)
        # summary() prints by itself and returns None, so it is not wrapped in print().
        tuner.get_best_models()[0].summary()

        # Build the model with the best hyperparameters
        self.model = tuner.hypermodel.build(best_hp)

        # Fit the model to the training data, stopping once val_loss stalls
        history = self.model.fit(X_train, y_train, batch_size=32, epochs=150,
                                 validation_data=(X_test, y_test), callbacks=[self._early_stopping()])

        # Plot the training history
        self.plot_history(history)

        self._report_test_performance(X_test, y_test)

    def search_hyperparameters(self, X_train, y_train, X_test, y_test, max_trials=10, executions_per_trial=5, LOG_DIR='audio_classifier', num_folds=5):
        """Random-search the hyperparameters, cross-validate, then train the final model.

        Steps:
            1. Tune on ``X_train`` with the last ``1/num_folds`` of it held out
               for validation (the test set is not used for tuning).
            2. Estimate accuracy with ``num_folds``-fold cross-validation,
               building a fresh model for every fold.
            3. Train a fresh final model on all of ``X_train``, store it in
               ``self.model`` and report its test-set performance.

        NOTE(review): the final fit monitors ``(X_test, y_test)`` for early
        stopping, so the reported test metrics are mildly optimistic.
        """
        # Define the tuner
        tuner = RandomSearch(self.build_model, objective='val_accuracy', max_trials=max_trials,
                             executions_per_trial=executions_per_trial, directory=LOG_DIR, project_name='audio_classifier')

        # Only validation_split is passed: when validation_data is also given
        # it takes precedence, which would tune on the test set.
        tuner.search(X_train, y_train, epochs=10, validation_split=1/num_folds)

        # Get the best hyperparameters
        best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

        earlystop = self._early_stopping()

        # Define KFold cross-validation
        kfold = KFold(n_splits=num_folds, shuffle=True)

        scores = []
        for train_index, val_index in kfold.split(X_train):
            train_data = X_train[train_index]
            train_labels = y_train[train_index]
            val_data = X_train[val_index]
            val_labels = y_train[val_index]

            # Fresh weights per fold: reusing one model would let later folds
            # validate on samples the model was already trained on.
            fold_model = tuner.hypermodel.build(best_hp)
            fold_model.fit(train_data, train_labels, batch_size=32, epochs=100,
                           validation_data=(val_data, val_labels), callbacks=[earlystop])

            # Evaluate the model on the validation data for this fold
            score = fold_model.evaluate(val_data, val_labels, verbose=0)
            scores.append(score[1])

        # Print the average accuracy over all folds
        print('Cross-validation accuracy:', np.mean(scores))

        # Fit the final model, from fresh weights, to all the training data
        self.model = tuner.hypermodel.build(best_hp)
        self.model.fit(X_train, y_train, batch_size=32, epochs=100,
                       validation_data=(X_test, y_test), callbacks=[earlystop])

        self._report_test_performance(X_test, y_test)


In [None]:
# Initialize the AudioClassifier
classifier = AudioClassifier(data_path=data_dir, LOG_DIR=LOG_DIR)

# Load the data
classifier.load_data()

# Tune, cross-validate and train. LOG_DIR is passed explicitly so the tuner
# writes to this run's directory instead of the method's fixed default.
classifier.search_hyperparameters(classifier.X_train, classifier.y_train, classifier.X_test, classifier.y_test, LOG_DIR=LOG_DIR)

# Draw repeated predictions on the test set
predictions = classifier.predict_with_dropout(classifier.X_test, 100)

mean_prediction = np.mean(predictions, axis=0)
std_prediction = np.std(predictions, axis=0)

# Entropy of the mean prediction per sample. Probabilities are clipped inside
# the log so an exact 0 contributes 0 instead of 0 * -inf = NaN.
entropy = -np.sum(mean_prediction * np.log(np.clip(mean_prediction, 1e-12, None)), axis=1)
uncertainty = np.mean(entropy)

print('Uncertainty:', uncertainty)

ValueError: ignored

In [None]:
# Directory with the additional audio files to evaluate
new_data_path = '/gdrive/MyDrive/VIVAE/core_set/'

# Run the trained classifier on those files; predict() prints its evaluation report
preds = classifier.predict(new_data_path=new_data_path)


Per-class uncertainty: [0.9135802469135803]
Average uncertainty: 0.05178570747375488
Test loss: 17.863059997558594
New data accuracy: 0.34285715222358704
Predictions: [1 4 2 4 3 0 4 3 2 1 3 2 2 5 4 0 0 4 2 3 4 2 0 1 5 1 4 0 1 5 2 4 4 0 4 4 0
 0 3 4 0 4 4 4 2 3 4 3 4 3 0 4 3 4 4 4 4 3 3 4 0 4 0 4 0 0 2 0 4 3 3 4 4 0
 4 2 4 4 0 4 3 4 4 2 2 0 3 3 3 4 4 3 4 4 4 4 0 0 3 0 4 4 4 0 4 3 0 4 4 4 3
 0 5 4 4 3 0 4 3 4 4 4 2 1 4 4 3 4 4 2 2 0 0 0 1 3 4 4 4 4 4 1 3 3 2 4 4 4
 4 3 4 2 4 2 0 0 4 3 0 0 0 4 4 4 5 4 0 4 3 4 4 0 4 0 4 4 4 3 0 4 4 4 2 4 4
 3 0 4 4 3 4 0 4 4 1 0 4 3 2 4 1 4 2 0 3 0 3 4 4 5 4 1 0 4 4 3 4 4 3 3 4 4
 0 3 4 4 4 2 3 0 4 4 4 0 4 4 4 3 4 4 4 3 0 0 4 3 4 1 3 2 4 2 0 3 4 2 1 4 4
 0 4 5 0 3 0 0 2 0 4 4 3 4 0 5 4 0 3 4 4 4 3 0 2 0 2 4 3 4 4 4 2 2 0 2 4 2
 3 0 0 0 0 4 0 4 3 3 4 3 3 3 3 4 4 0 3 2 4 1 4 5 3 0 4 4 2 4 4 4 4 0 2 3 3
 2 4 4 4 3 4 3 4 4 3 2 1 4 4 0 4 3 0 4 3 0 3 0 4 0 4 5 0 5 4 2 4 4 4 3 0 4
 4 4 2 3 4 1 0 4 3 4 4 0 4 4 4 4 4 0 4 0 4 0 3 0 3 4 4 2 4 3 4 4 3 3 4 4 0
 4 4 3 0