**Install necessary libraries**

In [None]:
!pip install pydub
!pip install keras-tuner

**Import necessary libraries**

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Operating system interaction
import os

# Audio processing
import librosa

# Hyperparameter tuning for Keras models
import kerastuner as kt

# Deep learning framework
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Conv1D, Dense, Embedding, LSTM, Bidirectional,
    Dropout, BatchNormalization, GlobalMaxPooling1D,
    SpatialDropout1D, Flatten, Input, Concatenate, MaxPooling1D
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# Model tuning
from kerastuner.tuners import RandomSearch
from kerastuner import HyperParameters

# Data preprocessing and balancing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from collections import Counter


# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score


**Drive Mount**

In [None]:
# Mount Google Drive at /gdrive; the dataset paths configured in the following
# cells all live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

**Without Noise Directory**

In [None]:
# Noise-free dataset locations.
# `directory` is the folder scanned for *.TextGrid files by the data-loading cell;
# `wav_files_directory` is where the .wav file matching each TextGrid is read from.
directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Textgrid_Files'
wav_files_directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Wav_Files'

**With Noise Directory**

In [None]:
# Noisy dataset locations.
# NOTE: this cell assigns the same two names (`directory`, `wav_files_directory`)
# as the "Without Noise" cell, so running it replaces the noise-free paths. With
# "Run All" the noisy data is therefore what the data-loading cell reads; run
# only one of the two configuration cells to choose the dataset.
directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Textgrid_Noise_Files'
wav_files_directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Wav_Noise_Files'

**Data Loading and DataFrame Creation**

In [None]:

# Create an empty DataFrame to store one row per TextGrid file.
# The column names match exactly what the loading loop writes ('Phones' with a
# capital P, plus the two spectrogram columns), so no stray all-NaN 'phones'
# column is created when the per-file rows are concatenated onto this frame.
final_df = pd.DataFrame(columns=[
    'Transcript',
    'phoneme_likelihood',
    'Phones',
    'MFCC_Spectrogram',
    'Mel_Spectrograms',
])

def parse_textgrid(file_path):
    """Collect (start, end, label) triples from a TextGrid file of `key = value` lines.

    The file is scanned line by line (after stripping whitespace):

    * a line starting with ``xmin`` sets the pending start time,
    * a line starting with ``xmax`` sets the pending end time,
    * a line starting with ``text`` closes an entry using the most recently
      seen start/end times, after which both are reset.

    Because every ``xmin``/``xmax`` line overwrites the pending value, header
    lines that precede the first ``text`` line are simply superseded by the
    values closest to that ``text`` line.

    Parameters
    ----------
    file_path : str
        Path of the TextGrid file. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    list of (float, float, str)
        One tuple per ``text`` line that had both a start and an end time
        pending. Labels have the surrounding double quotes removed and may be
        empty strings.
    """
    data = []
    start_time, end_time = None, None

    # Iterate over the file handle directly; no need to hold all lines in memory.
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('xmin'):
                start_time = float(line.split('=', 1)[1].strip())
            elif line.startswith('xmax'):
                end_time = float(line.split('=', 1)[1].strip())
            elif line.startswith('text'):
                # Split on the first '=' only, so a label that itself contains
                # '=' is kept whole instead of being cut at that character.
                label = line.split('=', 1)[1].strip().strip('"')
                if start_time is not None and end_time is not None:
                    data.append((start_time, end_time, label))
                    start_time, end_time = None, None

    return data

def textgrid_to_dataframe(file_path):
    """Parse a TextGrid file and return its entries as a DataFrame.

    The rows are the tuples produced by ``parse_textgrid(file_path)``, exposed
    under the columns 'Start Time', 'End Time' and 'Label'.
    """
    column_names = ['Start Time', 'End Time', 'Label']
    return pd.DataFrame(parse_textgrid(file_path), columns=column_names)

def extract_mfcc_spectrogram(file_path):
    """Load an audio file and return its MFCCs stacked with their deltas.

    The audio is read with ``librosa.load`` (default arguments), 13 MFCCs are
    computed, and the first- and second-order deltas of those MFCCs are
    appended along axis 0, in that order.
    """
    signal, sample_rate = librosa.load(file_path)
    base_coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    first_order = librosa.feature.delta(base_coefficients)
    second_order = librosa.feature.delta(base_coefficients, order=2)
    return np.concatenate([base_coefficients, first_order, second_order], axis=0)


def extract_mel_spectrogram(file_path, n_mels=32, hop_length=512):
    """Derive a dB-scaled mel spectrogram from the file's MFCC feature matrix.

    Steps, as implemented:
      1. ``extract_mfcc_spectrogram(file_path)`` loads the audio and returns
         the MFCC matrix stacked with its delta and delta-delta rows.
      2. ``librosa.feature.inverse.mfcc_to_mel`` is applied to that matrix.
      3. The result is passed as ``S`` to ``librosa.feature.melspectrogram``
         with the given ``n_mels`` and ``hop_length``.
      4. The output is converted with ``librosa.power_to_db(ref=np.max)``.

    NOTE(review): the matrix fed to ``mfcc_to_mel`` includes the delta rows,
    not only MFCCs, and its output is then mel-filtered a second time --
    confirm this is the intended feature.

    Parameters
    ----------
    file_path : str
        Path of the audio file.
    n_mels : int, default 32
        Forwarded to ``librosa.feature.melspectrogram``.
    hop_length : int, default 512
        Forwarded to ``librosa.feature.melspectrogram``.

    Returns
    -------
    numpy.ndarray
        The dB spectrogram wrapped in one extra leading axis of length 1.
    """
    # The audio is loaded inside extract_mfcc_spectrogram; the former extra
    # librosa.load call here decoded the file a second time and its result
    # was never used.
    mfcc_spectrogram = extract_mfcc_spectrogram(file_path)
    mel_spectrogram = librosa.feature.melspectrogram(
        S=librosa.feature.inverse.mfcc_to_mel(mfcc_spectrogram),
        n_mels=n_mels,
        hop_length=hop_length,
    )
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    # Keep the leading singleton axis that callers of the original received.
    return np.array([mel_spectrogram_db])


# Build one record per TextGrid file and append them all to final_df at the end.
# Collecting plain dicts and concatenating once avoids re-copying the whole
# frame on every iteration (pd.concat inside the loop is quadratic).
TEXTGRID_SUFFIX = '.TextGrid'
records = []

# os.listdir returns names in arbitrary order; sorting keeps the row order, and
# therefore the seeded train/test splits further down, repeatable between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(TEXTGRID_SUFFIX):
        continue

    file_path = os.path.join(directory, filename)

    # Process the file and obtain the parsed (start, end, label) rows
    df = textgrid_to_dataframe(file_path)
    if df.empty:
        print(f"Skipping {filename}: no entries could be parsed")
        continue

    # Rows whose start AND end time equal those of the very first row
    indices = df.index[(df['Start Time'] == df['Start Time'].iloc[0]) & (df['End Time'] == df['End Time'].iloc[0])]

    # Split at the last such row: everything up to and including it feeds the
    # transcript, everything after it is treated as the phone labels.
    split_at = indices[-1]
    first_df = df.loc[:split_at]
    second_df = df.loc[split_at + 1:]

    # Remove rows with blank or null labels from both parts
    first_df = first_df[first_df['Label'].notnull() & (first_df['Label'] != "")]
    second_df = second_df[second_df['Label'].notnull() & (second_df['Label'] != "")]

    phone_labels = second_df['Label'].tolist()
    if not phone_labels:
        # .mode() of an empty column is empty, so .iloc[0] would raise IndexError.
        print(f"Skipping {filename}: no non-empty phone labels found")
        continue

    # Combine labels from first_df into a single sentence
    combined_sentence = ' '.join(first_df['Label'].tolist())

    # Most frequent label among the phone rows (first one returned on ties)
    phoneme_likelihood = second_df['Label'].mode().iloc[0]

    # The audio file shares the TextGrid's base name, with a .wav extension
    audio_file_path = os.path.join(wav_files_directory, filename[:-len(TEXTGRID_SUFFIX)] + '.wav')

    records.append({
        'Transcript': combined_sentence,
        'phoneme_likelihood': phoneme_likelihood,
        'Phones': phone_labels,
        'MFCC_Spectrogram': extract_mfcc_spectrogram(audio_file_path),
        'Mel_Spectrograms': extract_mel_spectrogram(audio_file_path),
    })

# Single concatenation onto the (empty) frame created at the top of this cell
if records:
    final_df = pd.concat([final_df, pd.DataFrame(records)], ignore_index=True)


**Data Refinement Process**

In [None]:
# Count how often each phoneme_likelihood label occurs (sorted most frequent first)
phoneme_likelihood_counts = final_df['phoneme_likelihood'].value_counts()

# The three labels at the top of that ranking
most_frequent_classes = phoneme_likelihood_counts.head(3).index

# Keep only the rows of 'final_df' whose label is one of those three
belongs_to_frequent_class = final_df['phoneme_likelihood'].isin(most_frequent_classes)
filtered_final_df = final_df.loc[belongs_to_frequent_class]

print(filtered_final_df['phoneme_likelihood'].value_counts())

**Data Partition: Training and Test Split**

In [None]:
# Hold out 30% of the filtered rows as test_df and keep 70% as train_df.
# The fixed random_state makes the shuffle behind the split repeatable.
train_df, test_df = train_test_split(
    filtered_final_df,
    test_size=0.3,
    random_state=42,
)

**Balancing the Training Data using Random Oversampling**

In [None]:
# Features for resampling: every training column except the label
X_temp = train_df.drop(columns='phoneme_likelihood')

# Resampling target: the label column itself
y = train_df['phoneme_likelihood']

# Every smaller class is raised to the size of the largest class (ratio 1.0)
class_counts = Counter(y)
majority_class_count = max(class_counts.values())
desired_minority_class_count = int(majority_class_count)

# Only classes below the target size are listed in the sampling strategy
sampling_strategy = {}
for label, count in class_counts.items():
    if count < desired_minority_class_count:
        sampling_strategy[label] = desired_minority_class_count

# Randomly duplicate rows of the listed classes until they reach the target size
oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)
X_temp_resampled, y_resampled = oversampler.fit_resample(X_temp, y)

# Print class distribution after oversampling
print("Class distribution after oversampling:", sorted(Counter(y_resampled).items()))

**Neural Network Model Training and Evaluation with MFCC_Spectrogram**

In [None]:
# ---------------------------------------------------------------------------
# Inputs: transcript text, MFCC(+delta) matrix and label of each resampled row
# ---------------------------------------------------------------------------
X_transcript = X_temp_resampled['Transcript']
X_MFCC_spectrograms = X_temp_resampled['MFCC_Spectrogram']
y = y_resampled

# Split the data into training and validation sets (80% / 20%). The "*_test_*"
# names in this cell are the validation part; the real hold-out set is test_df.
# NOTE(review): the rows were oversampled before this split, so copies of the
# same row can land on both sides and make the validation accuracy optimistic.
(X_train_transcript, X_test_transcript,
 X_train_MFCC_spectrograms, X_test_MFCC_spectrograms,
 y_train, y_test) = train_test_split(
    X_transcript, X_MFCC_spectrograms, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Transcript branch: tokenize and pad
# ---------------------------------------------------------------------------
# The tokenizer vocabulary is learned from the training transcripts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_transcript)

# Convert transcript text to integer sequences
X_train_transcript_seq = tokenizer.texts_to_sequences(X_train_transcript)
X_test_transcript_seq = tokenizer.texts_to_sequences(X_test_transcript)

# Longest sequence across both parts (these are plain lists, so + concatenates)
max_length_transcript = max(len(seq) for seq in X_train_transcript_seq + X_test_transcript_seq)

# Zero-pad every sequence at the end up to that length
X_train_transcript_padded = pad_sequences(X_train_transcript_seq, maxlen=max_length_transcript, padding='post')
X_test_transcript_padded = pad_sequences(X_test_transcript_seq, maxlen=max_length_transcript, padding='post')

# ---------------------------------------------------------------------------
# Labels
# ---------------------------------------------------------------------------
label_encoder = LabelEncoder()
label_encoder.fit(y)  # Fit the label encoder on all labels in y
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# ---------------------------------------------------------------------------
# Spectrogram branch: bring every matrix to the same number of columns
# ---------------------------------------------------------------------------
# Widest spectrogram across both parts. The two Series are scanned separately
# on purpose: `series_a + series_b` in pandas is index-aligned element-wise
# addition, not concatenation, and would yield NaN for these disjoint indices.
max_features_spectrogram = max(
    max(arr.shape[1] for arr in X_train_MFCC_spectrograms),
    max(arr.shape[1] for arr in X_test_MFCC_spectrograms),
)

# Zero-pad each matrix on the right up to max_features_spectrogram columns
X_train_MFCC_spectrograms_padded = np.array([
    np.pad(arr[:, :max_features_spectrogram], ((0, 0), (0, max_features_spectrogram - arr.shape[1])), mode='constant')
    for arr in X_train_MFCC_spectrograms
])
X_test_MFCC_spectrograms_padded = np.array([
    np.pad(arr[:, :max_features_spectrogram], ((0, 0), (0, max_features_spectrogram - arr.shape[1])), mode='constant')
    for arr in X_test_MFCC_spectrograms
])

# Number of rows per spectrogram (13 MFCC + 13 delta + 13 delta-delta = 39 for
# the extractor in this notebook), read from the data instead of hard-coded.
n_spectrogram_rows = X_train_MFCC_spectrograms_padded.shape[1]

# ---------------------------------------------------------------------------
# Model definition
# ---------------------------------------------------------------------------
# Transcript branch, part 1: embedding -> Conv1D -> global max pool -> dense
input_transcript = Input(shape=(max_length_transcript,))
embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)(input_transcript)
conv1d_transcript = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding)
conv1d_transcript = BatchNormalization()(conv1d_transcript)
conv1d_transcript = GlobalMaxPooling1D()(conv1d_transcript)
conv1d_transcript = Dropout(0.5)(conv1d_transcript)
dense = Dense(units=128, activation='relu', kernel_regularizer=l2(0.0001))(conv1d_transcript)
dense = Dropout(0.5)(dense)

# Transcript branch, part 2: an LSTM over the same embedding
lstm_transcript = LSTM(64)(embedding)
lstm_transcript = Dropout(0.5)(lstm_transcript)

# Merge the LSTM and Conv1D features of the transcript
merged_features = Concatenate()([dense, lstm_transcript])

# Spectrogram branch.
# NOTE(review): with this input layout Conv1D slides along axis 1 (the stacked
# coefficient rows) and treats the padded frame axis as channels. If a
# convolution over time was intended, the matrices need transposing -- confirm.
input_spectrogram = Input(shape=(n_spectrogram_rows, max_features_spectrogram))
conv1d = Conv1D(filters=128, kernel_size=5, activation='relu')(input_spectrogram)
conv1d = GlobalMaxPooling1D()(conv1d)
conv1d = BatchNormalization()(conv1d)
conv1d = Dropout(0.5)(conv1d)
# GlobalMaxPooling1D already yields one vector per sample, so the former
# Flatten layer was a no-op and has been dropped.

# Merge the transcript and spectrogram features
merged_features = Concatenate()([conv1d, merged_features])

dense = Dense(units=256, activation='relu')(merged_features)
dense = BatchNormalization()(dense)
dense = Dropout(0.5)(dense)

# One softmax unit per class known to the label encoder
output = Dense(units=len(label_encoder.classes_), activation='softmax')(dense)

model = Model(inputs=[input_transcript, input_spectrogram], outputs=output)

# Integer-encoded labels -> sparse categorical cross-entropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# ---------------------------------------------------------------------------
# Training and evaluation
# ---------------------------------------------------------------------------
model.fit([X_train_transcript_padded, X_train_MFCC_spectrograms_padded], y_train_encoded,
          epochs=10, batch_size=64, validation_data=([X_test_transcript_padded, X_test_MFCC_spectrograms_padded], y_test_encoded))

# Accuracy on the training part and on the validation part
_, train_accuracy = model.evaluate([X_train_transcript_padded, X_train_MFCC_spectrograms_padded], y_train_encoded)
_, test_accuracy = model.evaluate([X_test_transcript_padded, X_test_MFCC_spectrograms_padded], y_test_encoded)

print('Training Accuracy:', train_accuracy)
print('Validation Accuracy:', test_accuracy)



**Evaluating Accuracy and Predicted Labels on Test Data**

In [None]:
# Extract the input features from the held-out test data
X_new_mfcc_spectrograms = test_df['MFCC_Spectrogram']
X_new_transcripts = test_df['Transcript']

# Convert text to sequences using the tokenizer fitted on the training data
X_new_transcript_seq = tokenizer.texts_to_sequences(X_new_transcripts)
X_new_transcript_padded = pad_sequences(X_new_transcript_seq, maxlen=max_length_transcript, padding='post')

# Bring every spectrogram to exactly max_features_spectrogram columns.
# max_features_spectrogram was measured on the training data, so a test
# spectrogram can be wider; those are truncated. Padding unconditionally would
# ask np.pad for a negative pad width, which raises ValueError.
X_new_mfcc_spectrograms_padded = np.array([
    np.pad(arr, ((0, 0), (0, max_features_spectrogram - arr.shape[1])), mode='constant')
    if arr.shape[1] < max_features_spectrogram
    else arr[:, :max_features_spectrogram]
    for arr in X_new_mfcc_spectrograms
])
predictions = model.predict([X_new_transcript_padded, X_new_mfcc_spectrograms_padded])

# Decode the predicted labels: highest-probability class index -> label string
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
print(predicted_labels)

# Fraction of test rows whose predicted label equals the true label.
# Predictions are in row order, so compare against the raw values positionally.
accuracy = np.mean(predicted_labels == test_df['phoneme_likelihood'].to_numpy())
print("Overall Accuracy on Test Data:", accuracy)

**Evaluating Accuracy and Correct Predictions per Class**

In [None]:
# Model inputs for the held-out test rows
X_new_mfcc_spectrograms = test_df['MFCC_Spectrogram']
X_new_transcripts = test_df['Transcript']

# Tokenize with the tokenizer fitted on the training data and pad to the training length
X_new_transcript_seq = tokenizer.texts_to_sequences(X_new_transcripts)
X_new_transcript_padded = pad_sequences(X_new_transcript_seq, maxlen=max_length_transcript, padding='post')

# Truncate wide spectrograms and zero-pad narrow ones to max_features_spectrogram columns
X_new_mfcc_spectrograms_padded = np.array([
    arr[:, :max_features_spectrogram]
    if arr.shape[1] >= max_features_spectrogram
    else np.pad(arr[:, :max_features_spectrogram], ((0, 0), (0, max_features_spectrogram - arr.shape[1])), mode='constant')
    for arr in X_new_mfcc_spectrograms
])
predictions = model.predict([X_new_transcript_padded, X_new_mfcc_spectrograms_padded])

# Highest-probability class index per row, mapped back to its label
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# Overall accuracy on the test rows
accuracy = accuracy_score(predicted_labels, test_df['phoneme_likelihood'])
print("Accuracy on New Data:", accuracy)

# Per-sample entropy of the predicted distribution. Rows are re-normalised to
# sum to one and clipped away from 0 and 1 so that log2 stays finite.
row_totals = np.sum(predictions, axis=1, keepdims=True)
class_probabilities = np.clip(predictions / row_totals, 1e-10, 1.0 - 1e-10)
entropy_per_sample = -(class_probabilities * np.log2(class_probabilities)).sum(axis=1)

# Divide by log2(number of classes) so the values lie between 0 and 1
normalized_entropy = entropy_per_sample / np.log2(len(label_encoder.classes_))

class_labels = label_encoder.classes_
true_test_labels = test_df['phoneme_likelihood']

# Mean normalised entropy over the test rows of each true class
average_normalized_entropy_per_class = {}
for label in class_labels:
    rows_with_label = np.where(true_test_labels == label)[0]
    average_normalized_entropy_per_class[label] = np.mean(normalized_entropy[rows_with_label])

print("Average Normalized Uncertainty per Class Label:")
for label in average_normalized_entropy_per_class:
    print(f"{label}: {average_normalized_entropy_per_class[label]:.4f}")

# Per true class: number of correct predictions and the resulting accuracy
class_accuracy = {}
class_correct_predictions = {}
for label in class_labels:
    rows_with_label = np.where(true_test_labels == label)[0]
    hits = np.sum(predicted_labels[rows_with_label] == label)
    class_accuracy[label] = hits / len(rows_with_label)
    class_correct_predictions[label] = hits

print("Accuracy and Correct Predictions per Class:")
for label in class_accuracy:
    print(f"{label}: Accuracy = {class_accuracy[label]:.4f}, Correct Predictions = {class_correct_predictions[label]}")


**Calculation of Other Evaluation Metrics**

In [None]:
# Turn the softmax outputs into label strings: take the highest-probability
# class index of each row and map it back through the label encoder
predicted_class_indices = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_indices)

# Ground-truth labels of the test dataframe
true_labels = test_df['phoneme_likelihood']

# Precision, recall and F1, each averaged over the classes weighted by support
averaging = 'weighted'
precision = precision_score(true_labels, predicted_labels, average=averaging)
recall = recall_score(true_labels, predicted_labels, average=averaging)
f1 = f1_score(true_labels, predicted_labels, average=averaging)

# Report the three metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)