<a href="https://colab.research.google.com/github/hiteshJindal/Thesis_Audio_Uncertainity/blob/main/NoSpectrogram_NoNoise_Experiment_27_Aug.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import necessary libraries**

In [11]:
# Install necessary packages
!pip install pydub
!pip install keras-tuner

# Import required libraries
import pandas as pd
import numpy as np
import os
import librosa
import kerastuner as kt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv1D, Dense, Embedding, LSTM, Bidirectional, Dropout, BatchNormalization, GlobalMaxPooling1D, SpatialDropout1D, Flatten, Concatenate, Input
from collections import Counter
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Import necessary functions from sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Import the random over-sampler from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler



**Drive Mount**

In [2]:
# Mount Google Drive into the Colab runtime at /gdrive; the dataset paths used later live under it
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


**Without Noise - directory path**

In [3]:
# Folder scanned for *.TextGrid files by the data-loading cell (noise-free condition)
directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Textgrid_Files'

**With Noise - directory path**

In [None]:
# Folder scanned for *.TextGrid files by the data-loading cell (noisy condition).
# NOTE(review): this rebinds `directory`, replacing the noise-free path assigned in the
# previous cell -- run only one of the two path cells for a given experiment.
directory = '/gdrive/MyDrive/Input_large_final/Input_large/senddrive/Small_Files_Merged/Textgrid_noise_Files'

**Data Loading and DataFrame Creation**



In [6]:
# Empty result table with the three columns that are filled in per TextGrid file
final_df = pd.DataFrame(
    columns=['Transcript', 'phoneme_likelihood', 'Phones'],
)

def parse_textgrid(file_path):
    """Extract ``(start_time, end_time, label)`` tuples from a TextGrid file.

    The file is scanned line by line. The most recent ``xmin`` and ``xmax``
    values are remembered; each ``text`` line that follows both of them emits
    one tuple and clears the remembered bounds.

    Args:
        file_path: Path of the TextGrid file to read.

    Returns:
        A list of ``(start_time, end_time, label)`` tuples in file order, where
        the times are floats and the label is the text with surrounding double
        quotes removed.

    Raises:
        ValueError: If the value after ``xmin``/``xmax`` is not a valid float.
        IndexError: If an ``xmin``/``xmax``/``text`` line contains no ``=``.
    """
    data = []
    start_time, end_time = None, None

    with open(file_path, 'r') as file:
        # Iterate the handle directly instead of materialising every line first
        for line in file:
            line = line.strip()
            if line.startswith('xmin'):
                start_time = float(line.split('=', 1)[1].strip())
            elif line.startswith('xmax'):
                end_time = float(line.split('=', 1)[1].strip())
            elif line.startswith('text'):
                # Split on the first '=' only, so a label that itself contains '='
                # is kept whole instead of being cut at its first '='
                label = line.split('=', 1)[1].strip().strip('"')
                if start_time is not None and end_time is not None:
                    data.append((start_time, end_time, label))
                    start_time, end_time = None, None

    return data

def textgrid_to_dataframe(file_path):
    """Parse a TextGrid file and return its rows as a DataFrame.

    The result has the columns 'Start Time', 'End Time' and 'Label', one row
    per tuple returned by ``parse_textgrid``.
    """
    intervals = parse_textgrid(file_path)
    column_names = ['Start Time', 'End Time', 'Label']
    return pd.DataFrame(intervals, columns=column_names)

# One dict per TextGrid file; the DataFrame is assembled once after the loop instead of
# being re-concatenated on every iteration.
records = []

# A sorted listing keeps the row order stable between runs, so the seeded train/test
# split performed later always sees the same input ordering.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.TextGrid'):
        continue

    file_path = os.path.join(directory, filename)

    # Parse the file into rows of (Start Time, End Time, Label)
    df = textgrid_to_dataframe(file_path)

    # Without any parsed row there is no first row to compare against
    if df.empty:
        print(f"Skipping {filename}: no intervals parsed")
        continue

    # Rows whose start and end times both equal those of the first row; the last such
    # row is used as the split point (presumably the boundary between the word rows
    # and the phone rows -- TODO confirm against the TextGrid layout)
    indices = df.index[(df['Start Time'] == df['Start Time'].iloc[0]) & (df['End Time'] == df['End Time'].iloc[0])]

    # Everything up to and including the split point vs. everything after it
    first_df = df.loc[:indices[-1]]
    second_df = df.loc[indices[-1]+1:]

    # Drop rows with blank or null labels from both parts
    first_df = first_df[first_df['Label'].notnull() & (first_df['Label'] != "")]
    second_df = second_df[second_df['Label'].notnull() & (second_df['Label'] != "")]

    # mode() of an empty column has no first element, so such files cannot be labelled
    if second_df.empty:
        print(f"Skipping {filename}: no labelled rows after the split point")
        continue

    # Labels of the first part joined into one sentence
    combined_sentence = ' '.join(first_df['Label'].tolist())

    # Most frequent label of the second part
    phoneme_likelihood = second_df['Label'].mode().iloc[0]

    # All labels of the second part, in order
    phones_list = second_df['Label'].tolist()

    records.append({
        'Transcript': combined_sentence,
        'phoneme_likelihood': phoneme_likelihood,
        'Phones': phones_list,
    })

# Build the result table in one step; the explicit columns keep the schema even when
# no file produced a record.
final_df = pd.DataFrame(records, columns=['Transcript', 'phoneme_likelihood', 'Phones'])



**Data Refinement Process**

In [None]:
# Count how often each label occurs and keep the three most frequent ones
phoneme_likelihood_counts = final_df['phoneme_likelihood'].value_counts()
most_frequent_classes = phoneme_likelihood_counts.head(3).index

# Retain only the rows whose label belongs to one of those classes
keep_mask = final_df['phoneme_likelihood'].isin(most_frequent_classes)
filtered_final_df = final_df[keep_mask]

print(filtered_final_df['phoneme_likelihood'].value_counts())

**Data Partition: Training and Test Split**

In [8]:
# Hold out 30% of the filtered rows as the test set; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    filtered_final_df,
    test_size=0.3,
    random_state=42,
)

**Balancing the Training Data using Oversampling**

In [None]:
# Feature frame for resampling: every training column except the target label
X_temp = train_df.drop('phoneme_likelihood', axis=1)

# Oversampling

# Target labels and the size of the largest class
y = train_df['phoneme_likelihood']
class_counts = Counter(y)
majority_class_count = max(class_counts.values())

# Classes smaller than 25% of the majority class are raised to exactly that size
desired_minority_class_count = int(majority_class_count * 0.25)

# Per-class target counts; classes already at or above the threshold are left out
sampling_strategy = {
    class_label: desired_minority_class_count
    for class_label, class_size in class_counts.items()
    if class_size < desired_minority_class_count
}

# Over-sampler with the custom targets and a fixed seed
oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)

# Resample the training features and labels together
X_temp_resampled, y_resampled = oversampler.fit_resample(X_temp, y)

# Report the class sizes after resampling
print("Class distribution after oversampling:", sorted(Counter(y_resampled).items()))


**Neural Network Model Training and Evaluation**



In [None]:
# Inputs are the transcripts; targets are the resampled labels
X = X_temp_resampled['Transcript']
y = y_resampled

# Hold out 30% of the resampled rows for validation.
# NOTE(review): the rows were oversampled before this split, so copies of the same row
# may end up on both sides and inflate validation accuracy -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vocabulary is fitted on the training transcripts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Map each transcript to a sequence of token ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Post-pad every sequence to the longest one found in either split
max_length = max(max(map(len, X_train_seq)), max(map(len, X_test_seq)))
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Integer-encode the labels; the encoder is fitted on all resampled labels
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Embedding -> Conv1D -> global max pooling -> regularised dense head -> softmax over classes
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=200, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(units=64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(units=len(label_encoder.classes_), activation='softmax'),
])

# Integer targets, hence the sparse categorical loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

# Stop after 5 epochs without val_loss improvement and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Persist the weights of the epoch with the highest validation accuracy
checkpoint = ModelCheckpoint("best_model.h5", monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

# Fit with the held-out split as validation data
history = model.fit(
    X_train_padded,
    y_train_encoded,
    epochs=50,
    batch_size=64,
    validation_data=(X_test_padded, y_test_encoded),
    callbacks=[early_stopping, checkpoint],
)

# evaluate() returns [loss, accuracy]; keep the accuracy entry
train_accuracy = model.evaluate(X_train_padded, y_train_encoded)[1]
val_accuracy = model.evaluate(X_test_padded, y_test_encoded)[1]
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)


**Neural Network Model Training and Evaluation using k-fold cross-validation**

In [None]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of each fold's model on its own training and validation rows
train_accuracies = []
val_accuracies = []

for train_index, val_index in kfold.split(X_temp_resampled):
    # Rows and labels that belong to this fold
    X_train = X_temp_resampled.iloc[train_index]
    X_val = X_temp_resampled.iloc[val_index]
    y_train = y_resampled[train_index]
    y_val = y_resampled[val_index]

    # Vocabulary comes from this fold's training transcripts only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train['Transcript'])
    X_train_seq = tokenizer.texts_to_sequences(X_train['Transcript'])
    X_val_seq = tokenizer.texts_to_sequences(X_val['Transcript'])

    # Post-pad to the longest sequence seen in either part of the fold
    max_length = max(max(map(len, X_train_seq)), max(map(len, X_val_seq)))
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
    X_val_padded = pad_sequences(X_val_seq, maxlen=max_length, padding='post')

    # Integer-encode the labels with an encoder fitted on this fold's training labels
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    y_train_encoded = label_encoder.transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Fresh model per fold: Embedding -> Conv1D -> global max pooling -> dense head -> softmax
    model = Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=200, input_length=max_length),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(units=64, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.5),
        Dense(units=len(label_encoder.classes_), activation='softmax'),
    ])

    # Integer targets, hence the sparse categorical loss
    model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

    # Early stopping on val_loss; checkpoint on val_accuracy (every fold writes the same file name)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    checkpoint = ModelCheckpoint("best_model.h5", monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

    # Fit on the fold's training part, validating on its held-out part
    history = model.fit(
        X_train_padded,
        y_train_encoded,
        epochs=100,
        batch_size=64,
        validation_data=(X_val_padded, y_val_encoded),
        callbacks=[early_stopping, checkpoint],
    )

    # evaluate() returns [loss, accuracy]; record the accuracy entry for both parts
    train_accuracy = model.evaluate(X_train_padded, y_train_encoded)[1]
    val_accuracy = model.evaluate(X_val_padded, y_val_encoded)[1]
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)

# Mean accuracy over the five folds
print("Average Training Accuracy:", np.mean(train_accuracies))
print("Average Validation Accuracy:", np.mean(val_accuracies))


**Neural Network Model Training and Evaluation using both k-fold cross validation and Keras Tuner**

In [None]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Validation accuracy of the best tuned model of each fold
val_accuracies = []

for fold_index, (train_index, val_index) in enumerate(kfold.split(X_temp_resampled)):
    X_train, X_val = X_temp_resampled.iloc[train_index], X_temp_resampled.iloc[val_index]
    y_train, y_val = y_resampled[train_index], y_resampled[val_index]

    # Vocabulary is fitted on this fold's training transcripts only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train['Transcript'])

    # Map each transcript to a sequence of token ids
    X_train_seq = tokenizer.texts_to_sequences(X_train['Transcript'])
    X_val_seq = tokenizer.texts_to_sequences(X_val['Transcript'])

    # Post-pad to the longest sequence seen in either part of the fold
    max_length = max(max(len(seq) for seq in X_train_seq), max(len(seq) for seq in X_val_seq))
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
    X_val_padded = pad_sequences(X_val_seq, maxlen=max_length, padding='post')

    # Integer-encode the labels with an encoder fitted on this fold's training labels
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    y_train_encoded = label_encoder.transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    def build_model(hp):
        """Build and compile one candidate model for the tuner.

        Reads ``tokenizer``, ``max_length`` and ``label_encoder`` from the
        current fold. The embedding size, filter count, both dropout rates,
        the dense width and the learning rate are drawn from ``hp``.

        Args:
            hp: KerasTuner hyperparameter object used to sample the values.

        Returns:
            The compiled Sequential model.
        """
        model = Sequential()
        model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=hp.Int('embedding_dim', min_value=32, max_value=256, step=32), input_length=max_length))
        model.add(Conv1D(filters=hp.Int('filters', min_value=32, max_value=256, step=32), kernel_size=5, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)))
        # Regularised dense layer ahead of the softmax output
        model.add(Dense(units=hp.Int('dense_units', min_value=32, max_value=128, step=32), activation='relu', kernel_regularizer=l2(0.001)))
        model.add(Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)))
        model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))

        model.compile(
            # Adam is reached through the imported `tf` module; the bare name `Adam`
            # is never imported in this notebook and raised NameError
            optimizer=tf.keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])
        return model

    # RandomSearch is reached through the imported `kt` alias; the bare name
    # `RandomSearch` is never imported in this notebook and raised NameError
    tuner = kt.RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=10,
        directory=f'tuner_results_fold_{fold_index + 1}',  # Separate results folder per fold
        project_name='phoneme_tuning')

    # Run the hyperparameter search for this fold
    tuner.search(X_train_padded, y_train_encoded, epochs=50, batch_size=64, validation_data=(X_val_padded, y_val_encoded))

    # Best model found by the search
    best_model = tuner.get_best_models(num_models=1)[0]

    # evaluate() returns [loss, accuracy]; record the accuracy entry
    val_accuracy = best_model.evaluate(X_val_padded, y_val_encoded)[1]
    val_accuracies.append(val_accuracy)

# Mean validation accuracy over the five folds
print("Average Validation Accuracy:", np.mean(val_accuracies))

**Evaluating Accuracy and Predicted Labels on Test Data**

In [None]:
# Transcripts of the held-out test split
X_new = test_df['Transcript']

# Encode with the already-fitted tokenizer, then post-pad to the length used for training
X_new_seq = tokenizer.texts_to_sequences(X_new)
X_new_padded = pad_sequences(X_new_seq, maxlen=max_length, padding='post')

# Per-class scores from the trained model, one row per transcript
predictions = model.predict(X_new_padded)

# Index of the highest score per row, mapped back to the original label values
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

print(predicted_labels)

# Fraction of test rows whose predicted label equals the true label
is_correct = predicted_labels == test_df['phoneme_likelihood']
accuracy = np.mean(is_correct)

print("Accuracy on New Data:", accuracy)

**Evaluating Accuracy and Correct Predictions per Class**

In [None]:
# Transcripts of the held-out test split
X_new = test_df['Transcript']

# Encode with the already-fitted tokenizer, then post-pad to the length used for training
X_new_seq = tokenizer.texts_to_sequences(X_new)
X_new_padded = pad_sequences(X_new_seq, maxlen=max_length, padding='post')

# Per-class scores from the trained model, one row per transcript
predictions = model.predict(X_new_padded)

# Index of the highest score per row, mapped back to the original label values
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# True labels as a plain array so positional indices line up with `predicted_labels`
y_true_test = test_df['phoneme_likelihood'].to_numpy()

# accuracy_score takes (y_true, y_pred), the same order the other metric calls use
accuracy = accuracy_score(y_true_test, predicted_labels)
print("Accuracy on New Data:", accuracy)

# Re-normalise each row to sum to 1 and clip away exact 0/1 so log2 stays finite
class_probabilities = predictions / np.sum(predictions, axis=1, keepdims=True)
class_probabilities = np.clip(class_probabilities, 1e-10, 1.0 - 1e-10)
entropy_per_sample = -np.sum(class_probabilities * np.log2(class_probabilities), axis=1)

# Divide by the maximum possible entropy (log2 of the class count) to scale into [0, 1]
normalized_entropy = entropy_per_sample / np.log2(len(label_encoder.classes_))

# Per-class results, all computed from one set of row indices per class
class_labels = label_encoder.classes_
average_normalized_entropy_per_class = {}
class_accuracy = {}
class_correct_predictions = {}

for label in class_labels:
    # Positions of the test rows whose true label is this class
    indices_for_label = np.where(y_true_test == label)[0]
    total_samples_for_label = len(indices_for_label)

    correct_predictions_for_label = np.sum(predicted_labels[indices_for_label] == label)
    class_correct_predictions[label] = correct_predictions_for_label

    if total_samples_for_label == 0:
        # Class absent from the test split: mean entropy and accuracy are undefined,
        # so record NaN explicitly instead of triggering empty-slice / 0-by-0 warnings
        average_normalized_entropy_per_class[label] = np.nan
        class_accuracy[label] = np.nan
        continue

    average_normalized_entropy_per_class[label] = np.mean(normalized_entropy[indices_for_label])
    class_accuracy[label] = correct_predictions_for_label / total_samples_for_label

print("Average Normalized Uncertainty per Class Label:")
for label, uncertainty in average_normalized_entropy_per_class.items():
    print(f"{label}: {uncertainty:.4f}")

print("Accuracy and Correct Predictions per Class:")
for label, acc in class_accuracy.items():
    correct_preds = class_correct_predictions[label]
    print(f"{label}: Accuracy = {acc:.4f}, Correct Predictions = {correct_preds}")

**Precision, Recall, and F1-Score on the Test Data**

In [None]:
# Index of the highest score per row, mapped back to the original label values
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

# Ground-truth labels of the test split
true_labels = test_df['phoneme_likelihood']

# All three scores are averaged across classes with average='weighted'
averaging = {'average': 'weighted'}
precision = precision_score(true_labels, predicted_labels, **averaging)
recall = recall_score(true_labels, predicted_labels, **averaging)
f1 = f1_score(true_labels, predicted_labels, **averaging)

# Report the scores
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)