In [None]:
import torch

# Pick the compute device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the pickled model object. `map_location` keeps a checkpoint that was
# saved from a GPU loadable on a CPU-only machine.
# SECURITY: torch.load unpickles arbitrary Python objects - only load
# checkpoints from a trusted source.
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True,
# which rejects fully pickled modules; pass weights_only=False there (trusted
# files only).
model = torch.load('models/best_six.pth.tar', map_location=device)

# Put the model in evaluation mode.
model.eval()

In [None]:
import numpy as np

import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen when several libraries each bundle their own OpenMP copy.
# It is set after torch and numpy have already been imported - confirm it
# still takes effect at this point.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
def downsample_with_max_pooling(array, factor=(1, 4)):
    """Downsample ``array`` by taking the maximum over non-overlapping blocks.

    Parameters:
        array: NumPy array to pool.
        factor: Pooling size per axis, paired with the leading axes of
            ``array`` (e.g. ``(1, 4)`` pools groups of 4 along axis 1).

    Returns:
        The pooled array, with the same dtype as ``array``. A pooled axis of
        length ``n`` and factor ``f`` has length ``ceil(n / f)``; an incomplete
        trailing block is pooled over the elements that actually exist.
        When every factor is 1 the input object itself is returned (no copy).
    """
    if np.all(np.array(factor, int) == 1):
        return array

    # One strided slice per position inside a pooling block.
    sections = [
        array[tuple(np.s_[o::f] for o, f in zip(offset, factor))]
        for offset in np.ndindex(factor)
    ]

    # The all-zero offset comes first and is the longest slice on every axis,
    # so it can hold the running maximum for all the others.
    output = sections[0].copy()

    for section in sections[1:]:
        # Compare only where this section has data. Zero-padding the shorter
        # section would clamp negative tails to 0 and force a float result.
        overlap = output[tuple(slice(0, n) for n in section.shape)]
        np.maximum(overlap, section, out=overlap)

    return output

In [None]:
import librosa
import numpy as np
from numba import cuda

  
def pitch_and_intensity_from_audio(path, sr, duration):
    """Compute max-pooled pitch and RMS intensity tracks for an audio file.

    The file is loaded at sample rate ``sr``, cut into ``duration``-second
    segments with ``split_audio``, and only the first segment is analysed.

    Parameters:
        path: Audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        duration: Segment length in seconds.

    Returns:
        ``(f0, intensity)``: the YIN pitch estimate (reshaped to a single row)
        and the RMS energy, each pooled with a factor of ``(1, 4)``.
    """
    samples, sr = librosa.load(path, sr=sr)
    first_segment = split_audio(samples, sr, duration)[0]

    # Pitch search range for YIN: C2 up to C7.
    lowest_hz = librosa.note_to_hz('C2')
    highest_hz = librosa.note_to_hz('C7')
    pitch_track = librosa.yin(first_segment, fmin=lowest_hz, fmax=highest_hz, sr=sr)

    # Intensity is the frame-wise Root Mean Square energy.
    rms_track = librosa.feature.rms(y=first_segment, frame_length=1024, hop_length=512)

    pooling = (1, 4)
    pooled_pitch = downsample_with_max_pooling(pitch_track.reshape(1, -1), pooling)
    pooled_rms = downsample_with_max_pooling(rms_track, pooling)
    return pooled_pitch, pooled_rms


def add_missing_padding(audio, sr, duration):
    """Zero-pad ``audio`` at the end so it spans ``duration * sr`` samples.

    Parameters:
        audio: 1-D sample array.
        sr: Sample rate in samples per second.
        duration: Target length in seconds.

    Returns:
        A new array with trailing zeros appended when ``audio`` is shorter
        than the target; otherwise ``audio`` itself, unchanged.
    """
    missing = duration * sr - audio.shape[0]
    if not missing > 0:
        # Already long enough: hand back the very same array.
        return audio

    silence = np.zeros(missing)
    return np.hstack((audio, silence))


def split_audio(signal, sr, split_duration):
    """Cut ``signal`` into consecutive segments of ``split_duration`` seconds.

    Parameters:
        signal: 1-D sample array.
        sr: Sample rate in samples per second.
        split_duration: Segment length in seconds.

    Returns:
        An array that callers index with ``[0]`` to get the first segment.
        A signal longer than one segment is framed without overlap (hop equal
        to the frame length) and transposed; otherwise the signal is
        zero-padded to one full segment and wrapped in a single-entry array.
    """
    segment_len = split_duration * sr

    if not segment_len < len(signal):
        # At most one segment's worth of audio: pad it and wrap it.
        padded = add_missing_padding(signal, sr, split_duration)
        return np.array([padded])

    chunks = librosa.util.frame(signal, frame_length=segment_len, hop_length=segment_len)
    return chunks.T

def mfcc_from_audio_file(path, sr, duration):
    """Compute max-pooled MFCCs for the first segment of an audio file.

    Parameters:
        path: Audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        duration: Segment length in seconds used by ``split_audio``.

    Returns:
        The 128-coefficient MFCC matrix of the first segment, pooled with a
        factor of ``(1, 4)``.
    """
    samples, sr = librosa.load(path, sr=sr)
    first_segment = split_audio(samples, sr, duration)[0]
    coefficients = librosa.feature.mfcc(y=first_segment, sr=sr, n_mfcc=128)
    return downsample_with_max_pooling(coefficients, (1, 4))

def extract_mfcc(src_file_path, sample_rate, utterance_duration):
    """Build the combined feature matrix for one utterance.

    Parameters:
        src_file_path: Audio file to analyse.
        sample_rate: Sample rate used when loading the file.
        utterance_duration: Segment length in seconds.

    Returns:
        MFCC, pitch and intensity features concatenated along axis 0, in that
        order.
    """
    cepstral = mfcc_from_audio_file(src_file_path, sample_rate, utterance_duration)
    pitch, loudness = pitch_and_intensity_from_audio(
        src_file_path, sample_rate, utterance_duration
    )

    feature_rows = (cepstral, pitch, loudness)
    return np.concatenate(feature_rows, axis=0)


In [None]:
import time
import torch


# Emotion labels; the predicted class index from the model is used to look up
# the label in this list.
class_y = ['ang','hap','neu','sad']

# Feature matrix for one utterance: sample rate 32750, 6-second segment.
# Original author annotation for this file: "neu-sad".
mfcc_features3 = extract_mfcc('../../iemocap/Ses05M_impro08_M019.wav',32750, 6)#neu-sad

def prepare_input(mfcc_features):
    """Turn a feature matrix into a float32 tensor with two leading axes.

    Two singleton axes are inserted in front of the data (the original code
    labels the first one the channel dimension), so a 2-D input of shape
    ``(rows, cols)`` becomes a tensor of shape ``(1, 1, rows, cols)``.
    """
    expanded = np.expand_dims(mfcc_features, axis=(0, 1))
    return torch.tensor(expanded, dtype=torch.float32)

mfcc_input3 = prepare_input(mfcc_features3) #neu
mfcc_input3 = mfcc_input3.to(device)

model.to(device)

# Perform inference in evaluation mode and without autograd. Running the
# forward pass right after model.train() keeps train-only layers (e.g.
# dropout) active, which makes the prediction non-deterministic.
model.eval()
with torch.no_grad():
    cnn1_out, cnn2_out, cnn3_out, fl, lstm_out, l1_out, dr_out, l2_out, output = model(mfcc_input3)

# The original cell left the model in training mode. That state is restored
# here so cells that run afterwards see the same mode as before.
# NOTE(review): confirm whether later attribution cells really need training
# mode; if not, drop this line.
model.train()

predicted_class = torch.argmax(output, dim=1).item()
print(f"Predicted Class: {class_y[predicted_class]}")


In [None]:
# Add a leading singleton axis under a NEW name. Rebinding mfcc_features3 to
# its own expanded copy would add one more axis every time this cell is re-run.
ig_features = np.expand_dims(mfcc_features3, axis=0)
input_data = torch.tensor(ig_features, requires_grad=True).float().to(device)
baseline_data = torch.zeros_like(input_data)  # Baseline input: all zeros

# Compute Integrated Gradients for class index 3 (class_y[3] == 'sad').
attributions, error = integrated_gradients(model, input_data, baseline_data, target_class=3, steps=50)

# Names for the attribution plot.
# NOTE(review): 96 is hard-coded; presumably it matches one axis of the
# attribution tensor for this utterance - confirm against attributions.shape.
feature_names = [f"Feature {i+1}" for i in range(96)]

# Visualize with feature names
visualize_integrated_gradients(attributions, feature_names=feature_names, aggregation='sum')


In [None]:
def _score_for_target(model_output, target_class, output_index=-1):
    """Reduce a raw model output to the score of ``target_class``."""
    # Multi-output models return a tuple/list; pick the prediction head.
    if isinstance(model_output, (tuple, list)):
        model_output = model_output[output_index]

    # A 4D output is flattened to (batch, features) before class selection.
    if model_output.dim() == 4:
        model_output = model_output.reshape(model_output.size(0), -1)

    # Classification output: keep only the target class column.
    if model_output.dim() == 2:
        model_output = model_output[:, target_class]

    # Single value left: convert it to a Python float.
    if model_output.dim() <= 1:
        return model_output.item()
    return model_output


def _perturbation_curve(model, start, source, shape, ranked_indices, steps,
                        target_class, output_index):
    """Score the model while copying ``source`` values into ``start`` in ranked order."""
    current = start.clone()
    total = ranked_indices.numel()
    scores = []
    done = 0
    for step in range(steps + 1):
        upto = (step * total) // steps  # number of features changed so far
        if upto > done:
            # Earlier features are already in place; only write the new ones.
            chosen = ranked_indices[done:upto]
            current[chosen] = source[chosen].to(current.dtype)
            done = upto
        batch = current.view(shape).unsqueeze(0)
        scores.append(_score_for_target(model(batch), target_class, output_index))
    return scores


def compute_insertion_deletion_scores(
    model, input_tensor, attributions, target_class, baseline=None, steps=50,
    output_index=-1,
):
    """
    Compute Insertion and Deletion metrics for a set of feature attributions.

    Features are ranked by absolute attribution (largest first). Insertion
    starts from the baseline and copies in the input's values in that order;
    deletion starts from the input and overwrites values with the baseline's.
    The model is scored after each of ``steps + 1`` stages. A leading batch
    axis is added to the tensor before every model call.

    Parameters:
        model: Trained model for prediction. If it is a torch.nn.Module it is
            evaluated in eval mode with autograd disabled, and each module's
            previous train/eval flag is restored afterwards.
        input_tensor: Original input for which attributions are computed.
        attributions: Feature attributions (same number of elements as input_tensor).
        target_class: Class index for which to evaluate.
        baseline: Reference input (default is all zeros).
        steps: Number of steps for gradual insertion/deletion (must be >= 1).
        output_index: Which element to use when the model returns a
            tuple/list. Defaults to the last element.

    Returns:
        insertion_scores: List of scores during feature insertion.
        deletion_scores: List of scores during feature deletion.

    Raises:
        ValueError: If ``steps`` is smaller than 1, or if ``attributions`` or
            ``baseline`` do not have as many elements as ``input_tensor``.
    """
    if steps < 1:
        raise ValueError("steps must be a positive integer")

    # Define baseline (default: zero tensor of the same shape as input)
    if baseline is None:
        baseline = torch.zeros_like(input_tensor)

    if attributions.numel() != input_tensor.numel():
        raise ValueError("attributions must have as many elements as input_tensor")
    if baseline.numel() != input_tensor.numel():
        raise ValueError("baseline must have as many elements as input_tensor")

    # Work on detached, flattened copies so the caller's tensors stay untouched.
    flat_input = input_tensor.detach().reshape(-1)
    flat_baseline = baseline.detach().reshape(-1)

    # Rank features by absolute attribution values (descending order)
    ranked_indices = torch.argsort(-attributions.detach().reshape(-1).abs())
    ranked_indices = ranked_indices.to(flat_input.device)

    # Remember every submodule's mode so it can be restored exactly.
    is_module = isinstance(model, torch.nn.Module)
    saved_modes = [(m, m.training) for m in model.modules()] if is_module else []
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            # Insertion: start with baseline and gradually add features
            insertion_scores = _perturbation_curve(
                model, flat_baseline, flat_input, baseline.shape,
                ranked_indices, steps, target_class, output_index,
            )
            # Deletion: start with the original input and gradually remove features
            deletion_scores = _perturbation_curve(
                model, flat_input, flat_baseline, input_tensor.shape,
                ranked_indices, steps, target_class, output_index,
            )
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    return insertion_scores, deletion_scores


# Example: Compute Insertion and Deletion Scores
# target_class=3 is an index into class_y, i.e. class_y[3] == 'sad'.
insertion_scores, deletion_scores = compute_insertion_deletion_scores(
    model=model,
    input_tensor=input_data,
    attributions=attributions,
    target_class=3,  # Adjust based on your target class
    baseline=baseline_data,
    steps=50
)

# Visualization of Insertion and Deletion Scores
import matplotlib.pyplot as plt

# Single source of truth for the output file, so the message printed below
# always names the file that was actually written.
plot_path = "insertion_deletion_plot.png"

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(len(insertion_scores)), insertion_scores, label="Insertion", color="green")
ax.plot(range(len(deletion_scores)), deletion_scores, label="Deletion", color="red")
ax.set_xlabel("Steps")
ax.set_ylabel("Model Output")
ax.set_title("Insertion and Deletion Metrics")
ax.legend()
fig.tight_layout()
fig.savefig(plot_path)
plt.show()

print(f"Insertion and Deletion plot saved as '{plot_path}'.")
