<a href="https://colab.research.google.com/github/hassssan051/portrait-video-synthesis/blob/audio-to-descriptor-pred/prediction/NN_First_Frame_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the archives and pickles referenced below under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import zipfile
from tqdm import tqdm
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

import torch.nn.functional as F

In [None]:
# Unpack the RAVDESS MFCC archive from Drive into a local working folder.
datasetPath = 'DatasetForLSTM'          # extraction target, relative to the current directory
Zipped_inside_folder = 'RAVDESS_MFCC'   # archive stem; reused later to build the CSV paths
archive_path = 'drive/MyDrive/' + Zipped_inside_folder + '.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(datasetPath)


In [None]:
from collections import defaultdict
import pickle


# Build true_descriptors: video name -> array with that video's frame descriptors
# stacked in ascending frame-number order.

clusters_info = 'Sawaiz_2/pkl_for_lstm_encoded/17_53_50_800'
file_path = '/content/drive/MyDrive/'+clusters_info+'/live_portrait_descriptors_all_encoder.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open(file_path, 'rb') as file:
    data = pickle.load(file)

video_dict = defaultdict(list)

# Group (frame_number, descriptor) pairs under the video they belong to.
for key, value in data.items():
    parts = key.split('/')
    if 'M' in key:
        # MEAD-style key: the video name is built from path components 0, 2, 3 and 4;
        # the frame number is the last "_"-separated token of the file stem.
        video_name = "__".join((parts[0], parts[2], parts[3], parts[4]))
        frame_number = int(parts[-1].split(".")[0].split("_")[-1])
    else:
        # RAVDESS-style key: component 1 is the video name (e.g. '02-01-01-01-02-02-16'),
        # component 2 is '<frame number>.<extension>'.
        video_name = parts[1]
        frame_number = int(parts[2].split('.')[0])
    video_dict[video_name].append((frame_number, value))


# Order each video's frames by frame number, drop the numbers and stack the descriptors.
final_video_dict = {
    name: np.vstack([descriptor for _, descriptor in sorted(frames, key=lambda pair: pair[0])])
    for name, frames in video_dict.items()
}
true_descriptors = final_video_dict
videos_list = list(true_descriptors)
print(len(videos_list))

9282


In [None]:
# Load the flat-clustering artefacts stored under `clusters_info`.
clusters_dir = '/content/drive/MyDrive/'+clusters_info

# Cluster representatives, looked up by cluster id in a later cell.
file_path = clusters_dir+'/averaged_descriptors_encoded.pkl'
with open(file_path, 'rb') as clusters_file:
    clusters_data = pickle.load(clusters_file)

# Actual labels for the LP: per video, entries whose first item is a frame file name
# and whose second item is the cluster id (reshaped in the next cell).
file_path = clusters_dir+'/frame_to_cluster_mapping.pkl'
with open(file_path, 'rb') as mapping_file:
    frames_data = pickle.load(mapping_file)

In [None]:
def _frame_order(entry):
    """Sort key: numeric frame index parsed from the frame file name stored in entry[0]."""
    frame_name = entry[0]
    # Names containing '_' carry the index in their last '_'-separated token.
    token = frame_name.split('_')[-1] if '_' in frame_name else frame_name
    return int(token.split('.')[0])


frames_data_new = {}
for key, entries in frames_data.items():
    # Keep only the cluster id (entry[1]) of each frame, in frame order.
    cluster_ids = [entry[1] for entry in sorted(entries, key=_frame_order)]
    if 'M' in key:
        # Same video-name convention as used for the descriptor dictionary.
        parts = key.split("/")
        frames_data_new["__".join((parts[0], parts[2], parts[3], parts[4]))] = cluster_ids
    else:
        frames_data_new[key] = cluster_ids
frames_data = frames_data_new
# frames_data now maps video name -> list of cluster ids of its frames.


In [None]:
# Number of videos that have a frame -> cluster mapping.
print(len(frames_data))

9282


In [None]:
# Statistics over the cluster id of each video's first frame.
frames_count = {}          # first-frame cluster id -> number of videos starting in that cluster
first_frame_clusters = {}  # distinct first-frame cluster ids (dict used as a set)
only_mead = {}             # first-frame cluster ids of videos whose name contains 'M'
only_ravdess = {}          # first-frame cluster ids of all other videos
for video_key, cluster_ids in frames_data.items():
    first_cluster = cluster_ids[0]
    first_frame_clusters[first_cluster] = 1
    per_dataset = only_mead if 'M' in video_key else only_ravdess
    per_dataset[first_cluster] = 1
    frames_count[first_cluster] = frames_count.get(first_cluster, 0) + 1

print(len(first_frame_clusters))
print(len(only_mead))
print(len(only_ravdess))
# Mean number of videos per first-frame cluster.
vals = sorted(frames_count.values(), reverse=True)
valsnp = np.array(vals)
print(np.mean(valsnp))


760
727
395
12.213157894736842


In [None]:
number_of_clusters = 800  # presumably matches the '_800' suffix of clusters_info — confirm
stacked_descriptors = []
for cluster_id in range(number_of_clusters):
    stacked_descriptors.append(clusters_data[cluster_id])
# Numpy array of cluster descriptors, stacked in cluster-id order.
clusters_descriptors = np.vstack(stacked_descriptors)

In [None]:
# Video name -> stacked cluster representatives of the clusters its frames are mapped to
# (one stacked entry per frame, in frame order).

clusters_rep_as_ground_truth_for_a_video = {
    video: np.vstack([clusters_descriptors[cluster_id] for cluster_id in cluster_ids])
    for video, cluster_ids in frames_data.items()
}

In [None]:
# Load the hierarchical clustering artefacts.
# NOTE: this cell rebinds clusters_data, clusters_descriptors and frames_data, replacing the
# flat-clustering versions built by the earlier cells.
file_path = '/content/drive/MyDrive/'+clusters_info+'/cluster_rep_clusters4_level1_clusters24_level2.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open(file_path, 'rb') as file:
    clusters_data = pickle.load(file)

# Actual labels for the LP
file_path = '/content/drive/MyDrive/'+clusters_info+'/frame_to_cluster_clusters4_level1_clusters24_level2.pkl'
with open(file_path, 'rb') as file:
    frames_data_raw = pickle.load(file)

cluster_level = 4
frames_to_clusters_indices = {}  # video name -> [(frame_number, cluster_name), ...]
clusters_indices = {}            # cluster_name -> cluster_name, in first-seen order
for key, value in frames_data_raw.items():
    # Split the key to extract video name and frame number
    parts = key.split('/')
    if 'M' not in key:  # For Ravdess data
        video_name = parts[1]  # e.g. '02-01-01-01-02-02-16'
        frame_number = int(parts[2].split('.')[0])
    else:  # for MEAD
        video_name = parts[0] + "__" + parts[2] + "__" + parts[3] + "__" + parts[4]
        frame_number = int(parts[-1].split(".")[0].split("_")[-1])

    # The previous try / bare-except wrapper ran the very same statements in both branches,
    # so a failure appended the frame twice before re-raising. Do the work once and let any
    # error surface directly.
    cluster_name = value
    frames_to_clusters_indices.setdefault(video_name, []).append((frame_number, cluster_name))
    clusters_indices[cluster_name] = cluster_name

# Representatives of every cluster that is actually used, in first-seen order.
clusters_descriptors = np.vstack([clusters_data[name] for name in clusters_indices])
idx = len(clusters_indices)  # number of distinct clusters (kept for any later cell that reads it)

# Sort frames for each video by frame number and keep only their cluster names.
frames_data = {}
for video_name, frames in frames_to_clusters_indices.items():
    sorted_frames = sorted(frames, key=lambda x: x[0])
    frames_data[video_name] = [clusters_indices[cluster_name] for _, cluster_name in sorted_frames]
# Here, frames_data is a dictionary where key is video name and value is the list of cluster
# ids of its frames. clusters_rep_as_ground_truth_for_a_video is intentionally not rebuilt here.

In [None]:
def create_hierarchical_dict(clusters_data, level_number):
    """
    Create a hierarchical dictionary of clusters and their children up to the specified level.

    Cluster names are strings starting with "Cluster_"; every "." in a name adds one level of
    depth, so a level-1 cluster has no dot and a level-k cluster has k - 1 dots.

    Args:
        clusters_data (dict): Dictionary where keys are cluster names and values are their representatives.
        level_number (int): Specifies the depth of hierarchy to include in the output.

    Returns:
        dict: Contains
            - "Level 1" -> {name: representative} of all top-level clusters,
            - "Level k" -> {name: representative} for 2 <= k < level_number,
            - parent cluster name -> {child name: representative} for every parent on
              levels 1 .. level_number - 1 that has at least one direct child.
    """
    hierarchical_dict = {}

    # Step 1: collect the clusters of every level whose children have to be expanded.
    hierarchical_dict["Level 1"] = {
        name: rep
        for name, rep in clusters_data.items()
        if name.startswith("Cluster_") and "." not in name
    }
    for level in range(2, level_number):
        # A level-k name has k - 1 dots. Matching on k dots (as before) stored the clusters of
        # the next level under this key, so their real parents were never expanded.
        hierarchical_dict[f"Level {level}"] = {
            name: rep
            for name, rep in clusters_data.items()
            if name.startswith("Cluster_") and name.count('.') == level - 1
        }

    # Step 2: attach the direct children of every cluster on those levels.
    def extract_children(parent_cluster):
        """Extract direct child clusters of a given parent cluster."""
        parent_prefix = parent_cluster + "."
        child_depth = parent_cluster.count('.') + 1
        return {
            name: rep
            for name, rep in clusters_data.items()
            if name.startswith(parent_prefix) and name.count('.') == child_depth
        }

    for current_level in range(1, level_number):
        current_clusters = hierarchical_dict.get(f"Level {current_level}", {})
        for parent_cluster in current_clusters:
            children = extract_children(parent_cluster)
            if children:
                hierarchical_dict[parent_cluster] = children

    return hierarchical_dict

# Two-level view: the top-level clusters plus the direct children of each of them.
hierarchical_cluster_data = create_hierarchical_dict(clusters_data, level_number=2)

In [None]:
def extract_wav2vec_features_and_labels_from_csv(csv_path, scaler=None):
    """Load the audio features (MFCC/Wav2Vec) of one video together with its frame descriptors.

    Args:
        csv_path: Path to a header-less CSV; the file name without ".csv" is the video name.
        scaler: Optional object with a ``transform`` method applied to the feature matrix.

    Returns:
        (features, labels): ``features`` is the CSV content without its last row as float32
        (scaled when ``scaler`` is given); ``labels`` is ``true_descriptors[video_name]``.
    """
    video_name = csv_path.replace(".csv", '').split("/")[-1]
    table = pd.read_csv(csv_path, header=None)
    # The final CSV row is always dropped.
    features = table.iloc[:-1, :].values.astype(np.float32)
    if scaler is not None:
        features = scaler.transform(features)

    return features, true_descriptors[video_name]

def load_data_from_directory(directory_path):
    """Return the paths of all entries directly inside ``directory_path`` that end in ".csv"."""
    csv_paths = []
    for entry in os.listdir(directory_path):
        if entry.endswith(".csv"):
            csv_paths.append(os.path.join(directory_path, entry))
    return csv_paths

def collate_fn(batch):
    """Zero-pad the features and the (regression) labels of a batch to a common length.

    Args:
        batch: Sequence of samples; sample[0] is the feature tensor, sample[1] the label tensor.

    Returns:
        (features_padded, labels_padded), both batch-first.
    """
    feature_seqs, label_seqs = [], []
    for sample in batch:
        feature_seqs.append(sample[0])
        label_seqs.append(sample[1])

    pad = torch.nn.utils.rnn.pad_sequence
    return pad(feature_seqs, batch_first=True), pad(label_seqs, batch_first=True)

first_frames = 3  # number of leading audio-feature rows that form one model input
class SlidingWindowAudioDataset(Dataset):
    """First-frame dataset: one (input vector, target) pair per video.

    The input is the first ``first_frames`` feature rows of the video flattened into one
    vector; the target is the first entry of the labels returned by
    ``extract_wav2vec_features_and_labels_from_csv``.
    """

    def __init__(self, video_paths, num_clusters=500, scaler=None):
        """
        Args:
        - video_paths: List of paths to video CSVs.
        - num_clusters: Unused; kept so existing calls keep working.
        - scaler: Fitted scaler used to normalize the features (optional).
        """
        self.video_paths = video_paths
        self.scaler = scaler
        self.data = []    # one flattened input vector per video
        self.labels = []  # the matching first-frame target per video
        self._prepare_data()

    def _prepare_data(self):
        """Read every video once and cache its first-frame sample."""
        for video_path in self.video_paths:
            # The extractor already applies self.scaler. The previous version transformed the
            # result a second time here, i.e. scaled the features twice, which also disagreed
            # with the single scaling done by the per-video accuracy dataset.
            # NOTE(review): a checkpoint trained on the double-scaled inputs needs retraining.
            features, labels = extract_wav2vec_features_and_labels_from_csv(video_path, self.scaler)

            # For first frame only
            self.data.append(features[:first_frames].flatten())
            self.labels.append(labels[0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.data[idx], dtype=torch.float32),    # flattened input features
            torch.tensor(self.labels[idx], dtype=torch.float32),  # For regression
        )


In [None]:
import joblib

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Placeholder instance; it is replaced right below by the fitted scaler stored on Drive.
scaler = MinMaxScaler()
scaler_filename = "/content/drive/MyDrive/Sawaiz_2/results_pkl_videos/scaler_MEAD2.pkl"
scaler = joblib.load(scaler_filename)
print("Scaler loaded successfully!")

# Locations of the per-video audio-feature CSVs.
mead_mfcc_path = "/content/drive/MyDrive/MEAD_MFCC/"
dataset_path = "DatasetForLSTM/"+ Zipped_inside_folder

# One CSV path per video: names containing 'M' are read from the MEAD folder,
# everything else from the extracted archive.
all_video_paths = [
    mead_mfcc_path+video+".csv" if 'M' in video
    else "DatasetForLSTM/"+Zipped_inside_folder+"/"+video+".csv"
    for video in videos_list
]

# Fixed seed keeps the 95/5 split identical across runs.
train_video_paths, test_video_paths = train_test_split(all_video_paths, test_size=0.05, random_state=42)
# for video_path in train_video_paths:
#     features, _ = extract_wav2vec_features_and_labels_from_csv(video_path)
    #scaler.partial_fit(features)



Using device: cpu
Scaler loaded successfully!


In [None]:
# Mini-batch size shared by both loaders.
batch_size = 32

# First-frame datasets for the two splits.
train_dataset = SlidingWindowAudioDataset(video_paths=train_video_paths, scaler=scaler)
test_dataset = SlidingWindowAudioDataset(video_paths=test_video_paths, scaler=scaler)

# Only the training loader shuffles its samples.
loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
# Number of mini-batches the training loader yields per epoch.
print(len(train_loader))

17037


# NN for Regression

In [None]:

class MultiOutputNN(nn.Module):
    """Fully connected network mapping an input vector to an N-dimensional output vector.

    Layer widths: input_dim -> 64 -> 128 -> 256 -> 128 -> 64 -> 32 -> output_dim, with ReLU
    after every layer except the last.
    """

    def __init__(self, input_dim, output_dim):
        """
        Args:
        - input_dim: Number of input features.
        - output_dim: Size of the output vector (N).
        """
        super().__init__()

        # The attribute names fc1..fc7 are the state_dict keys of saved checkpoints, so the
        # names (and their creation order) must stay as they are.
        widths = [input_dim, 64, 128, 256, 128, 64, 32, output_dim]
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """
        Forward pass.

        Args:
        - x: Input tensor of shape (batch_size, input_dim)

        Returns:
        - output: Tensor of shape (batch_size, output_dim)
        """
        hidden = x
        for position in range(1, 7):
            hidden = torch.relu(getattr(self, f"fc{position}")(hidden))
        return self.fc7(hidden)  # output layer, no activation

# Input: first_frames feature rows of presumably 28 values each (TODO confirm); output: 16 values.
model = MultiOutputNN(input_dim=28*first_frames, output_dim=16).to(device)
checkpoint_path = '/content/drive/MyDrive/Sawaiz_2/saved_models/NN_firstframe__first3frames_GT:frames_800_hierarchical_clusters.pth'
# Weights are deserialised onto the CPU first and moved to `device` afterwards.
pretrained_weights = torch.load(checkpoint_path, map_location=torch.device('cpu'), weights_only=True)
model.load_state_dict(pretrained_weights)
model.to(device)


MultiOutputNN(
  (fc1): Linear(in_features=84, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=128, bias=True)
  (fc5): Linear(in_features=128, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=32, bias=True)
  (fc7): Linear(in_features=32, out_features=16, bias=True)
)

In [None]:


def train(model, train_loader, optimizer, device, alpha=0.5, beta=0.5):
    """Run one training epoch with an MSE objective.

    Args:
        model: Network to optimise (switched to train mode).
        train_loader: Sized iterable of (features, labels) batches.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.
        alpha, beta: Accepted for call compatibility; not used.

    Returns:
        Mean of the per-batch MSE losses.
    """
    model.train()
    batch_total = len(train_loader)
    running_mse = 0
    bar = tqdm(train_loader, desc="Training", leave=False)

    for step, (features, labels) in enumerate(bar, start=1):
        features = features.to(device)
        labels = labels.to(device)

        outputs = model(features)
        if outputs.shape[1] != labels.shape[1]:
            # Drop the last entry along dim 1; the three-index slice needs 3-D outputs.
            outputs = outputs[:, :-1, :]

        mse_loss = F.mse_loss(outputs, labels, reduction='mean')

        optimizer.zero_grad()
        mse_loss.backward()
        optimizer.step()

        running_mse += mse_loss.item()
        # Show the running average of the batch losses.
        bar.set_postfix({"Average MSE": running_mse / step})

    avg_mse_loss = running_mse / batch_total

    print(f"Training Epoch Average MSE Loss: {avg_mse_loss}")
    return avg_mse_loss

def test(model, test_loader, device, alpha=0.5, beta=0.5):
    model.eval()
    cumulative_mse_loss = 0  # MSE loss for epoch
    num_batches = len(test_loader)
    progress_bar = tqdm(test_loader, desc="Testing", leave=False)

    with torch.no_grad():
        for i, (features, labels) in enumerate(progress_bar):
            features, labels = features.to(device), labels.to(device)

            # Forward pass
            outputs = model(features)
            if outputs.shape[1] != labels.shape[1]:
              outputs = outputs[:, :-1, :]

            # Compute MSE loss
            mse_loss = F.mse_loss(outputs, labels, reduction = 'mean')


            cumulative_mse_loss += mse_loss.item()


            # Update progress bar with average batch loss so far
            progress_bar.set_postfix({

                "Average MSE": cumulative_mse_loss / (i + 1)

            })

    # Calculate average loss for epoch
    avg_mse_loss = cumulative_mse_loss / num_batches
    print(f"Testing Epoch Average MSE Loss: {avg_mse_loss}", sep=" ")

    return avg_mse_loss


# For Classification

In [None]:
def train(model, train_loader, optimizer, device, alpha=0.5, beta=0.5):
    """Run one training epoch with a cross-entropy objective (classification).

    Args:
        model: Network producing one score per class (switched to train mode).
        train_loader: Sized iterable of (features, labels) batches; labels are cast to long.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.
        alpha, beta: Accepted for call compatibility; not used.

    Returns:
        (mean per-batch loss, accuracy in percent over all samples of the epoch).
    """
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    batch_total = len(train_loader)
    running_loss = 0
    hits = 0   # correctly classified samples so far
    seen = 0   # samples processed so far
    bar = tqdm(train_loader, desc="Training", leave=False)

    for step, (features, labels) in enumerate(bar, start=1):
        labels = labels.long()
        features = features.to(device)
        labels = labels.to(device)

        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy uses the scores computed before this step's parameter update.
        predicted = torch.max(outputs, 1).indices
        hits += (predicted == labels).sum().item()
        seen += labels.size(0)

        running_loss += loss.item()

        bar.set_postfix({
            "Average Loss": running_loss / step,
            "Accuracy": 100. * hits / seen
        })

    avg_loss = running_loss / batch_total
    accuracy = 100. * hits / seen

    print(f"Training Epoch Average Loss: {avg_loss}, Accuracy: {accuracy}%")
    return avg_loss, accuracy

def test(model, test_loader, device, alpha=0.5, beta=0.5):
    model.eval()
    criterion = torch.nn.CrossEntropyLoss() #For classification
    cumulative_loss = 0  # Cumulative loss for the epoch
    correct_predictions = 0  # For accuracy calculation
    total_predictions = 0  # For accuracy calculation
    num_batches = len(test_loader)
    progress_bar = tqdm(test_loader, desc="Testing", leave=False)

    with torch.no_grad():
        for i, (features, labels) in enumerate(progress_bar):
            labels = labels.long()
            features, labels = features.to(device), labels.to(device)

            # Forward pass
            outputs = model(features)

            # Compute the loss
            loss = criterion(outputs, labels)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)  # Get predicted class
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

            # Accumulate the loss
            cumulative_loss += loss.item()

            # Update progress bar with average batch loss and accuracy so far
            progress_bar.set_postfix({
                "Average Loss": cumulative_loss / (i + 1),
                "Accuracy": 100. * correct_predictions / total_predictions
            })

    # Calculate average loss and accuracy for the epoch
    avg_loss = cumulative_loss / num_batches
    accuracy = 100. * correct_predictions / total_predictions

    print(f"Testing Epoch Average Loss: {avg_loss}, Accuracy: {accuracy}%")
    return avg_loss, accuracy



# Cosine-similarity loss only (redefines `train`/`test`)

In [None]:



def train(model, train_loader, optimizer, device, alpha=0.5, beta=0.5):
    """Run one training epoch with a cosine objective: 1 - mean cosine similarity.

    Args:
        model: Network to optimise (switched to train mode).
        train_loader: Sized iterable of (features, labels) batches.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches are moved to.
        alpha, beta: Accepted for call compatibility; not used.

    Returns:
        Mean of the per-batch cosine losses.
    """
    model.train()
    batch_total = len(train_loader)
    running_cosine = 0
    bar = tqdm(train_loader, desc="Training", leave=False)

    for step, (features, labels) in enumerate(bar, start=1):
        features = features.to(device)
        labels = labels.to(device)

        outputs = model(features)
        if outputs.shape[1] != labels.shape[1]:
            # Drop the last entry along dim 1; the three-index slice needs 3-D outputs.
            outputs = outputs[:, :-1, :]

        # Similarity is taken along the last dimension and averaged over the rest.
        similarity = F.cosine_similarity(outputs, labels, dim=-1)
        cosine_loss = 1 - similarity.mean()

        optimizer.zero_grad()
        cosine_loss.backward()
        optimizer.step()

        running_cosine += cosine_loss.item()
        bar.set_postfix({"Average COSINE": running_cosine / step})

    avg_cosine_loss = running_cosine / batch_total

    print(f"Training Epoch Average COSINE Loss: {avg_cosine_loss}")
    return avg_cosine_loss

def test(model, test_loader, device, alpha=0.5, beta=0.5):
    model.eval()
    cumulative_cosine_loss = 0  # MSE loss for epoch
    num_batches = len(test_loader)
    progress_bar = tqdm(test_loader, desc="Testing", leave=False)

    with torch.no_grad():
        for i, (features, labels) in enumerate(progress_bar):
            features, labels = features.to(device), labels.to(device)

            # Forward pass
            outputs = model(features)
            if outputs.shape[1] != labels.shape[1]:
              outputs = outputs[:, :-1, :]

            # Compute Cosine Similarity loss
            cosine_loss = 1 - F.cosine_similarity(outputs, labels, dim=-1).mean()


            cumulative_cosine_loss += cosine_loss.item()


            # Update progress bar with average batch loss so far
            progress_bar.set_postfix({

                "Average COSINE": cumulative_cosine_loss / (i + 1)

            })

    # Calculate average loss for epoch
    avg_cosine_loss = cumulative_cosine_loss / num_batches
    print(f"Testing Epoch Average COSINE Loss: {avg_cosine_loss}", sep=" ")

    return avg_cosine_loss


In [None]:
# Optimizer and learning-rate scheduler.
learning_rate = 0.001
l2_penalty = 1e-5
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=l2_penalty)

# Cuts the learning rate by 10x after 5 scheduler steps without improvement of the monitored
# value. It only acts when scheduler.step(<loss>) is called; that call is currently commented
# out in the training cell.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
    verbose=True,
)




In [None]:
epochs = 20  # number of full passes over train_loader

# model.load_state_dict(torch.load('drive/MyDrive/LSTM_Params/lstm_predicting_LP_descriptors.pth', weights_only=True))
model.to(device)
# Forwarded to train/test, which accept but do not use alpha/beta.
loss_weights = {"alpha": 1, "beta": 0.001}
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train(model, train_loader, optimizer, device, **loss_weights)
    test_loss = test(model, test_loader, device, **loss_weights)

    #Step the scheduler with the test loss
    # scheduler.step(test_loss)

#     if test_loss < best_loss:
#         best_loss = test_loss
#         torch.save(model.state_dict(), 'drive/MyDrive/LSTM_Params/lstm_2_predicting_descriptors_best_model.pth')
#         print(f"New best model saved with loss: {best_loss:.2f}")

#     if epoch % 5 == 0:
#         torch.save(model.state_dict(), f'drive/MyDrive/LSTM_Params/lstm_2_predicting_descriptors_epoch{epoch}.pth')

In [None]:
#torch.save(model.state_dict(), '/content/drive/MyDrive/Sawaiz_2/saved_models/NN_firstframe__usingfirst3frames_800_clusters.pth')


In [None]:
class PerVideoPerFrameDatasetForAccuracy(Dataset):
    """Per-video evaluation dataset: one item per video holding its first-frame sample.

    Each item is a dict with
    - "video": the video name (CSV file name without ".csv"),
    - "features": a one-element list with the flattened first ``first_frames`` feature rows
      as a float32 tensor,
    - "labels": a one-element list with the first entry of that video's cluster mapping.
    """

    def __init__(self, video_paths, frames_to_clusters_mapping, scaler=None):
        """
        Args:
        - video_paths: List of paths to video CSVs.
        - frames_to_clusters_mapping: Dict video name -> sequence of per-frame cluster labels.
        - scaler: Scaler to normalize the features (optional).
        """
        self.video_paths = video_paths
        self.scaler = scaler
        self.frames_to_clusters_mapping = frames_to_clusters_mapping

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        # Called without a scaler so the features are scaled exactly once, below.
        # The descriptors returned alongside the features are not needed here.
        features, _ = extract_wav2vec_features_and_labels_from_csv(video_path)
        video_name = video_path.replace(".csv", '').split("/")[-1]
        if self.scaler is not None:
            features = self.scaler.transform(features)

        # For first frame only
        first_frame_input = features[:first_frames].flatten()
        # Read the label from the mapping given to the constructor. The previous version read
        # the global `frames_data` instead and silently ignored the constructor argument.
        first_frame_label = self.frames_to_clusters_mapping[video_name][0]

        return {
            "video": video_name,
            "features": [torch.tensor(first_frame_input, dtype=torch.float32)],
            "labels": [first_frame_label],
        }



In [None]:
def per_video_per_frame_collate_fn(batch):
    """Unpack the first item of a batch into (video name, features, labels).

    Only ``batch[0]`` is read, so this collate function is meant for ``batch_size=1``.
    """
    sample = batch[0]
    return sample['video'], sample["features"], sample["labels"]


# Per-video evaluation datasets: both splits share the cluster mapping and the scaler.
accuracy_dataset_args = dict(frames_to_clusters_mapping=frames_data, scaler=scaler)
train_dataset_ = PerVideoPerFrameDatasetForAccuracy(train_video_paths, **accuracy_dataset_args)
test_dataset_ = PerVideoPerFrameDatasetForAccuracy(test_video_paths, **accuracy_dataset_args)

# One video per batch, in the original order.
accuracy_loader_args = dict(batch_size=1, shuffle=False, collate_fn=per_video_per_frame_collate_fn)
train_loader_ = DataLoader(train_dataset_, **accuracy_loader_args)
test_loader_ = DataLoader(test_dataset_, **accuracy_loader_args)


In [None]:
def get_predicted_labels(outputs, cluster_descriptors, device):
    """Assign every predicted descriptor to its nearest cluster descriptor (Euclidean distance).

    Args:
        outputs: Tensor iterated row by row; each row is one predicted descriptor.
        cluster_descriptors: Array-like with one cluster descriptor per row.
        device: Device used for the distance computation and for the returned tensor.

    Returns:
        Tensor holding, for each row of ``outputs``, the row index of the closest cluster.
    """
    centroids = torch.tensor(cluster_descriptors, device=device, dtype=outputs.dtype)
    nearest = [
        torch.argmin(torch.norm(centroids - frame_descriptor, dim=1)).item()
        for frame_descriptor in outputs
    ]
    return torch.tensor(nearest, device=device)

def evaluate_video_predictions(outputs, cluster_descriptors, actual_labels_names, device, cluster_nearest_neighbors):
    """Count how many predicted descriptors land in the right cluster or in one of its neighbours.

    Args:
        outputs: Predicted descriptors, passed on to ``get_predicted_labels``.
        cluster_descriptors: Cluster descriptors, passed on to ``get_predicted_labels``.
        actual_labels_names: Ground-truth cluster labels, one per prediction.
        device: Device for the label tensors.
        cluster_nearest_neighbors: Indexable by a true label; yields the labels also accepted as a match.

    Returns:
        (predicted labels as list, number of matches, number of ground-truth labels).
    """
    predicted_labels = get_predicted_labels(outputs, cluster_descriptors, device)
    actual_labels = torch.tensor(actual_labels_names, device=device)
    total_labels = actual_labels.size(0)

    matches = 0
    for position in range(total_labels):
        truth = actual_labels[position].item()
        guess = predicted_labels[position].item()
        if guess == truth:
            matches += 1
        elif guess in cluster_nearest_neighbors[truth]:
            # A neighbour of the true cluster is accepted as well.
            matches += 1

    return predicted_labels.cpu().numpy().tolist(), matches, total_labels

def evaluation_for_classification(outputs, actual_labels_names, device):
  """Count classifier predictions that hit the true cluster or one of its neighbours.

  The predicted label is the index of the highest score in each row of ``outputs``.
  Neighbours come from the module-level ``cluster_nearest_neighbors``.

  Returns:
      (predicted labels as list, number of matches, number of ground-truth labels).
  """
  predicted_labels = torch.max(outputs, 1).indices
  actual_labels = torch.tensor(actual_labels_names, device=device)
  total_labels = actual_labels.size(0)

  def _is_match(position):
      truth = actual_labels[position].item()
      guess = predicted_labels[position].item()
      return guess == truth or guess in cluster_nearest_neighbors[truth]

  matches = sum(1 for position in range(total_labels) if _is_match(position))
  return predicted_labels.cpu().numpy().tolist(), matches, total_labels

def find_closest_cluster(hierarchical_dict, vector, level_number):
    """
    Walk the cluster hierarchy greedily and return the nearest cluster name.

    Starting from the clusters stored under the "Level 1" key, the cluster whose
    representative has the smallest Euclidean distance to ``vector`` is chosen,
    and the search then continues among that cluster's children (looked up in
    ``hierarchical_dict`` under the chosen cluster's name). The walk ends when
    ``level_number`` levels have been examined or when the chosen cluster has
    no children, in which case the last chosen cluster is returned.

    Args:
        hierarchical_dict (dict): Maps "Level 1" and each cluster name to a dict
            of {child cluster name: representative}.
        vector (np.ndarray): The input vector compared against representatives.
        level_number (int): Maximum number of levels to descend.

    Returns:
        The name of the closest cluster, or None when no cluster was examined.
    """
    candidates = hierarchical_dict.get("Level 1", {})
    best_name = None

    depth = 1
    while depth <= level_number and candidates:
        # Restart the distance search at every level.
        best_distance = float("inf")
        for name, representative in candidates.items():
            gap = np.linalg.norm(vector - np.array(representative))
            if gap < best_distance:
                best_distance, best_name = gap, name

        if depth == level_number:
            # Deepest requested level reached; no need to look up children.
            break

        # Continue among the children of the winner of this level.
        candidates = hierarchical_dict.get(best_name, {})
        depth += 1

    return best_name
def evaluation_with_hierarchical_clustering(hierarchical_dict, vector, level_number, actual_label):
  """Score a single vector using the hierarchical cluster lookup.

  The predicted label is whatever ``find_closest_cluster`` returns for
  ``vector`` at ``level_number``; it is a match only when it equals
  ``actual_label``.

  Returns:
      tuple: ``([predicted_label], matched, 1)`` where ``matched`` is 1 for a
      match and 0 otherwise, and the trailing 1 is the number of samples scored.
  """
  guess = find_closest_cluster(hierarchical_dict, vector, level_number)
  hit = 1 if guess == actual_label else 0
  return [guess], hit, 1


In [None]:
import numpy as np
from scipy.spatial.distance import pdist, squareform

# How many nearest clusters to record per cluster. The evaluation helpers accept
# a prediction that lands in this list as a match, so raising the value loosens
# the accuracy criterion.
NUM_NEAREST_NEIGHBORS = 1

# Compute pairwise Euclidean distances between all cluster descriptors
pairwise_distances = squareform(pdist(clusters_descriptors, metric='euclidean'))

# Set diagonal to infinity so a cluster is never reported as its own neighbor
np.fill_diagonal(pairwise_distances, np.inf)

# For each cluster, keep the indices of its NUM_NEAREST_NEIGHBORS closest clusters
cluster_nearest_neighbors = np.argsort(pairwise_distances, axis=1)[:, :NUM_NEAREST_NEIGHBORS].tolist()




In [None]:
# model.load_state_dict(torch.load('/content/drive/MyDrive/Sawaiz_2/saved_models/NN_firstframe__usingfirst3frames_800_clusters.pth', weights_only=True)) #Reload path add
# model.to(device)

In [None]:
def _evaluate_loader(loader, results, level_number=2):
    """Run the model over every video in ``loader`` and count correct predictions.

    Args:
        loader: Iterable yielding ``(video_name, features_list, actual_labels)``.
        results: Dict updated in place with ``video_name -> predicted labels``.
        level_number: Hierarchy depth passed to
            ``evaluation_with_hierarchical_clustering``.

    Returns:
        tuple: ``(matched, total)`` counts accumulated over the whole loader.
    """
    matched_count = 0
    total_count = 0

    # Score in inference mode so dropout / batch-norm layers (if the model has
    # any) behave deterministically, then restore whatever mode was active.
    was_training = model.training
    model.eval()
    try:
        # Gradients are never used here; no_grad avoids building the autograd graph.
        with torch.no_grad():
            for video_name, features_list, actual_labels in loader:
                labels_for_video = []
                for i, features in enumerate(features_list):
                    features = features.to(device).unsqueeze(0)
                    outputs = model(features)  # Obtain model predictions directly in torch
                    # for hierarchical clustering
                    # .cpu() is required before .numpy() when the model runs on a GPU.
                    outputs = outputs.detach().cpu().numpy()
                    # NOTE(review): every frame is compared against actual_labels[0],
                    # while the regression/classification variants below use
                    # actual_labels[i] -- confirm this is intended.
                    predicted_labels, matches, total_labels = evaluation_with_hierarchical_clustering(hierarchical_cluster_data, outputs, level_number, actual_labels[0])
                    # for regression
                    #predicted_labels, matches, total_labels = evaluate_video_predictions(outputs, clusters_descriptors, [actual_labels[i]], device, cluster_nearest_neighbors)
                    # for classification
                    #predicted_labels, matches, total_labels = evaluation_for_classification(outputs, [actual_labels[i]], device)
                    labels_for_video.extend(predicted_labels)
                    matched_count += matches
                    total_count += total_labels
                results[video_name] = labels_for_video
    finally:
        model.train(was_training)

    return matched_count, total_count


# Predicted labels per video, filled by both the train and the test pass.
results = {}

matched, total = _evaluate_loader(train_loader_, results)
print("Train Accuracy : ", (matched * 100)/total, "%")

matched_test, total_test = _evaluate_loader(test_loader_, results)
print("Test Accuracy : ", (matched_test*100)/total_test, "%")

# Combined accuracy over both splits.
print("Test Accuracy (Test + Training): ", ((matched_test+matched)*100)/(total_test+total), "%")



Train Accuracy :  5.126460247249631 %
Test Accuracy :  3.6559139784946235 %
Test Accuracy (Test + Training):  5.052790346907994 %


In [None]:
'''
4 Clusters in Level 1 and 16 Clusters in Level 2: 8.98%
4 Clusters in Level 1 and 20 Clusters in Level 2: 8.75%
4 Clusters in Level 1 and 24 Clusters in Level 2: 5%

5 Clusters in Level 1 and 16 Clusters in Level 2: 8.7%
5 Clusters in Level 1 and 20 Clusters in Level 2: 4.98%
5 Clusters in Level 1 and 24 Clusters in Level 2: 6.82%
'''

'''
4 Clusters: 90%
5 Clusters: 83.3%
6 Clusters: 42%
7 Clusters: 65.6%
8 Clusters: 64.1%
9 Clusters: 66.9%
10 Clusters: 39.7%
'''

8817
