<a href="https://colab.research.google.com/github/hassssan051/portrait-video-synthesis/blob/audio-to-descriptor-pred/prediction/KNN_FirstFramePredictor_MFCC_Cluster_rep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import zipfile
from tqdm import tqdm
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler

import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
import pickle


In [None]:
# Unpack the zipped RAVDESS MFCC features from Drive into a local folder.
datasetPath = 'DatasetForLSTM'           # extraction target (relative to the working directory)
Zipped_inside_folder = 'RAVDESS_MFCC'    # archive base name; also reused by later cells to build CSV paths
# NOTE: this path is relative, so it resolves against the current working directory.
zip_location = 'drive/MyDrive/'+Zipped_inside_folder+'.zip'
with zipfile.ZipFile(zip_location, 'r') as archive:
    archive.extractall(datasetPath)


In [None]:

# Build `true_descriptors`: a dictionary mapping each video name to a single
# array holding the descriptors of its frames, stacked in frame order.

clusters_info = 'Sawaiz_2/pkl_for_lstm_encoded/17_53_50_800'
file_path = '/content/drive/MyDrive/'+clusters_info+'/live_portrait_descriptors_all_encoder.pkl'

# The pickle maps a '/'-separated frame key to that frame's descriptor.
with open(file_path, 'rb') as file:
    data = pickle.load(file)

# video name -> list of (frame number, descriptor) pairs, in file order
video_dict = defaultdict(list)
for key, value in data.items():
    parts = key.split('/')
    if 'M' in key:
        # MEAD key: the video name is assembled from path components 0, 2, 3 and 4,
        # and the frame number is the last '_'-separated token of the file stem.
        video_name = "__".join((parts[0], parts[2], parts[3], parts[4]))
        frame_number = int(parts[-1].split(".")[0].split("_")[-1])
    else:
        # RAVDESS key: component 1 is the video name (e.g. '02-01-01-01-02-02-16'),
        # component 2 is '<frame number>.<ext>'.
        video_name = parts[1]
        frame_number = int(parts[2].split('.')[0])
    video_dict[video_name].append((frame_number, value))

# Order each video's frames by frame number, drop the numbers, and stack the
# descriptors into one array per video.
final_video_dict = {
    name: np.vstack([descriptor for _, descriptor in sorted(pairs, key=lambda pair: pair[0])])
    for name, pairs in video_dict.items()
}
true_descriptors = final_video_dict
videos_list = list(true_descriptors.keys())
print(len(videos_list))

9282


# For Hierarchical Clustering

In [None]:
# Load the level-1 cluster representatives and the frame -> cluster assignments,
# then derive:
#   clusters_indices      : cluster name -> dense integer id (first-seen order)
#   clusters_descriptors  : array whose row i is the representative of cluster id i
#   frames_data           : video name -> list of cluster ids of its frames, in frame order
#   clusters_rep_as_ground_truth_for_a_video
#                         : video name -> stacked cluster representatives of its frames
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
file_path = '/content/drive/MyDrive/'+clusters_info+'/cluster_rep_level1.pkl'

with open(file_path, 'rb') as file:
    clusters_data = pickle.load(file)

# Actual labels for the LP
file_path = '/content/drive/MyDrive/'+clusters_info+'/frame_to_cluster_level1.pkl'
with open(file_path, 'rb') as file:
    frames_data_raw = pickle.load(file)

cluster_level = 4  # not read anywhere in this cell; kept in case other cells use it
frames_to_clusters_indices = {}
clusters_indices = {}
for key, value in frames_data_raw.items():
    # Split the key to extract video name and frame number
    parts = key.split('/')
    if 'M' not in key:  # For Ravdess data
        video_name = parts[1]  # e.g. '02-01-01-01-02-02-16'
        frame_number = int(parts[2].split('.')[0])
    else:  # for MEAD
        video_name = parts[0] + "__" + parts[2] + "__" + parts[3] + "__" + parts[4]
        frame_number = int(parts[-1].split(".")[0].split("_")[-1])

    cluster_name = value
    frames_to_clusters_indices.setdefault(video_name, []).append((frame_number, cluster_name))
    # Give each cluster a dense id the first time it is seen (ids follow first-seen order).
    clusters_indices.setdefault(cluster_name, len(clusters_indices))

# Row i of clusters_descriptors is the representative of the cluster with id i
# (dicts preserve insertion order, which is the id order).
clusters_descriptors = np.vstack([clusters_data[name] for name in clusters_indices])

# Sort frames for each video by frame number and replace cluster names by their ids.
frames_data = {}
for video_name, frames in frames_to_clusters_indices.items():
    sorted_frames = sorted(frames, key=lambda x: x[0])
    frames_data[video_name] = [clusters_indices[name] for _, name in sorted_frames]
# Here, frames_data is a dictionary where key is video name and value is list of cluster ids of its frames.

# Here, key is a video name and value is the stack of cluster representatives of
# the clusters its frames are mapped to (one row per frame).
clusters_rep_as_ground_truth_for_a_video = {
    video: clusters_descriptors[np.asarray(cluster_ids, dtype=np.intp)]
    for video, cluster_ids in frames_data.items()
}

In [None]:
mead_mfcc_path = "/content/drive/MyDrive/MEAD_MFCC/"
# Load and prepare data
dataset_path = "DatasetForLSTM/"+ Zipped_inside_folder


# Collect the MFCC rows of all frames assigned to each cluster, then average
# them to obtain one MFCC representative per cluster id.
cluster_mfcc_frames = {cluster_id: [] for cluster_id in clusters_indices.values()}

for video, reps in frames_data.items():
    if 'M' in video:  # MEAD video names contain 'M'
        video_path = mead_mfcc_path+video+".csv"
    else:
        video_path = dataset_path+"/"+video+".csv"
    df = pd.read_csv(video_path, header=None)
    features = df.iloc[:, :].values.astype(np.float32)
    # zip stops at the shorter sequence, so frames without a matching CSV row
    # are skipped explicitly instead of being hidden behind a bare `except`.
    for cluster_rep, frame_mfcc in zip(reps, features):
        cluster_mfcc_frames[cluster_rep].append(frame_mfcc)

# Keys keep the order of clusters_indices, so position i corresponds to cluster id i.
cluster_mfcc_rep = {}
for cluster_id, rows in cluster_mfcc_frames.items():
    if not rows:
        raise ValueError(f"cluster {cluster_id} has no MFCC frames; cannot compute its mean")
    cluster_mfcc_rep[cluster_id] = np.mean(np.vstack(rows), axis=0)

In [None]:
# Number of nearest cluster MFCC means retrieved per query.
K = 3
# Cluster ids, in the same order as the rows of `vectors`.
clusters_labels_ = list(cluster_mfcc_rep)
# One mean MFCC vector per cluster, stacked row-wise.
vectors = np.array([cluster_mfcc_rep[label] for label in clusters_labels_])

# Fit a Euclidean nearest-neighbour index over the cluster MFCC means.
knn = NearestNeighbors(metric='euclidean', n_neighbors=K)
knn.fit(vectors)

def KNNPrediction(test_vector, knn, labels, cluster_mfcc_rep):
    """Look up the nearest clusters for a query MFCC vector.

    Args:
        test_vector: query passed straight to ``knn.kneighbors``.
        knn: fitted object exposing ``kneighbors(X) -> (distances, indices)``.
        labels: sequence where ``labels[i]`` is the cluster label of the i-th
            vector the index was fitted on.
        cluster_mfcc_rep: mapping from cluster label to its MFCC representative.

    Returns:
        ``(closest_neighbors, closest_neighbors_mfccs)``: the labels of the
        neighbours, and the MFCC representative of each of those labels, both
        in the order returned by ``knn.kneighbors``.
    """
    _, indices = knn.kneighbors(test_vector)
    closest_neighbors = [labels[idx] for idx in indices.flatten()]
    # Look representatives up by label, not by row index: the two only coincide
    # when the labels happen to be 0..n-1 in fit order.
    closest_neighbors_mfccs = [cluster_mfcc_rep[label] for label in closest_neighbors]
    return closest_neighbors, closest_neighbors_mfccs


In [None]:
def get_predicted_labels(outputs, cluster_descriptors, device):
    """Return the index of the cluster descriptor closest to the last row of ``outputs``.

    Every row of ``outputs`` is compared (Euclidean distance) against every row
    of ``cluster_descriptors``, but only the result for the final row is
    returned, as a plain Python int.

    NOTE(review): despite the plural name, no per-row list is returned. The
    evaluation cell passes a single-row tensor and compares the result to a
    scalar label, so the scalar return is what callers currently rely on.

    Args:
        outputs: iterable of descriptor tensors (e.g. a 2-D tensor, one row per frame).
        cluster_descriptors: array-like of cluster descriptors, one per row.
        device: torch device on which the distance computation runs.
    """
    centroids = torch.tensor(cluster_descriptors, device=device, dtype=torch.float32)

    for row in outputs:
        # Distance from this row to every cluster descriptor; keep the closest index.
        gaps = torch.norm(centroids - row, dim=1)
        nearest = torch.argmin(gaps)

    # 0-d tensor -> Python int
    return nearest.detach().cpu().numpy().tolist()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

mead_mfcc_path = "/content/drive/MyDrive/MEAD_MFCC/"
# Load and prepare data
dataset_path = "DatasetForLSTM/"+ Zipped_inside_folder

# Map every video name to the CSV holding its MFCC features: names containing
# 'M' are read from the MEAD folder on Drive, all others from the extracted archive.
all_video_paths = [
    mead_mfcc_path+video+".csv" if 'M' in video
    else "DatasetForLSTM/"+Zipped_inside_folder+"/"+video+".csv"
    for video in videos_list
]

# Fixed seed so the 95/5 train/test split is reproducible.
train_video_paths, test_video_paths = train_test_split(all_video_paths, test_size=0.05, random_state=42)

Using device: cpu


In [None]:
def _first_frame_accuracy(video_paths):
    """Count how often the first frame's cluster is predicted correctly.

    For each CSV in ``video_paths`` the first MFCC row is used as the query.
    With ``K == 1`` the nearest cluster is the prediction; otherwise the MFCC
    means of the K nearest clusters are averaged and the cluster closest to
    that average is the prediction.

    Returns:
        ``(matched, total)``: number of correct predictions and number of videos.
    """
    matched = 0
    total = 0
    for video_path in video_paths:
        # Recover this video's name from its CSV path (paths are built as
        # '<dir>/<video name>.csv') so the ground truth belongs to the video
        # being evaluated rather than to a leftover loop variable.
        video_name = os.path.basename(video_path)[:-len(".csv")]
        original_label = frames_data[video_name][0]

        df = pd.read_csv(video_path, header=None)
        features = df.iloc[0, :].values.astype(np.float32)
        closest_neighbors, closest_neighbors_mfccs = KNNPrediction(features.reshape(1,-1), knn, clusters_labels_, cluster_mfcc_rep)

        if K == 1:
            predicted_label = closest_neighbors[0]
        else:
            mean_mfcc = np.mean(np.vstack(closest_neighbors_mfccs), axis=0, keepdims=True)
            mean_mfcc = torch.tensor(mean_mfcc, device=device, dtype=torch.float32)
            # get_predicted_labels returns a row index into `vectors`; translate
            # it to the cluster label stored at that row.
            predicted_label = clusters_labels_[get_predicted_labels(mean_mfcc, vectors, device)]

        if original_label == predicted_label:
            matched += 1
        total += 1
    return matched, total


matched_train, total_train = _first_frame_accuracy(train_video_paths)
print("Accuracy on train:", (matched_train*100)/total_train,"%")

matched_test, total_test = _first_frame_accuracy(test_video_paths)
print("Accuracy on test:", (matched_test*100)/total_test,"%")


print("Accuracy on Test + Train:", ((matched_test + matched_train)*100)/(total_test+total_train),"%")




In [None]:
# Hand-recorded results per clustering level: the K=3 entries are still blank,
# the K=1 entries are filled in.
# NOTE(review): presumably these are the accuracies printed by the evaluation
# cell; the note does not say which split (train / test / combined) — confirm.
'''
K=3
Level 1:
Level 2:
Level 3:
Level 4:

K=1
Level 1: 32%
Level 2: 20.65%
Level 3: 17%
Level 4: 15.6%
'''