In [None]:
# Mount Google Drive; every dataset path used later in this notebook lives under /content/drive.
from google.colab import drive
# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


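Core imports for audio loading and embedding extraction (a few later cells import extra utilities such as `subprocess` and `tqdm` inline)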

In [None]:
import os
import numpy as np
from transformers import Wav2Vec2Processor, HubertModel
from transformers import Wav2Vec2FeatureExtractor
import torch
import librosa
import soundfile as sf

Data Preprocessing

In [None]:
import os

output_directory = '/content/drive/My Drive/Dataset/Processed/audio/train_wav'

# Tally how many times each .wav name shows up in the directory listing.
file_counts = {}
for wav_name in os.listdir(output_directory):
    if not wav_name.endswith('.wav'):
        continue
    file_counts[wav_name] = file_counts.get(wav_name, 0) + 1

# Keep only the names that were listed more than once.
duplicates = {}
for wav_name, times_seen in file_counts.items():
    if times_seen > 1:
        duplicates[wav_name] = times_seen

print("Duplicates found:")
for dup_file in duplicates:
    print(f"{dup_file}: {duplicates[dup_file]} copies")

In [None]:
import os
import subprocess
from tqdm import tqdm


source_directory = '/content/drive/My Drive/Dataset/video_with_audio/train'

output_directory = '/content/drive/My Drive/Dataset/Processed/audio/train_wav'

os.makedirs(output_directory, exist_ok=True)

# Sorted so that repeated runs walk the files in the same order.
mp4_files = sorted(f for f in os.listdir(source_directory) if f.endswith('.mp4'))
total_files = len(mp4_files)
print(f"Total .mp4 files to convert: {total_files}")

# Names of the source files whose conversion failed, reported at the end.
failed_files = []

for filename in tqdm(mp4_files, desc='Converting .mp4 to .wav', unit='file'):
    mp4_file_path = os.path.join(source_directory, filename)
    # splitext swaps only the trailing extension; str.replace would also rewrite
    # any '.mp4' that happens to appear earlier in the file name.
    wav_file_path = os.path.join(output_directory, os.path.splitext(filename)[0] + '.wav')
    if os.path.exists(wav_file_path):
        tqdm.write(f"Skipping {filename}, already converted.")
        continue

    # Convert into a scratch file first and rename on success, so that a failed
    # or interrupted run never leaves a truncated .wav that later runs would
    # skip as "already converted".
    partial_path = wav_file_path + '.part'
    command = [
        'ffmpeg', '-nostdin', '-y', '-loglevel', 'error',
        '-i', mp4_file_path,
        # Audio only: 16-bit PCM, mono, 16 kHz.
        '-vn', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000',
        # The scratch name has no .wav extension, so name the container explicitly.
        '-f', 'wav', partial_path,
    ]
    result = subprocess.run(command, capture_output=True, text=True, errors='replace')
    if result.returncode == 0:
        os.replace(partial_path, wav_file_path)
    else:
        failed_files.append(filename)
        if os.path.exists(partial_path):
            os.remove(partial_path)
        tqdm.write(f"Failed to convert {filename} (exit code {result.returncode}): {result.stderr.strip()}")

if failed_files:
    print(f"{len(failed_files)} file(s) could not be converted: {failed_files}")
print('Conversion completed.')

In [None]:
# Count the regular files (sub-directories are not counted) in the source directory.
source_files_count = sum(
    1
    for entry in os.listdir(source_directory)
    if os.path.isfile(os.path.join(source_directory, entry))
)
print(f"Number of files in the source directory: {source_files_count}")

# Same count for the destination directory.
destination_files_count = sum(
    1
    for entry in os.listdir(output_directory)
    if os.path.isfile(os.path.join(output_directory, entry))
)
print(f"Number of files in the destination directory: {destination_files_count}")

Embedding generation

In [None]:
# Feature extractor that turns raw waveforms into model inputs.
# NOTE(review): the extractor config and the model weights are loaded from two
# different checkpoints — confirm this pairing is intended.
processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-large-superb-er")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

# The model has to sit on the GPU whenever one is available, because
# extract_features places its inputs there; leaving the model on the CPU makes
# the forward pass fail with a device-mismatch error on GPU runtimes.
model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Inference only: make sure dropout and similar layers are disabled.
model.eval()

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Folder whose .wav files get embedded; the extraction cell further down assigns this same path again.
dev_folder_path = '/content/drive/My Drive/Dataset/Processed/audio/test_wav'

In [None]:
def load_and_preprocess_audio(file_path, target_sampling_rate=16000):
    """Read an audio file and return it as a mono waveform at the target rate.

    Args:
        file_path: Path of the audio file, passed to ``soundfile.read``.
        target_sampling_rate: Sampling rate (Hz) the returned waveform should have.

    Returns:
        A 1-D array of samples at ``target_sampling_rate``.
    """
    audio, sampling_rate = sf.read(file_path)
    # soundfile returns (frames, channels) for multi-channel files. Average the
    # channels so the array is 1-D; otherwise the resampler would run along the
    # channel axis and the feature extractor would receive a 2-D input.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if sampling_rate != target_sampling_rate:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sampling_rate)
    return audio

def extract_features(audio, processor, model, sampling_rate=16000):
    """Run one waveform through the model and return its last hidden state.

    Args:
        audio: Waveform handed to ``processor``.
        processor: Callable feature extractor; its result must expose ``input_values``.
        model: Torch module whose output exposes ``last_hidden_state``.
        sampling_rate: Sampling rate (Hz) reported to the processor. Defaults to
            16000, the value that was previously hard-coded.

    Returns:
        The ``last_hidden_state`` tensor, computed without gradient tracking and
        located on the same device as the model.
    """
    inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    # Follow the model's device instead of assuming CUDA: sending the inputs to
    # the GPU while the model is still on the CPU raises a device-mismatch error.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # A module without parameters has no device of its own; stay on the CPU.
        device = torch.device('cpu')
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        outputs = model(input_values)
        embeddings = outputs.last_hidden_state
    return embeddings


In [None]:
# Directory holding the input WAV files and directory receiving the embeddings
dev_folder_path = '/content/drive/My Drive/Dataset/Processed/audio/test_wav'
output_dir = '/content/drive/My Drive/Dataset/Processed/audio/test_emotion'

# Create the output directory if it doesn't exist yet
os.makedirs(output_dir, exist_ok=True)

# In-memory copy of the embeddings computed during this run, keyed by WAV file
# name. Files skipped because their .npy already exists are not added.
features_dict = {}

# Process each .wav file, in sorted order so reruns walk the files identically
wav_files = sorted(f for f in os.listdir(dev_folder_path) if f.endswith('.wav'))
total_files = len(wav_files)
processed_count = 0

for processed_count, filename in enumerate(wav_files, start=1):
    output_path = os.path.join(output_dir, f'{os.path.splitext(filename)[0]}.npy')
    if os.path.exists(output_path):
        print(f'Skipping {filename} as features are already extracted. ({processed_count}/{total_files})')
        continue

    file_path = os.path.join(dev_folder_path, filename)
    audio = load_and_preprocess_audio(file_path)
    # Move the result to the CPU right away so the dictionary does not keep
    # accelerator memory alive for every processed file.
    features = extract_features(audio, processor, model).cpu()
    features_dict[filename] = features

    # Write to a scratch file and rename it into place: an interrupted save must
    # not leave a truncated .npy that the exists-check above would skip forever.
    partial_path = output_path + '.tmp'
    with open(partial_path, 'wb') as handle:
        np.save(handle, features.numpy())
    os.replace(partial_path, output_path)
    print(f'Processed {filename} ({processed_count}/{total_files})')

In [None]:
# Sanity check: load one saved embedding file back from disk and display it.
audio_file_path = '/content/drive/My Drive/Dataset/Processed/audio/test_emotion/dia1128utt3.npy'

# Load the embeddings from the file
embeddings_audio = np.load(audio_file_path)
# Bare last expression so the notebook renders the array as the cell output.
embeddings_audio

array([[[ 0.17506291,  0.36895058,  0.36676377, ...,  0.1190948 ,
         -0.11426108, -0.02726609],
        [ 0.13045295,  0.44948626,  0.42974222, ...,  0.10116519,
         -0.09248384,  0.0548491 ],
        [ 0.15498553,  0.41023847,  0.4211151 , ...,  0.10297509,
         -0.11946542,  0.06958491],
        ...,
        [-0.00892584,  0.44067308,  0.30550373, ...,  0.12398724,
         -0.26930833,  0.16556536],
        [ 0.00310456,  0.46488234,  0.27608773, ...,  0.13190615,
         -0.31391814,  0.02305608],
        [ 0.00833244,  0.48754168,  0.19516596, ...,  0.07726772,
         -0.31118155, -0.06543523]]], dtype=float32)

In [None]:
# Shape of the loaded embedding; the recorded output for this file is (1, 165, 1024).
# NOTE(review): presumably (batch, frames, hidden size) — confirm against the model docs.
embeddings_audio.shape

(1, 165, 1024)

In [None]:
import os
import numpy as np

# Define the paths.
# The input folder is the one the extraction cell writes to, and the output
# folder is the one the final count cell reads. The previous values lacked the
# 'Dataset/' segment and therefore pointed at a different location.
input_folder = '/content/drive/My Drive/Dataset/Processed/audio/test_emotion'
mean_output_folder = '/content/drive/My Drive/Dataset/Processed/audio/test_emotion_mean'

os.makedirs(mean_output_folder, exist_ok=True)
for file in sorted(os.listdir(input_folder)):
    if not file.endswith('.npy'):
        continue
    mean_output_path = os.path.join(mean_output_folder, file)

    # Skip files whose mean pooled embeddings already exist
    if os.path.exists(mean_output_path):
        print(f'Skipping {file} as mean pooled embeddings are already processed.')
        continue

    embeddings = np.load(os.path.join(input_folder, file))
    # Perform mean pooling over axis 1. For the (1, 165, 1024) array inspected
    # earlier in this notebook this yields a (1, 1024) array.
    mean_pooled_embeddings = np.mean(embeddings, axis=1)

    # Save through a scratch file and rename, so an interrupted write cannot
    # leave a truncated .npy that the exists-check above would skip.
    partial_path = mean_output_path + '.tmp'
    with open(partial_path, 'wb') as handle:
        np.save(handle, mean_pooled_embeddings)
    os.replace(partial_path, mean_output_path)

    print(f'Processed {file}')


In [None]:
# Count the entries in the folder of mean-pooled embeddings
import os

folder_path = '/content/drive/My Drive/Dataset/Processed/audio/test_emotion_mean'  # Change this to your folder path

# Every directory entry is counted, whatever its type or extension.
files = os.listdir(folder_path)
number_of_files = sum(1 for _entry in files)
print(f'There are {number_of_files} files in the folder.')

There are 2572 files in the folder.
