In [1]:
import torch
from WavLM import WavLM, WavLMConfig
import torchaudio
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [4]:
class Net(torch.nn.Module):
    def __init__(self, input_dim, output_dim,dropout=0.2):
        super(Net, self).__init__()
        padding_1 = (4 * (4900 - 1) - 4900 + 7) // 2
        padding_2 = (4 * (input_dim - 1) - input_dim + 7) // 2
        padding_3 = ((4 * (input_dim - 1) - input_dim + 7) //2)
        padding_4 = ((4 * (input_dim - 1) - input_dim + 7) //2)
        padding_5 = ((4 * (input_dim - 1) - input_dim + 7) //2) // 2
        self.model_seq = torch.nn.Sequential(
            nn.Conv1d(input_dim, 512, kernel_size=7, stride=4,padding=padding_1),
            nn.MaxPool1d(kernel_size=3, stride=1),
            nn.Conv1d(512, 256, kernel_size=7, stride=4,padding=padding_2),
            nn.MaxPool1d(kernel_size=3, stride=1),
            # input dimension is (batch_size, input_dim, no.of frames)
            nn.Conv1d(256, 128, kernel_size=7, stride=4,padding=padding_3),
            nn.MaxPool1d(kernel_size=3, stride=1),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(p=dropout),
            # now make a 1D convolutional layer with suitable padding such that it matches the input dimensions
            nn.Conv1d(128, 64, kernel_size=7, stride=4,padding=padding_4),
            nn.AvgPool1d(kernel_size=3, stride=1),
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Conv1d(64, 32, kernel_size=7, stride=4,padding=padding_5),
            nn.AvgPool1d(kernel_size=3, stride=1),
            nn.BatchNorm1d(32),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Conv1d(32, 16, kernel_size=7, stride=4),
            nn.AvgPool1d(kernel_size=3, stride=1),
            nn.BatchNorm1d(16),
            nn.GELU(),
            nn.Dropout(p=dropout),
        )
        self.mlp = nn.Sequential(
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Linear(512, 64),
            nn.GELU(),
            nn.Dropout(p=dropout),
            nn.Linear(64, output_dim),
            # now as I am doing multi-class classification, I will use sigmoid activation for each output neuron
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.model_seq(x)
        flatten = nn.Flatten()
        x = flatten(x)
        linear1 = nn.Linear(x.size(2)*x.size(1), 1024).to('cuda')
        x = linear1(x)
        x = self.mlp(x)
        return x


In [2]:

def get_features_wavlm(wav_path, model, cfg):
    # load the wav file
    wav_input_16khz, sr = torchaudio.load(wav_path)
    wav_input_16khz = wav_input_16khz.to('cuda')
    if cfg.normalize:
        wav_input_16khz = torch.nn.functional.layer_norm(wav_input_16khz , wav_input_16khz.shape)
    rep, layer_results = model.extract_features(wav_input_16khz, output_layer=model.cfg.encoder_layers, ret_layer_results=True)[0]
    layer_reps = [x.transpose(0, 1) for x, _ in layer_results]
    # do average of the tensors in layer_reps and give one result tensor after averaging
    rep = torch.mean(torch.stack(layer_reps), dim=0)
    rep = rep.transpose(1,2)
    print(rep.shape)
    return rep

def load_model_wavlm(wavlm_path):
    # load the pre-trained checkpoints
    checkpoint = torch.load(wavlm_path)
    cfg = WavLMConfig(checkpoint['cfg'])
    model = WavLM(cfg)
    model.load_state_dict(checkpoint['model'])
    model.eval()
    model = model.to('cuda')
    return model, cfg

def store_features_wavlm(wav_path, model, cfg, output_path):
    rep = get_features_wavlm(wav_path, model, cfg)
    rep = rep.cpu()
    print(rep.shape)
    np.save(output_path, rep.detach().numpy())

def main():
    wavlm_path = './wavlm_models/wavlm_base_plus.pt'
    wav_path = 'test1_60.wav'
    output_path = 'test.npy'
    model, cfg = load_model_wavlm(wavlm_path)
    store_features_wavlm(wav_path, model, cfg, output_path)

if __name__ == '__main__':
    main()

torch.Size([1, 768, 2999])
torch.Size([1, 768, 2999])


In [6]:
finetuning_model = Net(768,10,0.2).to('cuda')
# load the features in the test.npy file
test_features = np.load('/ssd_scratch/cvit/kolubex/temp_videos1/separated_audios/mdx_extra/1/music.npy')
# convert them to tensors
required_shape = (1, 768, 4900)

# Calculate the padding size for the third dimension
test_features = torch.from_numpy(test_features).to('cuda')
print(test_features.shape)
padding_size = required_shape[2] - test_features.shape[2]
padded_test_features = torch.nn.functional.pad(test_features, (0, padding_size))
print(padded_test_features.shape)
# pad the test_features such that it gets the shape (batch_size, 768,4900) where the input is of shape (batch_size, 768, unknown)
rep = finetuning_model(test_features)
print(rep.shape)
print(rep)



torch.Size([1, 768, 749])
torch.Size([1, 768, 4900])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [3]:
import subprocess
def get_video_duration(video_path):
    command = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", video_path]
    result = subprocess.run(command, capture_output=True, text=True)
    duration = float(result.stdout.strip())
    print(duration)
    return duration
video_path = "./scene-001.ss-0001.es-0006.mp4"
duration = get_video_duration(video_path)
print(f"Duration of video: {duration} seconds")

218.73
Duration of video: 218.73 seconds
