In [None]:
# Show the kernel's current working directory (sanity check before the %cd in the next cell).
%pwd

In [1]:
# Switch the kernel's working directory to the fairseq data folder.
# NOTE(review): the recorded output shows a different path than the one passed here,
# presumably because this directory is a symlink -- confirm.
%cd /home/jupyter/audio-pipelines/fairseq/data

/projects/aigames_data


In [2]:
import torch
from fairseq.models.wav2vec import Wav2VecModel
import torchaudio

from typing import List

# --- Files on disk -----------------------------------------------------------
# scp listing that WavDataset reads to find the wav files.
WAV_SCP_PATH = "/home/jupyter/audio-pipelines/fairseq/dummy_wav.scp"
# Checkpoint that load_model() restores the wav2vec model from.
WAV2VEC_PATH = "/home/jupyter/audio-pipelines/fairseq/wav2vec_large.pt"

# --- Batching / runtime ------------------------------------------------------
# Fixed number of samples per waveform: longer clips are truncated to this
# length and shorter ones are zero-padded up to it.
# NOTE(review): 2032307 is presumably the longest clip in the corpus -- confirm.
MAX_PADDING = 2032307 // 6
# Number of waveforms the DataLoader puts in one batch.
BATCH_SIZE = 32
# Device the model and every input batch are moved to.
DEVICE = "cuda:0"


def convert_wave(wave: torch.tensor, model: torch.nn.Module):
    """Run a batch of waveforms through the model and return the result on the CPU.

    The batch is moved to ``DEVICE``, passed through ``model.feature_extractor``
    and then ``model.feature_aggregator``. Gradients are disabled for the whole
    forward pass.

    Args:
        wave: Batch of waveforms to encode.
        model: Module exposing ``feature_extractor`` and ``feature_aggregator``.

    Returns:
        The aggregator output, copied back to CPU memory.
    """
    with torch.no_grad():
        on_device = wave.to(DEVICE)
        # The extractor output is only an intermediate; it is consumed directly
        # by the aggregator and never kept around.
        context = model.feature_aggregator(model.feature_extractor(on_device))
    return context.cpu()


def load_model() -> torch.nn.Module:
    """Build the wav2vec model from the checkpoint at ``WAV2VEC_PATH``.

    The checkpoint is expected to be a dict with an ``"args"`` entry (passed to
    ``Wav2VecModel.build_model``) and a ``"model"`` entry (the state dict).

    Returns:
        The model in eval mode, moved to ``DEVICE``.
    """
    # NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
    # map_location="cpu" keeps the checkpoint tensors off whatever device they were
    # saved from; the single transfer to DEVICE happens once, at the end.
    checkpoint = torch.load(WAV2VEC_PATH, map_location="cpu")
    model = Wav2VecModel.build_model(checkpoint["args"], task=None)
    model.load_state_dict(checkpoint["model"])
    model.eval()
    return model.to(DEVICE)


def save_file(tensor: torch.tensor, output_path: str):
    """Serialize ``tensor`` to ``output_path`` using ``torch.save``.

    Args:
        tensor: Object to write.
        output_path: Destination file path.
    """
    torch.save(obj=tensor, f=output_path)

In [3]:
import math
import os
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class WavDataset(Dataset):
    """Dataset of fixed-length waveforms listed in an scp file.

    Each non-blank line of the scp file is whitespace-separated and its second
    field is taken as the path of a wav file. Every item is loaded with
    ``torchaudio.load``, reduced to row 0 of the loaded tensor (assumed to be
    the first channel), then truncated / zero-padded to ``max_len`` samples.

    Args:
        scp_path: Path of the scp listing.
        max_len: Target number of samples per item. Defaults to the
            module-level ``MAX_PADDING``. ``second_padding`` scales by
            ``MAX_PADDING``, so only pass another value if downstream code is
            adjusted accordingly.
    """

    def __init__(self, scp_path: str, max_len: int = None):
        self.scp_path = scp_path
        self.max_len = MAX_PADDING if max_len is None else max_len
        self.wav_paths = self.extract_wav_paths()

    def extract_wav_paths(self):
        """Return the wav path (second field) of every non-blank scp line."""
        paths = []
        with open(self.scp_path, "r") as f:
            for row in f:
                # split() with no argument copes with tabs, repeated spaces and
                # trailing "\n" / "\r\n", which split(" ") did not.
                fields = row.split()
                if not fields:
                    continue  # blank line
                paths.append(fields[1])
        return paths

    def padding(self, wave: torch.tensor) -> Tuple[torch.tensor, int]:
        """Truncate / zero-pad row 0 of ``wave`` to ``self.max_len`` samples.

        Returns:
            ``(padded, original_len)`` where ``padded`` is 1-D with
            ``self.max_len`` samples and ``original_len`` is the number of real
            (non-padding) samples it starts with.
        """
        wave = wave[0, : self.max_len]
        original_len = wave.shape[0]
        pad_amount = self.max_len - original_len

        padded = nn.ConstantPad1d((0, pad_amount), 0)(wave)
        # Return the real length, not the pad amount: second_padding() keeps the
        # first len / MAX_PADDING fraction of the frames, so handing it the pad
        # amount inverted the kept and zeroed regions.
        return padded, original_len

    @staticmethod
    def _output_path(wav_path: str) -> str:
        """Swap the file extension of ``wav_path`` for ``.torch``."""
        # splitext only strips the final extension, so dots in directory names
        # (or a leading "./") no longer truncate the path.
        return os.path.splitext(wav_path)[0] + ".torch"

    def __len__(self):
        return len(self.wav_paths)

    def __getitem__(self, i):
        """Return ``(padded_waveform, original_len, out_path)`` for item ``i``."""
        wav_path = self.wav_paths[i]
        out_path = self._output_path(wav_path)
        waveform, _sample_rate = torchaudio.load(wav_path)
        waveform, original_len = self.padding(waveform)
        return waveform, original_len, out_path

In [4]:
def second_padding(tens: torch.tensor, orginal_lens: int) -> torch.tensor:
    """Zero the tail of every item in ``tens`` along its last axis, in place.

    For item ``i`` the cut-off index is ``ceil(tens.shape[2] * len_i / MAX_PADDING)``,
    where ``len_i`` is the i-th entry of ``orginal_lens``; everything from that
    index onwards is set to 0.

    Args:
        tens: 3-D tensor indexed as ``[item, :, position]``; modified in place.
        orginal_lens: Iterable with one length value per item, measured on the
            same scale as ``MAX_PADDING``.

    Returns:
        The same ``tens`` object, after zeroing.
    """
    for idx, length in enumerate(orginal_lens):
        kept_fraction = length / MAX_PADDING
        cutoff = math.ceil(tens.shape[2] * kept_fraction)
        # Basic slicing returns a view, so zero_() writes straight into `tens`.
        tens[idx, :, cutoff:].zero_()
    return tens

def save_files(tens: torch.tensor, out_paths: List[str]):
    """Write each item of the batch ``tens`` to its own file.

    Item ``i`` is saved to ``out_paths[i]`` with a leading axis of size 1 added,
    so every file holds a tensor of shape ``(1, *tens.shape[1:])``.

    Args:
        tens: Batch tensor; axis 0 indexes the items.
        out_paths: One destination path per item, in batch order.
    """
    for i, path in enumerate(out_paths):
        # Read from the `tens` argument (the old code reached for a global named
        # `output`). clone() gives the item its own storage: torch.save on a
        # view would serialize the whole batch's storage into every file.
        item = torch.unsqueeze(tens[i], dim=0).clone()
        torch.save(item, path)
In [5]:
# Dataset over every wav file listed in the scp file.
dataset = WavDataset(scp_path=WAV_SCP_PATH)

# Batches of BATCH_SIZE items, loaded by 8 worker processes.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    num_workers=8,
)

# wav2vec model in eval mode on DEVICE.
model = load_model()

In [None]:
# len(dataloader) is the exact number of batches; `len(dataset) // BATCH_SIZE + 1`
# over-counts by one whenever the dataset size is a multiple of BATCH_SIZE.
print(f"max_batches {len(dataloader)}")
for i, (waves, orginal_len, out_paths) in enumerate(dataloader):
    # Encode the batch, then zero the frames that second_padding treats as padding.
    output = convert_wave(waves, model)
    output = second_padding(output, orginal_len)
    # Writing the features to disk is currently disabled; uncomment to save
    # one file per item at its out_path:
    # save_files(output, out_paths)
    if i % 100 == 0:
        # Progress marker: 1-based index of the batch just processed.
        print(i + 1)
        

max_batches 1734
1
