In [None]:
from src.requirements import *

## Audio Preprocessing
- resample
- stereo to mono
- normalize
- plot

In [None]:
from src.requirements import *

class ASRDataset(Dataset):
    """Dataset yielding ``(waveform, target)`` pairs for speech recognition.

    Rows come from a tab-separated metadata file that must provide a ``path``
    column (audio file handed to ``sf.read``) and a ``transcript`` column
    (text handed to ``tokenizer.encode``).
    """

    def __init__(self, metadata_path, tokenizer):
        """Read the metadata table and remember the tokenizer.

        Args:
            metadata_path: Path to the tab-separated metadata file.
            tokenizer: Object exposing ``encode(text)`` that returns a
                sequence of integer token ids.
        """
        super().__init__()
        self.df = pd.read_csv(metadata_path, sep="\t")
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one clip, down-mix it to mono, peak-normalize it and encode its text.

        Args:
            idx: Positional row index into the metadata table.

        Returns:
            tuple: ``(waveform, target)`` where ``waveform`` is a float32
            tensor with the leading channel axis squeezed away and ``target``
            is an int64 tensor of token ids.
        """
        row = self.df.iloc[idx]
        # NOTE(review): `sf` is presumably the soundfile package, whose
        # always_2d=True result is (frames, channels) -- confirm.
        # The returned sample rate is not used: no resampling happens here.
        samples, _sample_rate = sf.read(row['path'], always_2d=True)

        # Transposed so that axis 0 is the one averaged away below.
        waveform = torch.tensor(samples.T, dtype=torch.float32)

        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Peak normalization. A silent clip has a peak of 0 and 0 / 0 would
        # turn the whole waveform into NaN; an empty clip would make
        # torch.max raise. Both are returned unscaled instead.
        if waveform.numel() > 0:
            peak = torch.max(torch.abs(waveform))
            if peak > 0:
                waveform = waveform / peak

        target = self.tokenizer.encode(row['transcript'])

        return waveform.squeeze(0), torch.tensor(target, dtype=torch.long)

In [None]:
class Tokenizer:
    r"""Tokenizer whose vocabulary is collected from a text corpus.

    Text is NFC-normalized and split with ``regex.findall(r'\X', ...)``.
    NOTE(review): `regex` is presumably the third-party ``regex`` package,
    where ``\X`` matches one extended grapheme cluster -- confirm.
    """

    def __init__(self, corpus_path, add_blank=True):
        """Build the vocabulary from every line of ``corpus_path``.

        Args:
            corpus_path: UTF-8 text file; each line is stripped and
                NFC-normalized before tokenization.
            add_blank: When true, ``'<blank>'`` is placed at id 0 ahead of
                the sorted corpus tokens.
        """
        with open(corpus_path, 'r', encoding='utf-8') as f:
            lines = [unicodedata.normalize('NFC', l.strip()) for l in f]

        tokens = []
        for line in lines:
            tokens.extend(self.tokenize(line))

        # Only the distinct tokens matter; sorting keeps ids stable across runs.
        self.vocab = sorted(set(tokens))

        if add_blank:
            self.vocab = ['<blank>'] + self.vocab

        self.token_to_id = {t: i for i, t in enumerate(self.vocab)}
        self.id_to_token = {i: t for t, i in self.token_to_id.items()}

    def tokenize(self, text):
        """Return the list of tokens for ``text`` after NFC normalization."""
        text = unicodedata.normalize('NFC', text)
        return regex.findall(r'\X', text)

    def encode(self, text):
        """Convert ``text`` to a list of token ids.

        Tokens that are not in the vocabulary are dropped silently.
        """
        tokens = self.tokenize(text)
        return [self.token_to_id[t] for t in tokens if t in self.token_to_id]

    def decode(self, ids):
        """Convert token ids back to a string.

        Args:
            ids: Iterable of ids. Besides plain ints this accepts array-like
                containers exposing ``tolist()`` and scalar wrappers exposing
                ``item()`` (for example tensors and their 0-d elements).

        Returns:
            str: Concatenation of the tokens for every known id; unknown ids
            are skipped.
        """
        # Tensor elements hash by identity, so a direct `in` test against the
        # int-keyed mapping never matches and the result would be ''.
        # Converting to Python scalars first makes the lookup work.
        if hasattr(ids, 'tolist'):
            ids = ids.tolist()

        pieces = []
        for i in ids:
            if hasattr(i, 'item'):
                i = i.item()
            if i in self.id_to_token:
                pieces.append(self.id_to_token[i])
        return ''.join(pieces)

In [None]:
def collate_padding_asr(batch):
    """Pad a batch of ``(waveform, target)`` pairs and report their true lengths.

    Args:
        batch: Sequence of ``(waveform, target)`` tensor pairs, such as the
            items returned by ``ASRDataset.__getitem__``.

    Returns:
        tuple: ``(waveforms, targets, input_len, target_len)`` where
        ``waveforms`` is zero-padded with an axis of size 1 inserted at
        position 1, ``targets`` is padded with 0, and ``input_len`` /
        ``target_len`` are int64 tensors holding each item's length before
        padding.
    """
    waveforms, targets = zip(*batch)

    # Lengths have to be measured BEFORE padding: once padded, every row has
    # the same size, so all lengths would collapse to the batch maximum and
    # padding would be treated as real data by a length-aware loss.
    input_len = torch.tensor([wave.shape[-1] for wave in waveforms], dtype=torch.long)
    target_len = torch.tensor([len(target) for target in targets], dtype=torch.long)

    waveforms = rnn_utils.pad_sequence(waveforms, batch_first=True, padding_value=0)
    targets = rnn_utils.pad_sequence(targets, batch_first=True, padding_value=0)

    waveforms = waveforms.unsqueeze(1)

    return waveforms, targets, input_len, target_len

In [None]:
# zip() refresher: pairing, dict building and un-pairing.

# 1) Pair up two parallel lists element by element.
names = ["Alice", "Octavia", "Nicole"]
scores = [85, 90, 88]

zipped_data = zip(names, scores)
print([pair for pair in zipped_data])

# 2) Turn parallel key / value lists into a dictionary.
keys = ["name", "age", "city"]
values = ["Rerir", 525, "Khaenri'ah"]

person_dict = {key: value for key, value in zip(keys, values)}
print(person_dict)

# 3) Transpose a list of pairs back into two tuples.
zipped_pairs = [('Nefer', 1), ('Aino', 2), ('Lauma', 3), ('Kuki', 4)]
letters, numbers = tuple(zip(*zipped_pairs))
print(f"Letters: {letters}")
print(f"Numbers: {numbers}")

In [None]:
def load_corpus_text(corpus_path):
    """Concatenate every ``.txt`` file found under ``corpus_path``.

    Args:
        corpus_path: Directory that is searched recursively for ``*.txt`` files.

    Returns:
        str: The contents of all matching files, each followed by a newline,
        in sorted path order. An empty string when nothing matches.
    """
    pattern = os.path.join(corpus_path, "**", "*.txt")
    # Sorted so the result does not depend on the order in which the
    # filesystem happens to enumerate directory entries.
    files = sorted(glob.glob(pattern, recursive=True))

    # Collect the pieces and join once instead of growing a string with +=,
    # which re-copies the accumulated text on every iteration.
    chunks = []
    for file in tqdm(files):
        with open(file, "r", encoding="utf-8") as f:
            chunks.append(f.read() + "\n")
    return "".join(chunks)

In [None]:
# Build data/corpus.txt from the raw text files on the first run only,
# then fit the tokenizer on that cached corpus.
text_path = os.path.join("data", "corpus.txt")

corpus_is_cached = os.path.exists(text_path)
if not corpus_is_cached:
    filename = "corpus.txt"
    path = os.path.join("data", "text")
    text = load_corpus_text(path)
    out_path = os.path.join("data", filename)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)

tokenizer = Tokenizer(text_path)

In [None]:
# Wrap the metadata table in the ASR dataset and batch it with the padding collate.
asr_metadata_path = os.path.join("data", "metadata.tsv")
asr_data = ASRDataset(asr_metadata_path, tokenizer)
asr_dl = DataLoader(
    asr_data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_padding_asr,
    pin_memory=True,
)

In [None]:
# Peek at the first batch only, to check the padded target shape.
for batch in asr_dl:
    waveform, target = batch[0], batch[1]
    print(target.shape)
    break

In [None]:
from src.requirements import *
import IPython

# Inspect one metadata row: tensor shape, transcript text and an audio player.
idx = 3
metadata_file = os.path.join("data", "metadata.tsv")
df = pd.read_csv(metadata_file, sep="\t")
row = df.iloc[idx]
path, transcript = row['path'], row['transcript']

waveform, sr = sf.read(path, always_2d=True)
waveform = torch.tensor(waveform.T, dtype=torch.float32)

print(waveform.shape)
print(transcript)
IPython.display.Audio(path)

In [None]:
# Encode a sample sentence into vocabulary ids.
sample_text = "घर जग्गा कारोबारमा आत्मविश्वास गुमेको वर्ष बैंकिङ प्रणाली सेयर बजार"
tokenizer.encode(sample_text)

In [None]:
# Round-trip check: encode a sample sentence, then decode the ids back to text.
sample_text = "घर जग्गा कारोबारमा आत्मविश्वास गुमेको वर्ष बैंकिङ प्रणाली सेयर बजार"
sample_ids = tokenizer.encode(sample_text)
tokenizer.decode(sample_ids)

In [None]:
from src.requirements import *
from src.audio_handler import AudioDataset, collate_padding

# Audio-only dataset built from the same metadata file, batched with its own padding collate.
ssl_metadata_path = os.path.join("data", "metadata.tsv")
ssl_data = AudioDataset(ssl_metadata_path)
ssl_dl = DataLoader(
    ssl_data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_padding,
    pin_memory=True,
)

In [None]:
# Make sure a corpus file exists (writing a two-line sample when it does not),
# then read it back as stripped, NFC-normalized lines.
path = os.path.join("data", "corpus.txt")
if os.path.exists(path):
    print("file exists bruh")
else:
    text = "घर जग्गा कारोबारमा आत्मविश्वास गुमेको वर्ष बैंकिङ प्रणाली सेयर बजार र घरजग्गाले गएको वर्ष कम्तीमा एउटा साझा समस्या भोगे विश्वासको\nमेलमिलापको केन्द्र राष्ट्रियतानेपाली कांग्रेसले पर्वका रूपमा मनाउने गरेको राष्ट्रिय एकता तथा मेलमिलाप दिवस हिजो पनि देशैभरि मनाइयो"
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    print("text written")

lines = []
with open(path, "r", encoding="utf-8") as f:
    for raw_line in f:
        lines.append(unicodedata.normalize('NFC', raw_line.strip()))

print(lines)

In [None]:
def tokenize(text):
    r"""Split ``text`` into tokens after NFC normalization.

    The normalized text is handed to ``regex.findall`` with the ``\X`` pattern
    and that call's result is returned unchanged.
    NOTE(review): `regex` is presumably the third-party ``regex`` package,
    where ``\X`` matches one extended grapheme cluster -- confirm.
    """
    normalized = unicodedata.normalize('NFC', text)
    pieces = regex.findall(r'\X', normalized)
    return pieces

# Flatten the per-line token lists into one sequence, keeping line order.
tokens = [token for line in lines for token in tokenize(line)]

tokens

In [None]:
# Count token occurrences; the sorted distinct tokens form the vocabulary.
counter = Counter(tokens)
vocab = sorted(counter)
vocab

In [None]:
# Put the blank symbol in front so it occupies index 0.
vocab_w_blank = ['<blank>', *vocab]
vocab_w_blank

In [None]:
# Map each token to its integer id.
# Built from the blank-prefixed vocabulary so that id 0 is '<blank>', which is
# how the Tokenizer class numbers tokens when add_blank=True; enumerating the
# plain `vocab` would shift every id down by one relative to the tokenizer.
token_to_id = {token: index for index, token in enumerate(vocab_w_blank)}
token_to_id

In [None]:
# Load the tab-separated metadata table and preview its first rows.
metadata_path = os.path.join("data", "metadata.tsv")
df = pd.read_csv(metadata_path, sep="\t")
df.head()

In [None]:
# Character-level vocabulary taken from every transcript in the metadata table.
transcripts = df["transcript"].tolist()
all_chars = set("".join(transcripts))
# Sorted rather than list(all_chars): the iteration order of a set of strings
# can change between interpreter runs (string hash randomization), which
# would make the vocabulary order, and any ids derived from it, irreproducible.
unique_vocabs = sorted(all_chars)
vocab_size = len(unique_vocabs)
unique_vocabs