In [None]:
pip install outetts uroman noisereduce mecab-python3

Collecting outetts
  Downloading outetts-0.2.3-py3-none-any.whl.metadata (10 kB)
Collecting uroman
  Downloading uroman-1.3.1.1-py3-none-any.whl.metadata (18 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting mecab-python3
  Downloading mecab_python3-1.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting encodec (from outetts)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-lightning (from outetts)
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting tensorboardX (from outetts)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting jsonargparse (from outetts)
  Downloading jsonargparse-4.35.0-py3-none-any.whl.metadata (12 kB)
Collecting tor

In [None]:
!pip install datasets triton snac wandb accelerate torchdata

In [None]:
from outetts.wav_tokenizer.decoder import WavTokenizer
from outetts.wav_tokenizer.encoder.utils import convert_audio

In [None]:
# Mount Google Drive; the tokenizer config/checkpoint are read from, and the
# tokenized CSV shards are written to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import time
import numpy as np
import torchaudio
from snac import SNAC
from tqdm import tqdm
import huggingface_hub
import shutil
import soundfile as sf
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, concatenate_datasets, Audio, load_from_disk, interleave_datasets

In [None]:
import torchaudio
import torch
import torchaudio.functional as F
import inflect
import re
import uroman as ur

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# WavTokenizer model definition (YAML) and weights, both stored on the mounted Drive.
config_path = "/content/drive/MyDrive/audio_datasets/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
# Previously used checkpoint: "/content/wavtokenizer_medium_speech_320_24k_v2.ckpt"
model_path = "/content/drive/MyDrive/audio_datasets/wavtokenizer_large_speech_320_24k.ckpt"

# Build the tokenizer from config + checkpoint and place it on the selected device.
wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path).to(device)

In [None]:
class CTCForcedAlignment:
    """Word-level forced aligner built on torchaudio's ``MMS_FA`` pipeline.

    ``align`` normalises a transcript, aligns it against the audio and returns
    one record per word holding its sample span, audio slice, duration and the
    matching slice of WavTokenizer codes.
    """

    # Codec frames per second, used to convert sample offsets into code indices.
    # Matches the "frame75" WavTokenizer config this notebook loads.
    CODEC_FRAME_RATE = 75

    def __init__(self, device: str = None):
        """Load the alignment model.

        Args:
            device: Device to run the model on. When ``None``, CUDA is used if
                available, otherwise the CPU.
        """
        self.device = torch.device(device if device is not None else "cuda" if torch.cuda.is_available() else "cpu")
        bundle = torchaudio.pipelines.MMS_FA
        self.sample_rate = bundle.sample_rate
        self.model = bundle.get_model(with_star=False).to(self.device)
        self.LABELS = bundle.get_labels(star=None)
        self.DICTIONARY = bundle.get_dict(star=None)
        self.lec = inflect.engine()
        self.uroman = ur.Uroman()

    def process_text(self, text: str):
        """Normalise ``text`` into a list of lowercase ``a-z`` words.

        The text is romanised, digits are spelled out, common separators become
        spaces and every remaining character outside ``a-z``/whitespace is dropped.

        Returns:
            The list of normalised words (possibly empty).
        """
        text = self.uroman.romanize_string(text)
        text = re.sub(r'\d+(\.\d+)?', lambda x: self.lec.number_to_words(x.group()), text.lower())
        text = re.sub(r'[-_/,\.\\]', ' ', text)
        text = re.sub(r'[^a-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text.split()

    def _unflatten(self, list_, lengths):
        """Split ``list_`` into consecutive chunks whose sizes are given by ``lengths``."""
        assert len(list_) == sum(lengths)
        i = 0
        ret = []
        for l in lengths:
            ret.append(list_[i : i + l])
            i += l
        return ret

    def get_word(self, waveform, spans, num_frames, transcript):
        """Convert one word's token spans from emission frames to sample offsets.

        Args:
            waveform: Audio tensor; its size along dim 1 is taken as the sample count.
            spans: Token spans of the word; only the first ``start`` and last ``end`` are used.
            num_frames: Number of emission frames covering ``waveform``.
            transcript: The word itself.

        Returns:
            ``{"x0": start_sample, "x1": end_sample, "word": transcript}``.
        """
        ratio = waveform.size(1) / num_frames
        x0 = int(ratio * spans[0].start)
        x1 = int(ratio * spans[-1].end)
        return {"x0": x0, "x1": x1, "word": transcript}

    def _extract_world_level(self, aligned_tokens, alignment_scores, transcript):
        """Merge frame-level tokens into spans and group them per word of ``transcript``."""
        token_spans = F.merge_tokens(aligned_tokens, alignment_scores)
        word_spans = self._unflatten(token_spans, [len(word) for word in transcript])
        return word_spans

    def _align(self, emission, tokens):
        """Run CTC forced alignment on the CPU for a single utterance.

        Returns:
            ``(alignments, scores)`` for the first (only) batch item, with
            ``exp`` applied to the scores (presumably log-probabilities).
        """
        targets = torch.tensor([tokens], dtype=torch.int32, device=torch.device("cpu"))
        alignments, scores = F.forced_align(emission.cpu(), targets, blank=0)
        alignments, scores = alignments[0], scores[0]
        scores = scores.exp()
        return alignments, scores

    def align(self, waveform, sr, transcript):
        """Align ``transcript`` to ``waveform`` and attach audio/codes to every word.

        Args:
            waveform: Audio tensor with channels on dim 0 and samples on dim 1.
            sr: Sampling rate of ``waveform``.
            transcript: Raw text spoken in the audio.

        Returns:
            A list with one dict per normalised word containing ``x0``/``x1``
            (sample offsets at ``self.sample_rate``), ``word``, ``audio``,
            ``duration`` (seconds) and ``codes``.

        Raises:
            ValueError: If the transcript has no words left after normalisation.
        """
        # Normalise first so an unusable transcript fails fast, before any
        # quantisation or model inference is spent on the row.
        words = self.process_text(transcript)
        if not words:
            raise ValueError("transcript contains no alignable words after normalization")

        # Codes for the whole utterance; per-word codes are sliced from this below.
        all_codes = quantize_wavtokenizer_ctc(waveform, sampling_rate=sr)
        if waveform.shape[0] > 1:
            # Down-mix multi-channel audio to mono.
            waveform = waveform.mean(dim=0, keepdim=True)
        waveform = waveform.float()
        waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=self.sample_rate)

        with torch.inference_mode():
            emission, _ = self.model(waveform.to(self.device))

        tokenized_transcript = [self.DICTIONARY[c] for word in words for c in word]
        alignments, scores = self._align(emission, tokenized_transcript)
        word_spans = self._extract_world_level(alignments, scores, words)
        num_frames = emission.size(1)

        outputs = [
            self.get_word(waveform, word_spans[i], num_frames, words[i])
            for i in range(len(word_spans))
        ]

        # The first word absorbs any leading audio; every other word starts where
        # it was aligned and runs up to the start of the next word.
        outputs[0]["x0"] = 0
        last_index = len(outputs) - 1
        for i, output in enumerate(outputs):
            x0 = output["x0"]
            x1 = output["x1"] if i == last_index else outputs[i + 1]["x0"]
            segment = waveform[:, x0:x1]
            output["audio"] = segment
            output["duration"] = len(segment[0]) / self.sample_rate
            # Map sample offsets to codec-frame indices.
            output["codes"] = all_codes[
                int(x0 * self.CODEC_FRAME_RATE / self.sample_rate) : int(x1 * self.CODEC_FRAME_RATE / self.sample_rate)
            ]
        return outputs

    def free(self):
        """Drop the alignment model and release cached GPU memory; safe to call repeatedly."""
        if hasattr(self, "model"):
            del self.model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# Let the aligner choose its device (CUDA when available, otherwise CPU) instead
# of hard-coding "cuda", which fails on CPU-only runtimes.
ctc = CTCForcedAlignment()

In [None]:
# Inspect the character lookup table the aligner uses to tokenize transcripts.
ctc.DICTIONARY

In [None]:
def resample(audio, sr: int, target_sr: int):
    """Convert ``audio`` to mono float32 at ``target_sr`` and move it to ``device``.

    Args:
        audio: Samples as a ``torch.Tensor`` or a NumPy array (anything
            ``torch.as_tensor`` accepts).
        sr: Sampling rate of ``audio``.
        target_sr: Sampling rate to convert to.

    Returns:
        The output of ``convert_audio`` placed on the notebook-level ``device``.
    """
    # as_tensor returns tensors unchanged and wraps NumPy arrays, so dataset rows
    # (NumPy) and already-loaded waveforms (tensors) are both accepted.
    audio = torch.as_tensor(audio).to(dtype=torch.float32)
    # Add a leading dimension before handing the samples to convert_audio.
    audio = audio.unsqueeze(0)
    # 1 as last arg corresponds to mono audio
    resampled = convert_audio(audio, sr, target_sr, 1)
    return resampled.to(device)

In [None]:
def quantize_wavtokenizer_ctc(audio_data,sampling_rate=16000, quantizer=wavtokenizer):
    """Encode a waveform into a list of codec token ids.

    Args:
        audio_data: Waveform tensor passed straight to ``resample``.
        sampling_rate: Sampling rate of ``audio_data``.
        quantizer: Model exposing ``encode_infer``; defaults to the notebook's
            ``wavtokenizer``.

    Returns:
        ``codes[0]`` of the encoder output (after squeezing dim 1) as a Python list.
    """
    # The quantizer is fed 24 kHz audio; drop the extra leading dimension that
    # ``resample`` adds before encoding.
    waveform_24k = resample(audio_data, sampling_rate, 24000).to(device).squeeze(0)
    bandwidth = torch.tensor([0]).to(device)

    _, token_ids = quantizer.encode_infer(waveform_24k, bandwidth_id=bandwidth)
    return token_ids.squeeze(1).to(device)[0].tolist()

In [None]:
def resample(audio, sr: int, target_sr: int):
    """Convert ``audio`` to mono float32 at ``target_sr`` and move it to ``device``.

    This cell re-defines ``resample``; being the later definition, it is the one
    in effect once the notebook has run top to bottom.

    Args:
        audio: Samples as a ``torch.Tensor`` or a NumPy array (anything
            ``torch.as_tensor`` accepts).
        sr: Sampling rate of ``audio``.
        target_sr: Sampling rate to convert to.

    Returns:
        The output of ``convert_audio`` placed on the notebook-level ``device``.
    """
    # as_tensor returns tensors unchanged and wraps NumPy arrays, so dataset rows
    # (NumPy) and already-loaded waveforms (tensors) are both accepted.
    audio = torch.as_tensor(audio).to(dtype=torch.float32)
    # Add a leading dimension before handing the samples to convert_audio.
    audio = audio.unsqueeze(0)
    # 1 as last arg corresponds to mono audio
    resampled = convert_audio(audio, sr, target_sr, 1)
    return resampled.to(device)

In [None]:
def quantize_wavtokenizer(row, quantizer=wavtokenizer):
    """Encode the audio of one dataset row into a list of codec token ids.

    Args:
        row: Mapping with ``row["audio"]["array"]`` (samples) and
            ``row["audio"]["sampling_rate"]``.
        quantizer: Model exposing ``encode_infer``; defaults to the notebook's
            ``wavtokenizer``.

    Returns:
        ``codes[0]`` of the encoder output (after squeezing dim 1) as a Python list.
    """
    audio_data, sample_rate = row["audio"]["array"], int(row["audio"]["sampling_rate"])

    # Dataset rows carry NumPy arrays while ``resample`` calls tensor methods,
    # so convert explicitly (a no-op when the samples are already a tensor).
    audio = resample(torch.as_tensor(audio_data), sample_rate, 24000).to(device)
    bandwidth_id = torch.tensor([0]).to(device)
    _, codes = quantizer.encode_infer(audio, bandwidth_id=bandwidth_id)
    codes = codes.squeeze(1).to(device)

    return codes[0].tolist()

In [None]:
def decode_tokenizer(discrete_code, last_text_token=0):
    """Decode a list of codec token ids back into audio.

    Args:
        discrete_code: List of token ids.
        last_text_token: Offset subtracted from every id before decoding. It used
            to be read from an undefined global; the default of 0 matches the
            quantize functions in this notebook, which do not shift their codes.

    Returns:
        The output of ``wavtokenizer.decode`` for the given codes.
    """
    discrete_code = torch.tensor([discrete_code]).to(device) - last_text_token
    features = wavtokenizer.codes_to_features(discrete_code).to(device)
    bandwidth_id = torch.tensor([0]).to(device)
    audio_out = wavtokenizer.decode(features, bandwidth_id=bandwidth_id)
    return audio_out

In [None]:
def decode_tokenizer(discrete_code):
    """Decode a list of codec token ids into audio with the notebook's ``wavtokenizer``.

    Args:
        discrete_code: List of token ids; it is wrapped in two extra leading
            dimensions before being turned into features.

    Returns:
        The output of ``wavtokenizer.decode``.
    """
    code_tensor = torch.tensor([[discrete_code]]).to(device)
    bandwidth = torch.tensor([0]).to(device)
    return wavtokenizer.decode(
        wavtokenizer.codes_to_features(code_tensor).to(device),
        bandwidth_id=bandwidth,
    )

In [None]:
class PromptProcessor():
  """Builds TTS and STT training prompts for one language from dataset rows.

  A prompt combines the normalised words, per-word durations and per-word audio
  codes using the markers in ``special_tokens``. Alignment and quantisation are
  delegated to the notebook-level ``ctc`` aligner.
  """

  # Languages that have a dedicated tag in ``special_tokens``.
  LANGUAGES = ("hausa", "igbo", "yoruba")
  # Value stored in both prompts when a row cannot be processed.
  ERROR_PLACEHOLDER = "An error occurred"

  def __init__(self,lang):
    """Create a processor for ``lang``.

    Args:
      lang: One of ``LANGUAGES``; selects the language tag placed in every prompt.

    Raises:
      ValueError: If ``lang`` has no language tag. Failing here avoids silently
        turning every row into the error placeholder later on.
    """
    if lang not in self.LANGUAGES:
      raise ValueError(f"Unsupported language {lang!r}; expected one of {self.LANGUAGES}")
    self.lang=lang
    self.bos = "<|im_start|>"
    self.eos = "<|im_end|>"
    self.tts_prompt = "{bos}\n{tts}\n{text_start}{words}{text_end}\n{lang}\n{audio_start}\n"
    self.stt_prompt = "{bos}\n{stt}\n{audio_start}{codes}{audio_end}\n{lang}\n{text_start}\n"
    self.special_tokens = {
            "audio_code": "<|{}|>",
            "tts":"<|tts|>",
            "stt":"<|stt|>",
            "text_start": "<|text_start|>",
            "text_end": "<|text_end|>",
            "audio_start": "<|audio_start|>",
            "audio_end": "<|audio_end|>",
            "word_start": "<|word_start|>",
            "word_end": "<|word_end|>",
            "time": "<|t_{:.2f}|>",
            "code_start": "<|code_start|>",
            "code_end": "<|code_end|>",
            "text_sep": "<|text_sep|>",
            "hausa":"<|hausa|>",
            "igbo":"<|igbo|>",
            "yoruba":"<|yoruba|>",
        }

  def create_results_prompts(self,words):
    """Render aligned words into the code-bearing parts of the prompts.

    Args:
      words: Iterable of dicts with ``word``, ``duration`` and ``codes`` keys.

    Returns:
      A 3-tuple ``(all_codes, tts_lines, stt_lines)``: every audio-code token
      concatenated; one ``word + duration + codes`` line per word; and one
      ``codes + duration + word`` line per word (lines joined by newlines).
    """
    prompt_audio= []
    prompt_text=[]
    all_tokens=[]
    for entry in words:
      word = entry["word"]
      duration = self.special_tokens["time"].format(entry["duration"])
      tokens = "".join([self.special_tokens["audio_code"].format(c) for c in entry["codes"]])
      all_tokens.append(tokens)
      prompt_audio.append(f'{word}{duration}{self.special_tokens["code_start"]}{tokens}{self.special_tokens["code_end"]}')
      prompt_text.append(f'{tokens}{duration}{self.special_tokens["word_start"]}{word}{self.special_tokens["word_end"]}')
    return "".join(all_tokens),"\n".join(prompt_audio),"\n".join(prompt_text)

  def get_prompt(self, row):
    """Build the STT and TTS prompts for one dataset row.

    Args:
      row: Mapping with ``audio`` (``array`` and ``sampling_rate``) and the
        transcript under ``text`` or, when that is missing or empty, ``transcript``.

    Returns:
      ``{"stt": ..., "tts": ...}``. If anything fails, both values are
      ``ERROR_PLACEHOLDER`` and the cause is logged, so a bad row does not abort
      a long ``map`` run but is no longer invisible either.
    """
    import logging

    try:
      audio=torch.from_numpy(row["audio"]["array"]).unsqueeze(0)
      sample_rate=row["audio"]["sampling_rate"]
      # Fall back to "transcript" both when "text" is empty and when it is absent.
      transcript=row.get("text") or row["transcript"]
      input_words = ctc.process_text(transcript)
      words= ctc.align(audio,sample_rate,transcript)
      inputs_words_strings = f"{self.special_tokens['text_sep']}".join([i.strip() for i in input_words])
      prompt_tts= self.tts_prompt.format(
            bos=self.bos,
            text_start=self.special_tokens['text_start'],
            tts=self.special_tokens['tts'],
            words=inputs_words_strings,
            lang=self.special_tokens[self.lang],
            text_end=self.special_tokens['text_end'],
            audio_start=self.special_tokens['audio_start']
        )

      all_codes, tts_extra, stt_extra=self.create_results_prompts(words)
      prompt_stt=self.stt_prompt.format(
            bos=self.bos,
            audio_start=self.special_tokens['audio_start'],
            stt=self.special_tokens['stt'],
            codes=all_codes,
            lang=self.special_tokens[self.lang],
            audio_end=self.special_tokens['audio_end'],
            text_start=self.special_tokens['text_start']
        )
      prompt_stt+=stt_extra+f"\n{self.special_tokens['text_end']}\n{self.eos}\n"
      prompt_tts+=tts_extra+f"\n{self.special_tokens['audio_end']}\n{self.eos}\n"

      return {"stt":prompt_stt,"tts":prompt_tts}
    except Exception as e:
      # Deliberately best-effort: keep the placeholder contract, but record why.
      logging.getLogger(__name__).warning(
          "get_prompt failed (%s: %s); returning placeholder prompts", type(e).__name__, e
      )
      return {"stt":self.ERROR_PLACEHOLDER,"tts":self.ERROR_PLACEHOLDER}

In [None]:
# Prompt builder used for the Yoruba dataset processed below.
ps = PromptProcessor(lang="yoruba")

In [None]:
# Authenticate with the Hugging Face Hub (renders an interactive login widget in the notebook).
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load only the "train" split of the Yoruba speech dataset from the Hub.
data_yoruba = load_dataset("saheedniyi/yts", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/328 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/25 [00:00<?, ?files/s]

train-00000-of-00025.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

train-00001-of-00025.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00002-of-00025.parquet:   0%|          | 0.00/446M [00:00<?, ?B/s]

train-00003-of-00025.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

train-00004-of-00025.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

train-00005-of-00025.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

train-00006-of-00025.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

train-00007-of-00025.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00008-of-00025.parquet:   0%|          | 0.00/361M [00:00<?, ?B/s]

train-00009-of-00025.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

train-00010-of-00025.parquet:   0%|          | 0.00/580M [00:00<?, ?B/s]

train-00011-of-00025.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00012-of-00025.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00013-of-00025.parquet:   0%|          | 0.00/536M [00:00<?, ?B/s]

train-00014-of-00025.parquet:   0%|          | 0.00/442M [00:00<?, ?B/s]

train-00015-of-00025.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

train-00016-of-00025.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00017-of-00025.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

train-00018-of-00025.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00019-of-00025.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00020-of-00025.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

train-00021-of-00025.parquet:   0%|          | 0.00/502M [00:00<?, ?B/s]

train-00022-of-00025.parquet:   0%|          | 0.00/451M [00:00<?, ?B/s]

train-00023-of-00025.parquet:   0%|          | 0.00/430M [00:00<?, ?B/s]

train-00024-of-00025.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15188 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Display the dataset summary (column names and row count).
data_yoruba

Dataset({
    features: ['audio', 'text', '__index_level_0__'],
    num_rows: 3583
})

In [None]:
# Walk the dataset just far enough to keep one example around for the smoke
# test below: the loop stops on its second pass, so `k` is left bound to the
# second row and `i` to 1.
i = 0
for k in data_yoruba:
  if i != 1:
    i += 1
    continue
  break

In [None]:
# Show the sample row captured by the loop above.
k

{'audio': {'path': 'EZR_006_Verse_014.flac',
  'array': array([-0.00054622, -0.00055361, -0.00056887, ...,  0.0001024 ,
          0.00010622,  0.00010431]),
  'sampling_rate': 48000},
 'text': 'Síwájú sí i, mo pàṣẹ pé tí ẹnikẹ́ni bá yí àṣẹ yìí padà, kí fa igi àjà ilé rẹ̀ yọ jáde, kí a sì gbe dúró, kí a sì fi òun náà kọ́ sí orí rẹ̀ kí ó wo ilé rẹ̀ palẹ̀ a ó sì sọ ọ́ di ààtàn.'}

In [None]:
# Smoke-test prompt construction on the sample row and display its TTS prompt.
ps.get_prompt(k)["tts"]

'<|im_start|>\n<|tts|>\n<|text_start|>siwaju<|text_sep|>si<|text_sep|>i<|text_sep|>mo<|text_sep|>pase<|text_sep|>pe<|text_sep|>ti<|text_sep|>enikeni<|text_sep|>ba<|text_sep|>yi<|text_sep|>ase<|text_sep|>yii<|text_sep|>pada<|text_sep|>ki<|text_sep|>fa<|text_sep|>igi<|text_sep|>aja<|text_sep|>ile<|text_sep|>re<|text_sep|>yo<|text_sep|>jade<|text_sep|>ki<|text_sep|>a<|text_sep|>si<|text_sep|>gbe<|text_sep|>duro<|text_sep|>ki<|text_sep|>a<|text_sep|>si<|text_sep|>fi<|text_sep|>oun<|text_sep|>naa<|text_sep|>ko<|text_sep|>si<|text_sep|>ori<|text_sep|>re<|text_sep|>ki<|text_sep|>o<|text_sep|>wo<|text_sep|>ile<|text_sep|>re<|text_sep|>pale<|text_sep|>a<|text_sep|>o<|text_sep|>si<|text_sep|>so<|text_sep|>o<|text_sep|>di<|text_sep|>aatan<|text_end|>\n<|yoruba|\n<|audio_start|>\nsiwaju<|t_1.84|><|code_start|><|484|><|193|><|139|><|765|><|165|><|227|><|156|><|167|><|244|><|167|><|244|><|453|><|453|><|453|><|244|><|167|><|453|><|244|><|235|><|219|><|235|><|219|><|167|><|244|><|167|><|244|><|167|><|

In [None]:
# Re-declare the audio column with a 24 kHz sampling rate, the same rate the
# quantize functions resample to before encoding.
# NOTE(review): presumably this makes rows decode at 24 kHz on access — confirm
# against the datasets documentation.
data_yoruba = data_yoruba.cast_column("audio", Audio(sampling_rate=24000))

In [None]:
# Display the dataset summary again after the audio column cast.
data_yoruba

Dataset({
    features: ['audio', 'text'],
    num_rows: 15188
})

In [None]:
# Row window handled by the tokenization loop: from `start` up to (not including) `end`.
start, end = 0, len(data_yoruba)

In [None]:
# Total number of rows the tokenization loop will cover.
print(end)

15188


In [None]:
import pandas as pd

In [None]:
import os

# Rows per CSV shard; also the stride of the loop below.
CHUNK_SIZE = 1000
OUTPUT_DIR = "/content/drive/MyDrive/naij_tokenized"
# Create the output folder up front so a missing directory cannot waste a fully
# tokenized chunk at the moment its CSV is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# `start` is advanced in place only after a shard is written, so re-running this
# cell after an interruption resumes with the first unfinished chunk.
while start<end:
  end_local=min(start+CHUNK_SIZE,end)

  print(start)
  data_1000=data_yoruba.select(range(start,end_local)).map(
      ps.get_prompt,
      remove_columns=["audio","text"],
      )
  chunk_df=pd.DataFrame(data_1000)

  # get_prompt replaces failed rows with a placeholder instead of raising;
  # report how many there are so bad shards are noticed before training.
  failed=int((chunk_df["tts"]=="An error occurred").sum())
  if failed:
    print(f"  {failed} of {len(chunk_df)} rows could not be processed")

  chunk_df.to_csv(f"{OUTPUT_DIR}/yoruba_yts_{(start+1)//CHUNK_SIZE}.csv")

  start+=CHUNK_SIZE

0


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

2000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

3000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

4000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

5000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

6000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

7000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

8000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

9000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

10000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

11000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

12000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

13000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

14000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

15000


Map:   0%|          | 0/188 [00:00<?, ? examples/s]