# Convert irish words
converts orthographic representation in common voice corpus to phonemic (IPA) representaions using Ulster pronunciation as standard

In [1]:
# imports
import pandas as pd
import nltk, os


In [2]:
tsvPath = "/home/peter/Desktop/Studies/Thesis/ThesisProject/data/g2P/ulster.tsv"
g2pFile = pd.read_csv(tsvPath,sep="\t", names=["word","phonemes"])

In [3]:
g2pFile.head()

Unnamed: 0,word,phonemes
0,á,aː
1,a,ə
2,a'am,a mˠ
3,a'at,a t̪ˠ
4,ab,a bˠ


# process text for Common Voice recordings

In [4]:
cVRecPath = "../../data/commonVoiceData/transcription/owsm_v3.1/001.csv"


## task flow
1. Read the TSV into a Pandas DataFrame (or a dictionary).
    * Left column = words
    * Right column = phoneme transcriptions
2. Tokenize your input text.
    * Split the sentence into words
    * For each word, do a lookup in your DataFrame/dictionary.
3. Replace each word with its phoneme sequence using the mapping.
    * If a word is not in your dictionary, "<unk>"
4. Join the phonemes for each word in order to form the full phoneme sequence (should I use word separator? i think "|" is standard)

In [5]:
transcriptions = pd.read_csv(cVRecPath)

In [6]:
transcriptions.head()

Unnamed: 0.1,Unnamed: 0,path,sentence
0,0,audio/001/common_voice_ga-IE_17571418.mp3,Tá a mála ar an urlár
1,1,audio/001/common_voice_ga-IE_17571419.mp3,An Phoblacht Doiminiceach
2,2,audio/001/common_voice_ga-IE_17571420.mp3,Do ghnó féin déan a dhuine
3,3,audio/001/common_voice_ga-IE_17571424.mp3,Ní raibh mé
4,4,audio/001/common_voice_ga-IE_17571428.mp3,Dún do bhéal.


In [7]:
# make dict for easy look up
g2p_dict = g2pFile.set_index("word")["phonemes"].to_dict()

In [8]:
def sent2phones(sentence):
    words = [x.strip(" .,!?:;") for x in sentence.split()]
    
    phoneme_seq = []
    for word in words:
        if word in g2p_dict:
            phoneme_seq.append(g2p_dict[word])
        elif word.lower() in g2p_dict:
            phoneme_seq.append(g2p_dict[word.lower()])
        else:
            phoneme_seq.append("[UNK]")
    
    return " ".join(phoneme_seq)

In [9]:
transcriptions["phoneme_sentence"] = transcriptions["sentence"].apply(sent2phones)

In [10]:
transcriptions.head()

Unnamed: 0.1,Unnamed: 0,path,sentence,phoneme_sentence
0,0,audio/001/common_voice_ga-IE_17571418.mp3,Tá a mála ar an urlár,ˈ t̪ˠ aː ə ˈ mˠ aː l̻ˠ ə ˈ e ɾʲ ˈ ə n̻ˠ ˈ uː ɾ...
1,1,audio/001/common_voice_ga-IE_17571419.mp3,An Phoblacht Doiminiceach,ˈ ə n̻ˠ ˈ fˠ o bˠ l̻ˠ a x t̪ˠ ˈ d̪ˠ i mʲ ə nʲ ...
2,2,audio/001/common_voice_ga-IE_17571420.mp3,Do ghnó féin déan a dhuine,ˈ d̪ˠ ə ɣ n̻ˠ oː ˈ h eː nʲ ˈ dʲ eː n̻ˠ ə ˈ ɣ i...
3,3,audio/001/common_voice_ga-IE_17571424.mp3,Ní raibh mé,ˈ n̻ʲ iː ˈ ɾˠ oː vˠ ˈ mʲ eː
4,4,audio/001/common_voice_ga-IE_17571428.mp3,Dún do bhéal.,ˈ d̪ˠ uː n̻ˠ ˈ d̪ˠ ə ˈ vʲ eː l̻ˠ


Okay, great. now we have a new column with phonemic transcriptions. We can tokenize using wav2vec2 or wav2vec2phoneme tokenizer. Let's create a vocabulary unique to the phonemes present in the Ulster g2p Jim generated.

https://huggingface.co/docs/tokenizers/en/api/tokenizer

In [12]:
import json

In [13]:
vocab_file_path = "../../data/phoneme_vocab.json"

phoneme_vocab = set()
for phonemes in g2p_dict.values():
    phoneme_vocab.update(phonemes.split())
# add some special tokens that are present in https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme    
phoneme_vocab.update(["<s>", "</s>", "<unk>", "<pad>"])

# now sort and index the set as dict as "phoneme: index"
sorted_ph_vocab = {ph: i for i, ph in enumerate(sorted(phoneme_vocab))}

with open(vocab_file_path, "w") as file:
    json.dump(sorted_ph_vocab, file)

# Data preparation
https://huggingface.co/blog/fine-tune-wav2vec2-english

In [2]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2PhonemeCTCTokenizer
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


## tokenize

In [14]:
tokenizer = Wav2Vec2PhonemeCTCTokenizer(
    vocab_file_path
)

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,         # feature dimension of extracted features
    sampling_rate=16000,    # in Hz
    padding_value=0.0,      # value used to fill padding
    do_normalize=True,      # zero mean unit-variance normalize input. can improve performance
    return_attention_mask=True# whether call() should return attention mask

)
# wraps FE and tokenizer to a single processor
processor = Wav2Vec2Processor(feature_extractor,tokenizer)

## preprocess data
Okay now we start with the audio. First I need to resample my audio since it's in .mp3 format with 48000 Hz sampling rate (wav2vec2 takes .wav with 16k)
https://pytorch.org/audio/stable/transforms.html
https://pytorch.org/audio/stable/generated/torchaudio.transforms.Resample.html#torchaudio.transforms.Resample

### resample audio

In [None]:
audio_folder_path = "../../data/commonVoiceData/audio/001/"
output_path = audio_folder_path[:-1]+"wav/"

for file_path in os.listdir(audio_folder_path):
    waveform, orig_sample_rate = torchaudio.load(audio_folder_path+file_path)
    
    resampler = torchaudio.transforms.Resample(
        orig_freq=orig_sample_rate,
        new_freq=16000
        )
    
    new_waveform = resampler(waveform)
    
    torchaudio.save(output_path+file_path[:-3]+"wav", 
                    new_waveform, 
                    16000)

### preprocess dataset

In [None]:
def process sample

# Finetune Wav2Vec2 model 

In [None]:
from transformers import Wav2Vec2ModelForCTC
model = Wav2Vec2ModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    torch_dtype=torch.float16, 
    attn_implementation="flash_attention_2"
    )
