# Convert Irish words
Converts the orthographic transcriptions in the Common Voice corpus to phonemic (IPA) representations, using Ulster pronunciation as the standard.

In [1]:
# imports
import pandas as pd
import nltk, os


In [2]:
# Pronunciation lexicon: a two-column TSV read without a header row
# (word, space-separated phoneme string).
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook, as is done for the transcription CSV — confirm the
# directory layout before changing it.
tsvPath = "/home/peter/Desktop/Studies/Thesis/ThesisProject/data/g2P/ulster.tsv"
g2pFile = pd.read_csv(
    tsvPath,
    sep="\t",
    names=["word", "phonemes"],
    # Read everything as text and switch off pandas' default NA markers, so
    # entries such as "nan"/"null" or a blank phoneme field stay strings
    # instead of becoming float NaN (which would break later .split()/join()).
    dtype=str,
    keep_default_na=False,
)

In [3]:
# Preview the first five lexicon rows.
g2pFile.head(n=5)

Unnamed: 0,word,phonemes
0,á,aː
1,a,ə
2,a'am,a mˠ
3,a'at,a t̪ˠ
4,ab,a bˠ


# process text for Common Voice recordings

In [4]:
# Path (relative to this notebook) of the Common Voice transcription CSV that is
# read into `transcriptions` below.
cVRecPath = "../../data/commonVoiceData/transcription/owsm_v3.1/001.csv"


## task flow
1. Read the TSV into a Pandas DataFrame (or a dictionary).
    * Left column = words (e.g., “abacais”)
    * Right column = phoneme transcriptions (e.g., “a bˠ ə k ə ʃ”)
2. Tokenize your input text (if you have entire sentences).
    * Split the sentence into words (taking care of punctuation or special symbols).
    * For each word, do a lookup in your DataFrame/dictionary.
3. Replace each word with its phoneme sequence using the mapping.

    * If a word is not in your dictionary, you’ll need a fallback strategy (e.g., [UNK] token or run a G2P model).
4. Join the phonemes for each word in order to form the full phoneme sequence for the sentence.

In [5]:
# Load the Common Voice transcription table from the CSV at cVRecPath.
transcriptions = pd.read_csv(filepath_or_buffer=cVRecPath)

In [15]:
# Preview the first five transcription rows.
transcriptions.head(n=5)

Unnamed: 0.1,Unnamed: 0,path,sentence
0,0,audio/001/common_voice_ga-IE_17571418.mp3,Tá a mála ar an urlár
1,1,audio/001/common_voice_ga-IE_17571419.mp3,An Phoblacht Doiminiceach
2,2,audio/001/common_voice_ga-IE_17571420.mp3,Do ghnó féin déan a dhuine
3,3,audio/001/common_voice_ga-IE_17571424.mp3,Ní raibh mé
4,4,audio/001/common_voice_ga-IE_17571428.mp3,Dún do bhéal.


In [7]:
# Build a plain word -> phoneme-string mapping for fast lookups.
# If a word occurs more than once, the last row wins.
g2p_dict = dict(zip(g2pFile["word"], g2pFile["phonemes"]))

In [32]:
def sent2phones(sentence, lexicon=None, unk_token="[UNK]"):
    """Convert an orthographic sentence into a space-separated phoneme string.

    The sentence is split on whitespace and each token is looked up in the
    lexicon, trying these forms in order:

    1. the token with `` .,!?:;`` stripped from both ends,
    2. that form lower-cased,
    3. the token with quotes, brackets and dashes also stripped, NFC-normalised
       and with the typographic apostrophe (U+2019) replaced by ``'``,
    4. form 3 lower-cased.

    Tokens that consist only of punctuation are skipped; any other token that
    is not found contributes ``unk_token``.

    Args:
        sentence: The text to convert. Non-string values (e.g. the NaN pandas
            uses for a missing cell) yield an empty string.
        lexicon: Mapping from word to phoneme string. Defaults to the
            notebook-level ``g2p_dict``.
        unk_token: Placeholder emitted for out-of-lexicon words.

    Returns:
        The phoneme strings of all words joined by single spaces.
    """
    import unicodedata

    if lexicon is None:
        lexicon = g2p_dict
    if not isinstance(sentence, str):
        return ""

    # Superset of the basic strip set: adds quotes, brackets, dashes, ellipsis.
    # Apostrophes are deliberately absent because they can be part of a word.
    edge_chars = (
        " .,!?:;\"()[]{}-"
        "\u2013\u2014\u2026\u201c\u201d\u201e\u00ab\u00bb"
    )

    phoneme_seq = []
    for token in sentence.split():
        word = token.strip(" .,!?:;")
        bare = unicodedata.normalize("NFC", token.strip(edge_chars))
        bare = bare.replace("\u2019", "'")

        for candidate in (word, word.lower(), bare, bare.lower()):
            if candidate and candidate in lexicon:
                phoneme_seq.append(lexicon[candidate])
                break
        else:
            # Nothing matched: emit the placeholder only for real words, not
            # for tokens that were pure punctuation (e.g. a stand-alone dash).
            if bare:
                phoneme_seq.append(unk_token)

    return " ".join(phoneme_seq)

In [33]:
# Add a column holding the phonemic rendering of every sentence.
transcriptions["phoneme_sentence"] = transcriptions["sentence"].map(sent2phones)

In [34]:
# Preview the first five rows, now including the phoneme_sentence column.
transcriptions.head(n=5)

Unnamed: 0.1,Unnamed: 0,path,sentence,phoneme_sentence
0,0,audio/001/common_voice_ga-IE_17571418.mp3,Tá a mála ar an urlár,ˈ t̪ˠ aː ə ˈ mˠ aː l̻ˠ ə ˈ e ɾʲ ˈ ə n̻ˠ ˈ uː ɾ...
1,1,audio/001/common_voice_ga-IE_17571419.mp3,An Phoblacht Doiminiceach,ˈ ə n̻ˠ ˈ fˠ o bˠ l̻ˠ a x t̪ˠ ˈ d̪ˠ i mʲ ə nʲ ...
2,2,audio/001/common_voice_ga-IE_17571420.mp3,Do ghnó féin déan a dhuine,ˈ d̪ˠ ə ɣ n̻ˠ oː ˈ h eː nʲ ˈ dʲ eː n̻ˠ ə ˈ ɣ i...
3,3,audio/001/common_voice_ga-IE_17571424.mp3,Ní raibh mé,ˈ n̻ʲ iː ˈ ɾˠ oː vˠ ˈ mʲ eː
4,4,audio/001/common_voice_ga-IE_17571428.mp3,Dún do bhéal.,ˈ d̪ˠ uː n̻ˠ ˈ d̪ˠ ə ˈ vʲ eː l̻ˠ


In [22]:
# Spot-check a single lexicon entry (the last word of the fifth sentence shown above).
g2p_dict["bhéal"]

'ˈ vʲ eː l̻ˠ'

Okay, great. Now we have a new column with phonemic transcriptions. We can tokenize it using the wav2vec2 tokenizer; for that, I think we need a vocabulary of the unique phonemes.

In [None]:
# Collect every distinct phoneme symbol in the lexicon, plus the placeholder
# used for out-of-lexicon words.
phoneme_vocab = {"[UNK]"}
phoneme_vocab |= {
    symbol
    for phoneme_string in g2p_dict.values()
    for symbol in phoneme_string.split()
}


# Data preparation

## tokenize

In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTokenizer

# NOTE(review): the constructor is called with no arguments, so this cell is an
# unfinished stub. The transformers documentation lists `vocab_file` as a
# required argument of Wav2Vec2CTCTokenizer — TODO: write the phoneme
# vocabulary to a JSON file and pass its path (plus the special tokens) here;
# confirm against the installed transformers version.
tokenizer = Wav2Vec2CTCTokenizer(
    
)