# fetch data and inspect

In [1]:
from datasets import load_dataset
train = load_dataset("google/fleurs", "ga_ie", split="train")
val = load_dataset("google/fleurs", "ga_ie", split="validation")
test = load_dataset("google/fleurs", "ga_ie", split="test")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
audio_input = train[0]["audio"]

In [3]:
print(audio_input)

{'path': 'train/10009174761044778838.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00030464,
       -0.00026166, -0.00036532]), 'sampling_rate': 16000}


In [4]:
train.column_names

['id',
 'num_samples',
 'path',
 'audio',
 'transcription',
 'raw_transcription',
 'gender',
 'lang_id',
 'language',
 'lang_group_id']

# Add phoneme transcriptions

In [5]:
import pandas as pd

In [6]:
g2p_path="../../data/g2P/ulster.tsv"
g2p_file = pd.read_csv(g2p_path,sep="\t", names=["word","phonemes"])
# turn df into dict for simple lookup
g2p_dict = g2p_file.set_index("word")["phonemes"].to_dict()

In [7]:
def sent2phones(row):
    sentence = row["transcription"]
    words = [x.strip(" .,!?:;") for x in sentence.split()]
    
    phoneme_seq = []
    for word in words:
        if word in g2p_dict:
            phoneme_seq.append(g2p_dict[word].replace(" ",""))
        elif word.lower() in g2p_dict:
            phoneme_seq.append(g2p_dict[word.lower()].replace(" ",""))
        else:
            phoneme_seq.append("[UNK]")
    
    return {"phoneme_sentence": "|".join(phoneme_seq)}

In [8]:
sent2phones({"transcription":"dia duit"})

{'phoneme_sentence': 'ˈdʲia|ˈd̪ˠitʲ'}

In [9]:
train

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 2845
})

In [10]:
train = train.map(sent2phones)

In [11]:
val = val.map(sent2phones)

In [12]:
test = test.map(sent2phones)

In [24]:
train[0]

{'id': 571,
 'num_samples': 172800,
 'path': '/home/peter/.cache/huggingface/datasets/downloads/extracted/a870d8d6658a3c263aa78e1d5ab46b448872cfe59db75af5d50a98907d873444/10009174761044778838.wav',
 'audio': {'path': 'train/10009174761044778838.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00030464,
         -0.00026166, -0.00036532]),
  'sampling_rate': 16000},
 'transcription': 'nuair a bhíonn tréith feinitíopach ar leith i bpáirt ag gach duine i ndaonra áirithe tugtar daonra monómorfach orthu',
 'raw_transcription': 'Nuair a bhíonn tréith feinitíopach ar leith i bpáirt ag gach duine i ndaonra áirithe, tugtar daonra monómorfach orthu.',
 'gender': 0,
 'lang_id': 27,
 'language': 'Irish',
 'lang_group_id': 0,
 'phoneme_sentence': 'ˈn̻ˠuːɾʲ|ə|ˈvʲiːn̻ˠ|ˈtʲɾʲeː|[UNK]|ˈeɾʲ|ˈl̻ʲehʲ|ˈi|bˠaːɾˠtʲ|ˈeɟ|ˈɡah|ˈd̪ˠinʲə|ˈi|n̻ˠiːn̻ˠɾˠə|ˈaːɾʲihʲə|t̪ˠuɡt̪ˠəɾˠ|ˈd̪ˠiːn̻ˠɾˠə|[UNK]|ˈoɾˠhu'}

# Save to disk

In [57]:
fleurs_path = "/home/peter/Desktop/Studies/Thesis/ThesisProject/data/fleurs"
train.save_to_disk(fleurs_path+"/train")
val.save_to_disk(fleurs_path+"/val")
test.save_to_disk(fleurs_path+"/test")


Saving the dataset (6/6 shards): 100%|██████████| 2845/2845 [00:06<00:00, 448.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 369/369 [00:09<00:00, 37.59 examples/s] 
Saving the dataset (2/2 shards): 100%|██████████| 842/842 [00:06<00:00, 123.53 examples/s]


To reload, use...

from datasets import load_from_disk

reloaded_encoded_dataset = load_from_disk("path/of/my/dataset/directory")

## save as csv

In [36]:
train_df['audio'][0].keys()

dict_keys(['bytes', 'path'])

In [37]:
train_df = train.to_pandas()

train_out = train_df[['audio', 'transcription', 'phoneme_sentence']]

In [38]:
train_out['audio_path'] = train_out['audio'].apply(lambda x: x['path'])

train_out = train_out.drop(columns=['audio'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_out['audio_path'] = train_out['audio'].apply(lambda x: x['path'])


In [39]:
train_out.head()

Unnamed: 0,transcription,phoneme_sentence,audio_path
0,nuair a bhíonn tréith feinitíopach ar leith i ...,ˈn̻ˠuːɾʲ|ə|ˈvʲiːn̻ˠ|ˈtʲɾʲeː|[UNK]|ˈeɾʲ|ˈl̻ʲehʲ...,train/10009174761044778838.wav
1,250 bliain ina dhiadh tá guinness fásta ina gh...,[UNK]|ˈbʲlʲianʲ|ˈin̻ˠə|[UNK]|ˈt̪ˠaː|[UNK]|ˈfˠa...,train/10011062099947436666.wav
2,sa tuarascáil mhíosúil is déanaí dúirt opec go...,ˈsˠə|ˈt̪ˠuaɾˠəsˠkalʲ|vʲiːsˠulʲ|ˈisˠ|ˈdʲeːn̻ˠiː...,train/10011703409297587839.wav
3,ba iad na tráchtanna a rinneadh beo ar an teil...,ˈbˠə|ˈiad̪ˠ|ˈn̻ˠə|t̪ˠɾˠaːɾˠt̪ˠən̻ˠə|ə|ˈɾˠin̻ʲu...,train/10012130565025110261.wav
4,tá rith meánraoin ina spórt measartha saor tá ...,ˈt̪ˠaː|ˈɾˠihʲ|ˈmʲaːn̻ˠˈɾˠiːnʲ|ˈin̻ˠə|ˈsˠpˠoːɾˠ...,train/10014922500849696344.wav


In [40]:
train_out.to_csv("../../data/fleurs/fleurs_phonemes.csv", index=False)