### Convert to a proper format

In [7]:
from pydub import AudioSegment
import os

# Define a function to convert audio files to the correct format
def preprocess_audio(file_path, output_directory):
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000)
        output_file = os.path.join(output_directory, os.path.basename(file_path).replace('.flac', '.wav'))
        audio.export(output_file, format='wav')
        print(f"Converted {file_path} to WAV format and saved to {output_file}")
    except Exception as e:
        print(f"Error converting {file_path}: {str(e)}")

# Specify the directory containing your FLAC files
audio_directory = 'audios'

# Specify the directory where you want to save the converted WAV files
output_directory = 'audios_wav'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# List audio files in the directory
audio_files = [os.path.join(audio_directory, filename) for filename in os.listdir(audio_directory) if filename.endswith('.flac')]

# Apply the function to all audio files
for file in audio_files:
    preprocess_audio(file, output_directory)


Converted audios/audio01_00.flac to WAV format and saved to audios_wav/audio01_00.wav
Converted audios/audio01_01.flac to WAV format and saved to audios_wav/audio01_01.wav
Converted audios/audio01_02.flac to WAV format and saved to audios_wav/audio01_02.wav
Converted audios/audio01_03.flac to WAV format and saved to audios_wav/audio01_03.wav
Converted audios/audio01_04.flac to WAV format and saved to audios_wav/audio01_04.wav
Converted audios/audio01_05.flac to WAV format and saved to audios_wav/audio01_05.wav
Converted audios/audio01_06.flac to WAV format and saved to audios_wav/audio01_06.wav
Converted audios/audio01_07.flac to WAV format and saved to audios_wav/audio01_07.wav
Converted audios/audio01_08.flac to WAV format and saved to audios_wav/audio01_08.wav
Converted audios/audio01_09.flac to WAV format and saved to audios_wav/audio01_09.wav
Converted audios/audio01_10.flac to WAV format and saved to audios_wav/audio01_10.wav
Converted audios/audio01_11.flac to WAV format and sav

### get list of converted files from `audios_wav` folder

In [1]:
import os
import glob

# Define the path to your audio files
audio_path = 'audios_wav'

# Get a list of all audio files in the directory
audio_files = glob.glob(os.path.join(audio_path, '*'))

### get list of transcriptions

In [3]:
import csv

# Specify the path to your CSV file
csv_file_path = 'dataset - dataset.csv'

# Create an empty list to store transcriptions
transcriptions = []

# Open the CSV file and parse its contents
with open(csv_file_path, newline='') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        transcriptions.append(row['transcription'])

# Now, 'transcriptions' is a list containing all the transcriptions from the CSV file
print(transcriptions)

['naku naman ang hirap nito pero sige na nga subu subukan ko', 'ayon sa kwento ng mga matatanda dito sa batangas noong unang panahon may isang magiting na batangueno na nagtagumpay sa pakikipaglaban sa mga kastila', 'siya ay isang mandirigma na matapang at mayroong malaking puso para sa kanyang bayan', 'sa bawat tagumpay niya lagi niyang sinasabi mabuhay ang batangas', 'kaya hanggang ngayon kapag may nagawa kaming maganda dito sa bayan namin lagi namin sinasabi ang kanyang pangalan at pinupuri namin ang kanyang kabayanihan', 'hindi laang naman pakikipaglaban ang sikat sa batangas', 'sikat din ang mga pagkaing gawa dito tulad ng mga bulalo goto lomi', 'at pati na rin ang mga festival tulad ng parada ng lechon at sublian festival', 'at syempre hindi mawawala ang salitang eh sa bawat usapan ng mga taga batangas', 'ganun talaga ang tono namin tono namin eh', 'pero wag kayong mag alala hindi naman kami galit', 'masayahin laang talaga ang mga batangueno', 'kaya kung gusto ninyong magpakain s

### create dataset file

In [6]:
from datasets import Dataset

# Create a dictionary with the audio files and transcriptions
data = {'audio': audio_files, 'transcription': transcriptions}

# Create the dataset
dataset = Dataset.from_dict(data)

  from .autonotebook import tqdm as notebook_tqdm


### create train-test split

In [8]:
from sklearn.model_selection import train_test_split

train_dataset, test_dataset = train_test_split(dataset, test_size=0.2)

train_data = Dataset.from_dict(train_dataset)
test_data = Dataset.from_dict(test_dataset)

### push train-test split to hf

In [14]:
from datasets import DatasetDict

# Combine train and test splits into a DatasetDict
dataset_dict = DatasetDict({'train': train_data, 'test': test_data})

# Push the combined dataset to your Hugging Face repository
dataset_dict.push_to_hub("jrs-a/batangueno-accent")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 619.27ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 642.90ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Downloading metadata: 100%|██████████| 323/323 [00:00<00:00, 1.04MB/s]
