In [None]:
# import sys
# !{sys.executable} -m pip install datasets==3.6.0
# !{sys.executable} -m pip install librosa soundfile

import os
import tempfile

import pandas as pd
import scipy.io.wavfile as wavfile
from datasets import load_dataset
from google.cloud import storage

In [None]:
# Load the FLEURS validation and test splits for the en_us, cmn_hans_cn and
# ko_kr configurations. Each pair is loaded in (validation, test) order.
fleurs_splits = ("validation", "test")

fleurs_valid_en, fleurs_test_en = [
    load_dataset("google/fleurs", "en_us", split=split_name)
    for split_name in fleurs_splits
]

fleurs_valid_cmn, fleurs_test_cmn = [
    load_dataset("google/fleurs", "cmn_hans_cn", split=split_name)
    for split_name in fleurs_splits
]

fleurs_valid_kr, fleurs_test_kr = [
    load_dataset("google/fleurs", "ko_kr", split=split_name)
    for split_name in fleurs_splits
]

In [None]:
# Number of clips taken per language: train_size from the validation split
# (used as training data below), test_size from the test split.
train_size, test_size = 30, 20

In [None]:
def _sample_wav_paths(prefix, count):
    """Return ``count`` local WAV paths named ./sample-audio/<prefix>-<idx>.wav."""
    return [f'./sample-audio/{prefix}-{idx}.wav' for idx in range(count)]


# Local paths for the training clips (one list per language).
local_file_names_en = _sample_wav_paths('en', train_size)
local_file_names_cmn = _sample_wav_paths('cmn', train_size)
local_file_names_kr = _sample_wav_paths('kr', train_size)

# Local paths for the test clips (one list per language).
local_file_names_test_en = _sample_wav_paths('test-en', test_size)
local_file_names_test_cmn = _sample_wav_paths('test-cmn', test_size)
local_file_names_test_kr = _sample_wav_paths('test-kr', test_size)

In [None]:
# Save the first `train_size` validation clips of each language as local WAVs.
#
# Changes from the previous version:
# - Rows are selected before the audio column (dataset[idx]["audio"]) so only
#   one clip is decoded per access, instead of dataset["audio"][idx], which
#   materialises the whole audio column each time.
# - The unused NamedTemporaryFile(delete=False) handles were removed; they
#   left one stray temp file on disk per write.
# - The output directory is created if it is missing.
train_sources = (
    (fleurs_valid_en, local_file_names_en),
    (fleurs_valid_cmn, local_file_names_cmn),
    (fleurs_valid_kr, local_file_names_kr),
)

for idx in range(train_size):
    for dataset, wav_paths in train_sources:
        audio = dataset[idx]["audio"]
        wav_path = wav_paths[idx]
        # wavfile.write opens the target path directly and does not create
        # missing parent directories.
        os.makedirs(os.path.dirname(wav_path) or ".", exist_ok=True)
        wavfile.write(wav_path, audio["sampling_rate"], audio["array"])


In [None]:
# Save the first `test_size` test-split clips of each language as local WAVs.
#
# Changes from the previous version:
# - Rows are selected before the audio column (dataset[idx]["audio"]) so only
#   one clip is decoded per access, instead of dataset["audio"][idx], which
#   materialises the whole audio column each time.
# - The unused NamedTemporaryFile(delete=False) handles were removed; they
#   left one stray temp file on disk per write.
# - The output directory is created if it is missing.
test_sources = (
    (fleurs_test_en, local_file_names_test_en),
    (fleurs_test_cmn, local_file_names_test_cmn),
    (fleurs_test_kr, local_file_names_test_kr),
)

for idx in range(test_size):
    for dataset, wav_paths in test_sources:
        audio = dataset[idx]["audio"]
        wav_path = wav_paths[idx]
        # wavfile.write opens the target path directly and does not create
        # missing parent directories.
        os.makedirs(os.path.dirname(wav_path) or ".", exist_ok=True)
        wavfile.write(wav_path, audio["sampling_rate"], audio["array"])


In [None]:
# Build the English and cmn test manifests and write them as local CSV files.
# Each manifest has a 'target' column (transcription) and an 'audio' column
# (local WAV path) for the first `test_size` rows of the test split.

df_test_en = (
    pd.DataFrame(fleurs_test_en['transcription'], columns=['target'])
    .head(test_size)
    .assign(audio=local_file_names_test_en)
)
df_test_en.to_csv('./test-en.csv', index=False)

df_test_cmn = (
    pd.DataFrame(fleurs_test_cmn['transcription'], columns=['target'])
    .head(test_size)
    .assign(audio=local_file_names_test_cmn)
)
df_test_cmn.to_csv('./test-cmn.csv', index=False)

In [None]:
# Korean test manifest: transcription in 'target', local WAV path in 'audio'.
# It is not saved here; a later cell edits some targets before writing the CSV.
df_test_kr = (
    pd.DataFrame(fleurs_test_kr['transcription'], columns=['target'])
    .head(test_size)
    .assign(audio=local_file_names_test_kr)
)

In [None]:
# Overwrite selected Korean test targets before saving, to prevent wrong
# evaluation. Keys are row labels of df_test_kr; values are the replacement
# transcriptions.
kr_test_target_fixes = {
    6: "1940년 8월 15일 연합군은 프랑스 남부를 침략했고 이 침략은 드래군 작전이라 불렸다",
    12: "사건 발생 이후 깁슨은 병원으로 이송되었으나 얼마 후 숨을 거뒀다",
    13: "기술 결정론에 대한 대부분의 해석은 두 가지 개념을 갖습니다 기술 개발 자체가 문화나 정치 이상의 영향력을 보인다는 것과 기술은 사회적으로 통제되는 것이 아니라 오히려 사회에 본질적인 영향을 미칩니다",
}
for row_label, fixed_text in kr_test_target_fixes.items():
    df_test_kr.loc[row_label, 'target'] = fixed_text

df_test_kr.to_csv('./test-kr.csv', index=False)

In [None]:
# !gcloud auth application-default login

In [None]:
# Upload to GCS
def upload_wav_to_gcs(bucket_name, source_file_name, destination_blob_name, verbose=False, storage_client=None):
    """Upload a local WAV file to a Google Cloud Storage bucket.

    Failures are reported with ``print`` instead of being raised, so a batch
    of uploads keeps going after one file fails. The return value lets the
    caller detect which uploads did not succeed.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.
        verbose: When True, print a message before and after the upload.
        storage_client: Optional client object exposing ``bucket(name)``.
            When omitted, a new ``storage.Client()`` is created for this
            call; pass one in to reuse a single client across many uploads.

    Returns:
        bool: True if the upload completed, False if creating the client or
        uploading the file raised an exception.
    """
    if storage_client is None:
        try:
            storage_client = storage.Client()
        except Exception as e:
            print(f"Error: {e}")
            return False

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    if verbose:
        print(f"File upload started: '{source_file_name}' -> 'gs://{bucket_name}/{destination_blob_name}'")

    try:
        blob.upload_from_filename(source_file_name, content_type='audio/wav')
    except Exception as e:
        print(f"Error when Uploading: {e}")
        return False

    if verbose:
        print(f"File '{source_file_name}' Uploaded to '{destination_blob_name}'")
    return True

In [None]:
# Upload every training WAV to gs://<BUCKET_NAME>/inputs/audio/<file name>.
# For each index the English, cmn and Korean clips are uploaded in that order.
BUCKET_NAME = '$YOUR_BUCKET_NAME'

for idx in range(train_size):
    for local_paths in (local_file_names_en, local_file_names_cmn, local_file_names_kr):
        local_wav_file = local_paths[idx]
        # Keep only the final path component as the object's file name.
        file_name = 'inputs/audio/' + local_wav_file.rsplit('/', 1)[-1]
        upload_wav_to_gcs(BUCKET_NAME, local_wav_file, file_name, verbose=True)



In [None]:
# gs:// URIs of the uploaded training clips, one list per language.
gcs_audio_prefix = f'gs://{BUCKET_NAME}/inputs/audio'

gcs_file_names_en = [f'{gcs_audio_prefix}/en-{idx}.wav' for idx in range(train_size)]
gcs_file_names_cmn = [f'{gcs_audio_prefix}/cmn-{idx}.wav' for idx in range(train_size)]
gcs_file_names_kr = [f'{gcs_audio_prefix}/kr-{idx}.wav' for idx in range(train_size)]

In [None]:
# Training manifests: the first `train_size` validation-split transcriptions
# in 'target', paired with the gs:// URI of the matching clip in 'audio'.
df_train_en = (
    pd.DataFrame(fleurs_valid_en['transcription'], columns=['target'])
    .head(train_size)
    .assign(audio=gcs_file_names_en)
)

df_train_cmn = (
    pd.DataFrame(fleurs_valid_cmn['transcription'], columns=['target'])
    .head(train_size)
    .assign(audio=gcs_file_names_cmn)
)

df_train_kr = (
    pd.DataFrame(fleurs_valid_kr['transcription'], columns=['target'])
    .head(train_size)
    .assign(audio=gcs_file_names_kr)
)

In [None]:
# Overwrite selected Korean training targets, to prevent wrong training.
# Keys are row labels of df_train_kr; values are the replacement
# transcriptions.
kr_train_target_fixes = {
    4: "폴란드 남자 시각 장애인 스키 선수 마키 크레젤과 가이드 안나 오가진스카가 슈퍼대회전에서 13위로 경기를 마쳤다 한국의 박종석은 남자 좌식스키 슈퍼대회전에서 24위를 차지했다",
    6: "스키여행은 스키광 스키범 이라고 불리는 애호가들이 즐기는 여행 방식이다 그 사람들은 휴가 일정을 어느 한 특정 장소에서 줄곧 스키를 타는 시간으로 계획한다",
    11: "몰 박사는 일부 환자가 병원에서 질병에 감염되었을 수 있다고 생각하며 그중 적어도 2명은 건강한 병원 근로자였을 것으로 생각했습니다",
    15: "대부분의 경우 해외 갭 이어 코스에 등록하면 실제로 고국에서 고등 교육에 진학할 가능성을 높일 수 있습니다",
    22: "108 접시의 춰펀 버헝 힌두교에서는 신에게 바치는 단것 과일 견과류 요리 등 56가지 다른 먹을거리들가 바바시함 에게 차려졌습니다",
}
for row_label, fixed_text in kr_train_target_fixes.items():
    df_train_kr.loc[row_label, 'target'] = fixed_text

In [None]:
def upload_df_to_gcs_as_jsonl(df: pd.DataFrame, bucket_name: str, destination_blob_name: str):
    """Serialize ``df`` as JSON Lines and upload it to a GCS object.

    Each row becomes one JSON record per line; non-ASCII characters are
    written as-is rather than escaped. A confirmation line is printed once
    the upload call returns.

    Args:
        df: Frame to serialize.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name to create inside the bucket.
    """
    # Serialize first so a serialization error happens before any client is
    # created.
    payload = df.to_json(orient='records', lines=True, force_ascii=False)

    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_string(payload, content_type='application/json')

    print(f"Uploaded : {bucket_name} / {destination_blob_name}")

In [None]:
# Destination object names for the per-language training manifests.
train_data_path_en = 'inputs/train_audio_en.jsonl'
train_data_path_cmn = 'inputs/train_audio_cmn.jsonl'
train_data_path = 'inputs/train_audio_kr.jsonl'

# Upload the manifests in the order English, cmn, Korean.
for manifest_df, manifest_path in (
    (df_train_en, train_data_path_en),
    (df_train_cmn, train_data_path_cmn),
    (df_train_kr, train_data_path),
):
    upload_df_to_gcs_as_jsonl(manifest_df, BUCKET_NAME, manifest_path)