In [12]:
import os
import pandas as pd
import tempfile
import multiprocessing
from functools import partial
import librosa 
from pydub import AudioSegment

In [4]:
# Scratch directory plus the on-disk layout of the Common Voice corpus.
temp_dir = tempfile.gettempdir()
common_voice_path = os.path.abspath("/data")
audio_files_path = os.path.join(common_voice_path, "clips")
data_file_filename = "validated.tsv"
data_file_path = os.path.join(common_voice_path, data_file_filename)

# Load the clip metadata and keep only rows recorded by male speakers that
# have more than one up-vote and fewer than one down-vote.
df = (
    pd.read_csv(data_file_path, sep='\t', low_memory=False)
    .loc[lambda frame: (frame['down_votes'] < 1)
         & (frame['gender'] == 'male')
         & (frame['up_votes'] > 1)]
)
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
9,00c3f0e7c691ef30257d1bfa9adc410535b7ba3f48e344...,common_voice_en_18295850.mp3,The long-lived bridge still stands today.,2,0,twenties,male,
70,05ba9bb1a4ac391849fa4461547967768f4d7df8ee52d7...,common_voice_en_19967535.mp3,The cemetery is now managed by three trusts.,2,0,fifties,male,african
81,06546553aed17027b4e638d4afb56f39b216026088cf40...,common_voice_en_17147389.mp3,Women form less than half of the group.,2,0,twenties,male,us
100,0838a82655be5a61349c2d2d86b60c22b5b84fea9826cb...,common_voice_en_18127728.mp3,Sunburn can be avoided by applying sunscreen o...,2,0,twenties,male,
104,0899979e8d43a9faf448ddb5f4fc9a38a0fb4c120eaf34...,common_voice_en_17850951.mp3,Still waters run deep.,2,0,twenties,male,other


# Description
We would like to build a one-to-one dataset for Indian-to-US accent conversion.

We will take the most frequent speaker for each accent and attempt to convert between the two.

The general steps are:
1. Find the speakers with the highest number of sentences spoken
2. Convert data from mp3 to wav
3. Store the converted files in a separate folder per accent so they can be reused by other experiments

In [5]:
from data_util import build_speaker_dataset

In [6]:
# Narrow the filtered metadata to Indian-accent rows, then build the speaker dataset.
# NOTE(review): the second argument (1) presumably selects the single most
# frequent speaker -- confirm against data_util.build_speaker_dataset.
indian_accent_rows = df.loc[df['accent'] == 'indian']
indian_df = build_speaker_dataset(indian_accent_rows, 1)

In [7]:
# Narrow the filtered metadata to US-accent rows, then build the speaker dataset.
# NOTE(review): the second argument (1) presumably selects the single most
# frequent speaker -- confirm against data_util.build_speaker_dataset.
us_accent_rows = df.loc[df['accent'] == 'us']
us_df = build_speaker_dataset(us_accent_rows, 1)

In [8]:
# Report how many rows each accent-specific frame contains.
print(f"Retrieved {len(indian_df)} records with indian accents and {len(us_df)} records with us accents")

Retrieved 576 records with indian accents and 7827 records with us accents


In [13]:
def mp3_to_wav(src_path, target_path):
    """
    Read mp3 file from source path, convert it to wav and write it to target path.
    Necessary libraries: ffmpeg, libav.

    The process working directory is left untouched, so relative paths are
    resolved against the caller's current directory.

    :param src_path: source mp3 file path
    :param target_path: target wav file path
    """
    # Deliberately no os.chdir(): changing into the source directory mutated
    # process-wide state, raised for bare file names (empty directory part),
    # and made a relative src_path resolve against the wrong directory.
    exported = AudioSegment.from_mp3(src_path).export(target_path, format='wav')
    # export() hands back the file object it wrote to; close it explicitly so
    # converting thousands of clips does not accumulate open file handles.
    exported.close()

In [24]:
def process_each_file(audio_file_name, audio_files_path, output_dir):
    """
    Convert a single clip to wav and report where the result was written.

    :param audio_file_name: file name of the clip inside ``audio_files_path``
    :param audio_files_path: directory holding the source mp3 clips
    :param output_dir: directory that receives the converted wav file
    :return: path of the wav file inside ``output_dir``
    """
    source_mp3 = os.path.join(audio_files_path, audio_file_name)
    # The wav keeps the clip's name, with every ".mp3" occurrence swapped for ".wav".
    target_file_path = os.path.join(output_dir, audio_file_name.replace(".mp3", ".wav"))
    mp3_to_wav(source_mp3, target_file_path)
    return target_file_path

def convert_to_wav(df, audio_files_path, output_dir):
    """
    Convert every clip listed in ``df['path']`` from mp3 to wav in parallel.

    One worker process is started per CPU reported by
    ``multiprocessing.cpu_count()``; the pool is shut down as soon as all
    conversions have finished.

    :param df: DataFrame with a ``path`` column holding the mp3 file names
    :param audio_files_path: directory holding the source mp3 clips
    :param output_dir: directory that receives the converted wav files
    :return: list of the wav file paths written, in the same order as ``df['path']``
    """
    input_mp3_files = df['path'].to_numpy()
    worker = partial(process_each_file, audio_files_path=audio_files_path, output_dir=output_dir)

    # The context manager tears the pool down on exit, so worker processes are
    # not left running after the map call has completed.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        # Return the mapped paths; previously they were computed and then
        # discarded, leaving callers with None.
        return pool.map(worker, input_mp3_files)


In [27]:
def create_dataset(df, output_dir):
    """
    Ensure ``output_dir`` exists, then convert the clips listed in ``df`` into it.

    The source clips are read from the notebook-level ``audio_files_path``.

    :param df: DataFrame forwarded unchanged to ``convert_to_wav``
    :param output_dir: destination directory; created if it does not exist
    :return: whatever ``convert_to_wav`` returns
    """
    os.makedirs(output_dir, exist_ok=True)
    conversion_result = convert_to_wav(df, audio_files_path, output_dir)
    return conversion_result

In [2]:
# Destination directories for the converted wav files: one per accent, plus a
# reduced US set.
indian_output_dir, us_output_dir, us_output_dir_small = (
    "/data/wav/v2/indian",
    "/data/wav/v2/us",
    "/data/wav/v2/us_small",
)

In [30]:
# Convert the clips listed in indian_df into indian_output_dir.
indian_wav_files = create_dataset(df=indian_df, output_dir=indian_output_dir)

In [31]:
# Convert the clips listed in us_df into us_output_dir.
us_wav_files = create_dataset(df=us_df, output_dir=us_output_dir)

In [1]:
# Create a smaller subset of the US recordings, the same size as the Indian dataset

In [3]:
import os 

# os.listdir() returns entries in arbitrary order, so sort the listings to make
# the selected subset reproducible across runs and machines. Only ".wav" files
# are considered, so stray entries in either directory cannot skew the count
# or end up in the subset.
indian_wav_names = sorted(
    entry for entry in os.listdir(indian_output_dir) if entry.endswith(".wav")
)
desired_num_records = len(indian_wav_names)

full_us_records = sorted(
    entry for entry in os.listdir(us_output_dir) if entry.endswith(".wav")
)
# Take as many US recordings as there are Indian ones.
desired_us_records = full_us_records[:desired_num_records]

In [7]:
# Start from an empty us_small directory: remove any previous copy, then
# recreate it (together with missing parents).
# NOTE(review): this path is hard-coded and must be kept in sync with
# `us_output_dir_small`.
!rm -rf /data/wav/v2/us_small
!mkdir -p /data/wav/v2/us_small

In [8]:
import shutil

# Copy every selected US recording into the reduced dataset directory.
for file_name in desired_us_records:
    source_file = os.path.join(us_output_dir, file_name)
    destination_file = os.path.join(us_output_dir_small, file_name)
    shutil.copyfile(source_file, destination_file)

In [9]:
# Sanity check: number of entries now present in the reduced US directory.
us_small_entries = os.listdir(us_output_dir_small)
len(us_small_entries)

577