# Preprocessing for a simple speech-to-text (STT) module

In [5]:
import os, csv
import librosa
import numpy as np



## First we need to get the list of files

In [6]:
# Root directory that is scanned for data sub-directories (relative to the notebook's working directory).
data_directory = "./data"

def the_wavs(dir_in):
    """Describe every sub-directory found (recursively) under ``dir_in``.

    Parameters
    ----------
    dir_in : str
        Root directory to walk.

    Returns
    -------
    list
        One ``[directory_path, csv_file_name, wav_file_names]`` entry per
        sub-directory that contains at least one ``.csv`` file.
        ``csv_file_name`` is the first ``.csv`` name reported for that
        directory and ``wav_file_names`` is the list of ``.wav`` names in it
        (bare file names, not joined with the directory).
    """
    list_dirs = []
    for (dirpath, dirnames, _) in os.walk(dir_in):
        for direc in dirnames:
            direc = os.path.join(dirpath, direc)
            list_text = list_paths(direc, extension='.csv')
            if not list_text:
                # No transcript file here (e.g. an intermediate or hidden
                # directory): skip it instead of failing on list_text[0].
                continue
            list_dirs.append([direc, list_text[0], list_paths(direc)])

    return list_dirs
            

def list_paths(dirpath, extension='.wav'):
    """List the entries of ``dirpath`` whose name ends with ``extension``.

    The returned list holds the bare names as reported by ``os.listdir``
    (they are not joined with ``dirpath``), in the order ``os.listdir``
    yields them.
    """
    return [entry for entry in os.listdir(dirpath) if entry.endswith(extension)]

In [7]:
# file_infos is the index of the data to process. Each entry describes one
# sub-directory of data_directory: [directory path, CSV file name, list of .wav file names]
file_infos = the_wavs(data_directory)

In [8]:
# Peek at the first entry: [directory path, CSV file name, list of .wav file names]
file_infos[0]

['./data/KR_M_YDH',
 'ko_narration_2400_3.0_자체수정.csv',
 ['1667.wav',
  '1046.wav',
  '1437.wav',
  '2216.wav',
  '506.wav',
  '1421.wav',
  '2170.wav',
  '2291.wav',
  '345.wav',
  '507.wav',
  '1259.wav',
  '1376.wav',
  '903.wav',
  '2143.wav',
  '2398.wav',
  '2373.wav',
  '1765.wav',
  '643.wav',
  '498.wav',
  '572.wav',
  '694.wav',
  '1183.wav',
  '749.wav',
  '1296.wav',
  '867.wav',
  '1260.wav',
  '473.wav',
  '1343.wav',
  '1118.wav',
  '2200.wav',
  '1174.wav',
  '1829.wav',
  '1428.wav',
  '2380.wav',
  '1281.wav',
  '2354.wav',
  '1129.wav',
  '1642.wav',
  '2069.wav',
  '629.wav',
  '164.wav',
  '1675.wav',
  '1368.wav',
  '1783.wav',
  '2107.wav',
  '1050.wav',
  '2351.wav',
  '2310.wav',
  '1354.wav',
  '1686.wav',
  '1575.wav',
  '1955.wav',
  '1508.wav',
  '884.wav',
  '941.wav',
  '2321.wav',
  '2386.wav',
  '1175.wav',
  '2366.wav',
  '2007.wav',
  '1153.wav',
  '1546.wav',
  '1199.wav',
  '2255.wav',
  '831.wav',
  '2129.wav',
  '2081.wav',
  '1556.wav',
  '2297.w

## Match each sentence to the relevant file

In [9]:
def csv_to_dict(csv_path, encoding="utf-8-sig"):
    """Read a two-column CSV file into a ``{first column: second column}`` dict.

    Parameters
    ----------
    csv_path : str
        Path of the comma-separated file to read.
    encoding : str, optional
        Text encoding of the file. The default ``"utf-8-sig"`` reads UTF-8
        with or without a byte-order mark, so the result no longer depends on
        the platform's locale encoding and a BOM cannot end up inside the
        first key.

    Returns
    -------
    dict
        Maps ``row[0]`` to ``row[1]`` for every row that has at least two
        columns. Blank lines and one-column rows are skipped. When a key
        appears more than once, the last row wins.
    """
    dict_out = {}
    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain line breaks.
    with open(csv_path, encoding=encoding, newline="") as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if len(row) < 2:
                continue
            dict_out[row[0]] = row[1]
    return dict_out

def match_sentence(file_info, extension='.wav'):
    """Pair each audio file of one directory with its transcript sentence.

    Parameters
    ----------
    file_info : sequence
        One entry of ``file_infos``:
        ``[directory path, CSV file name, list of audio file names]``.
    extension : str, optional
        Suffix stripped from each audio file name to obtain the key that is
        looked up in the CSV (default ``'.wav'``).

    Returns
    -------
    list
        ``[audio file path, sentence]`` pairs, in the order of the audio file
        names. Files whose key is missing from the CSV are left out.
    """
    dir_path = file_info[0]
    text_dict = csv_to_dict(os.path.join(dir_path, file_info[1]))

    list_out = []
    for wav_name in file_info[2]:
        # Strip only a trailing extension; an occurrence in the middle of the
        # name is part of the utterance key and must be preserved.
        if extension and wav_name.endswith(extension):
            utterance = wav_name[:-len(extension)]
        else:
            utterance = wav_name
        if utterance in text_dict:
            list_out.append([os.path.join(dir_path, wav_name), text_dict[utterance]])
    return list_out

def data_dict(file_infos):
    """Apply ``match_sentence`` to every entry of ``file_infos``.

    Returns a list with one ``match_sentence`` result per entry, in the
    same order as the input.
    """
    matched = []
    for info in file_infos:
        matched.append(match_sentence(info))
    return matched

In [10]:
# One list of [wav path, sentence] pairs per entry of file_infos.
paths_and_sentences = data_dict(file_infos)
print(len(paths_and_sentences))  # number of entries (one per directory in file_infos)
print(len(paths_and_sentences[0]))  # number of matched pairs for the first directory
print(paths_and_sentences[0][0])  # first [wav path, sentence] pair
print(paths_and_sentences[0][0][0])  # wav path of that first pair

15
2394
['./data/KR_M_YDH/1667.wav', '옛날 아름다운 시골 마을에 어릴 때부터 서로 위하고 아끼는 형제가 있었답니다.']
./data/KR_M_YDH/1667.wav


## Read the wav files

In [11]:
def wav_list_from_data_dict_result(data_dict_result,output_file='file_and_wav_preprocessed.txt'):
    """Load the listed wav files and write them, zero-padded, to a text file.

    Parameters
    ----------
    data_dict_result : iterable
        Result of ``data_dict``: one list per speaker directory, each holding
        ``[wav path, sentence]`` pairs. Only the wav path is used here.
    output_file : str, optional
        Path of the text file to (over)write.

    Returns
    -------
    None

    Notes
    -----
    The file starts with the header line ``"file_name, wav_vector "``. Every
    following line is ``<wav path> ;<list of samples>``, where each sample list
    is right-padded with ``0.0`` up to the length of the longest loaded
    signal. Paths that do not point to an existing file are skipped.
    All signals are held in memory until the longest length is known.
    """
    loaded = []  # (wav path, 1-D sample array) for every file that exists
    len_max = 0
    for speaker in data_dict_result:
        for speaker_utterance in speaker:
            wav_file = speaker_utterance[0]
            if not os.path.isfile(wav_file):
                continue
            # sr=None asks librosa to keep the file's own sampling rate.
            wav, _ = librosa.load(wav_file, sr=None)
            loaded.append((wav_file, wav))
            len_max = max(len_max, len(wav))

    # The context manager closes the file even if a write fails.
    with open(output_file, "w", encoding="utf-8") as out:
        out.write("file_name, wav_vector \n")
        for wav_file, wav in loaded:
            padded = np.pad(wav, (0, len_max - len(wav)), mode="constant")
            out.write(wav_file + " ;")
            # write() only accepts str, so serialise the sample list explicitly.
            out.write(str(padded.tolist()))
            out.write("\n")
        

In [None]:
# Load the wav files listed in paths_and_sentences and write them to the
# default output file ('file_and_wav_preprocessed.txt').
wav_list_from_data_dict_result(paths_and_sentences)

## Encode the text
Here, we will only encode Korean text, as our potential customers are expected to be Korean. Korean Hangul is a featural writing system in which each character of a sentence is a combination of consonants and vowels. This requires the programmer to first separate every character into its consonant and vowel components before applying the usual encoding steps used for an alphabetic writing system.

In [3]:
# Hangul jamo tables.
# Order matters: the tables are looked up by position.

# Syllable-initial consonants (19 entries).
onsets = tuple("ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ")

# Same as onsets but without "ㅇ" (18 entries).
consonants = tuple("ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅈㅉㅊㅋㅌㅍㅎ")

# Vowels (21 entries).
nuclei = tuple("ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ")

# Syllable-final consonants (28 entries); index 0 is the empty string, i.e. "no coda".
codas = ("",) + tuple("ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ")

# Every distinct symbol from the three positional tables (includes the empty string from codas).
extended_hanguls = set(onsets) | set(nuclei) | set(codas)

In [2]:
def is_Hangul(a_char):
    """Tell whether the input starts with a precomposed Hangul syllable.

    Only the first character of ``a_char`` is examined; it qualifies when its
    code point lies in the range U+AC00..U+D7A3. An empty string yields
    ``False`` instead of raising from ``ord``.
    """
    first = a_char[:1]
    if not first:
        return False
    return 0xAC00 <= ord(first) <= 0xD7A3

def Hangul_decomposition(a_hangul):
    """Split one Hangul syllable into its onset, nucleus and (optional) coda.

    The result is the concatenation of the matching entries of ``onsets``,
    ``nuclei`` and ``codas``. A syllable without a coda yields just
    onset + nucleus (nothing is appended in its place). Input that
    ``is_Hangul`` rejects is returned unchanged.

    Intended for a single character: ``ord`` is applied to the whole input.
    """
    if is_Hangul(a_hangul):
        # Offset inside the syllable block, laid out as
        # (onset * 21 + nucleus) * 28 + coda.
        offset = ord(a_hangul) - 0xAC00
        onset_idx, remainder = divmod(offset, 21 * 28)
        nucleus_idx, coda_idx = divmod(remainder, 28)
        jamo = onsets[onset_idx] + nuclei[nucleus_idx]
        return jamo if coda_idx == 0 else jamo + codas[coda_idx]
    return a_hangul