# Split audio and generate transcription

Using the previosly generated transcriptions, this will split our dataset into training and test sets. Then, it will create a unique transcription file for the available sentences and a unique audio file for each sentence.

In [1]:
import json
from pydub import AudioSegment
import os
import re
import shutil
from math import floor
from random import shuffle

In [2]:
transcriptions_directory = './transcriptions'
audio_files = []

for file in os.listdir(transcriptions_directory):
    filename = os.fsdecode(file)
    if (filename.endswith('.json')):
        audio_files.append(filename.replace('.json', ''))

print('There are {} audio files'.format(len(audio_files)))

There are 40 audio files


In [3]:
def min_sec_to_miliseconds (time):
    p = re.compile('\d+')
    numbers = p.findall(time)
    minutes = int(numbers[0])
    seconds = int(numbers[1])
    return (60*minutes+seconds)*1000

In [4]:
def load_sentences (audio_file, path):
    with open(path+audio_file+'.json') as f:
        sentences = json.loads(f.read())
    return sentences

In [5]:
def store_sentence (sentence, transcription_file, audio_file):
    with open(transcription_file, "a") as f:
        for s in sentence['sentences']:
            f.write("<s> {} </s> ({})\n".format(s, audio_file))

In [6]:
def split_audio_trans (audio_file, sentences, inpath, outpath, transcription_file, padding, fileids):

    # Read audio file from mp3
    print(inpath+audio_file)
    readAudio = AudioSegment.from_mp3(inpath+audio_file)
    
    # Set frame rate to 16kHz
    audioFR = readAudio.set_frame_rate(16000)
    audioFR = audioFR.set_channels(1)
    
    for sentence in sentences:
        start_time = int(min_sec_to_miliseconds(sentence['start_time']))
        end_time = int(min_sec_to_miliseconds(sentence['end_time']) + padding*1000)
        fragment = audioFR[start_time:end_time]
        
        # Store audio fragment
        out = fragment.export(outpath + '{}_{}_{}.wav'.format(audio_file, start_time, end_time), format="wav")
        fileids.write('{}_{}_{}\n'.format(audio_file, start_time, end_time))
        
        store_sentence(sentence, transcription_file, '{}_{}_{}'.format(audio_file, start_time, end_time))

In [7]:
def create_folders(audio_file, path):
    try:
        if os.path.isdir(path+audio_file):
            shutil.rmtree(path+audio_file)
        
        os.mkdir(path+audio_file)
        os.mkdir(path+audio_file+'/audios')
        
    except:
        print('Error creating folders for {}'.format(audio_file))

In [8]:
def split_test_train (filenames: list, 
                      test_size: float):
    """
    Divides a given list of file names into train and test set using the desired proportion for the test. It shuffles
    the input to avoid patterns in input order.
    
    :param filenames: list of the files that is going to be splitted
    :param test_size: proportion between 0 and 1 that will be used for test set
    
    :return train, test: lists containing filenames for training and testing respectively
    """
    try:
        shuffle(filenames)
        train_len = floor(len(filenames)*(1-test_size))

        train = filenames[:train_len]
        test = filenames[train_len:]   

        return train, test
    
    except:
        print("Error dividing dataset!")
        return None, None

In [11]:
training_audio, test_audio = split_test_train(audio_files, 0.2)

In [12]:
if os.path.isfile('/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/trans.transcription'):
    os.remove('/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/trans.transcription')

with open('/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele.fileids', 'w') as f:
    for audio_file in training_audio:
        print('Processing [{}/{}]: {}'.format(audio_files.index(audio_file), len(training_audio), audio_file))
        padding = 0

        if ("A2" in audio_file):
            padding=1
        else:
            padding=0.5

        sentences = load_sentences(audio_file, './transcriptions/')
        split_audio_trans(audio_file, sentences, './audios/', '/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/', '/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele_raw.transcription', padding, f)

Processing [0/32]: KORWB1.mp3
./audios/KORWB1.mp3
Processing [1/32]: POLMB1.mp3
./audios/POLMB1.mp3
Processing [2/32]: DUTMA2.mp3
./audios/DUTMA2.mp3
Processing [3/32]: HUNWA2.mp3
./audios/HUNWA2.mp3
Processing [4/32]: CHIWA2_2.mp3
./audios/CHIWA2_2.mp3
Processing [5/32]: DUTWA2_2.mp3
./audios/DUTWA2_2.mp3
Processing [6/32]: FINWA2.mp3
./audios/FINWA2.mp3
Processing [7/32]: FREMA2.mp3
./audios/FREMA2.mp3
Processing [8/32]: ITAMA2.mp3
./audios/ITAMA2.mp3
Processing [9/32]: PORMA2.mp3
./audios/PORMA2.mp3
Processing [10/32]: POLWB1.mp3
./audios/POLWB1.mp3
Processing [11/32]: POLMA2_2.mp3
./audios/POLMA2_2.mp3
Processing [12/32]: ITAWA2.mp3
./audios/ITAWA2.mp3
Processing [13/32]: ITAMB1.mp3
./audios/ITAMB1.mp3
Processing [14/32]: CHIMB1.mp3
./audios/CHIMB1.mp3
Processing [15/32]: FREMB1.mp3
./audios/FREMB1.mp3
Processing [16/32]: FREWB1.mp3
./audios/FREWB1.mp3
Processing [17/32]: PORWA2_2.mp3
./audios/PORWA2_2.mp3
Processing [18/32]: ITAWB1.mp3
./audios/ITAWB1.mp3
Processing [19/32]: GERWA

In [13]:
fname = "/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele_raw.transcription"
count = 0
with open(fname, 'r') as f:
    for line in f:
        count += 1
print("Total number of lines is:", count)

Total number of lines is: 2943


In [14]:
test_audio

['ENGWB1_2.mp3',
 'JAPWB1_3.mp3',
 'PORWA2_1.mp3',
 'TURWB1.mp3',
 'ENGWA2.mp3',
 'ENGWB1_1.mp3',
 'JAPWA2.mp3',
 'CHIWB1.mp3']

In [17]:
with open('/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele.fileids', 'w') as f:
    for audio_file in test_audio:
        print('Processing [{}/{}]: {}'.format(audio_files.index(audio_file), len(test_audio), audio_file))
        padding = 0

        if ("A2" in audio_file):
            padding=1
        else:
            padding=0.5

        sentences = load_sentences(audio_file, './transcriptions/')
        split_audio_trans(audio_file, sentences, './audios/', '/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/test/', '/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele_raw_test.transcription', padding, f)

Processing [32/8]: ENGWB1_2.mp3
./audios/ENGWB1_2.mp3
Processing [33/8]: JAPWB1_3.mp3
./audios/JAPWB1_3.mp3
Processing [34/8]: PORWA2_1.mp3
./audios/PORWA2_1.mp3
Processing [35/8]: TURWB1.mp3
./audios/TURWB1.mp3
Processing [36/8]: ENGWA2.mp3
./audios/ENGWA2.mp3
Processing [37/8]: ENGWB1_1.mp3
./audios/ENGWB1_1.mp3
Processing [38/8]: JAPWA2.mp3
./audios/JAPWA2.mp3
Processing [39/8]: CHIWB1.mp3
./audios/CHIWB1.mp3
