In [6]:
import srt
import json
import datetime
import string
import speechace
import textprocessing
import re
from pydub import AudioSegment
import matplotlib.pyplot as plt
import numpy as np
import jupyternotify
import re
import codecs
import glob
# Register jupyternotify's magics with the running IPython shell so that the
# %%notify cell magic used by the processing cell further down is available.
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

<IPython.core.display.Javascript object>

In [7]:
def vtt_to_srt(input, output):
    """Convert a WebVTT caption file into an SRT-style subtitle file.

    Header/metadata lines (``WEBVTT``, ``Kind:...``, ``Language:...``) and any
    line mentioning ``Translator:`` or ``Reviewer:`` are dropped.  After every
    blank line a counter line is inserted (starting at 0) so that the cue that
    follows carries an index, and on cue timing lines the ``.`` between digit
    groups is replaced by ``,``.

    Args:
        input: path of the UTF-8 encoded .vtt file to read.
        output: path of the UTF-8 encoded .srt file to write (overwritten).
    """
    with codecs.open(input, 'r', 'utf-8') as f:
        lines = f.readlines()

    # Drop trailing blank lines only.  A blank line at the very end would
    # otherwise leave a dangling cue index with no cue after it, while a file
    # that ends directly after its last caption must keep that caption.
    while lines and lines[-1].rstrip('\r\n') == '':
        lines.pop()

    outlines = []
    index = 0
    for line in lines:
        # Compare without the line terminator so CRLF files behave like LF.
        bare = line.rstrip('\r\n')
        if bare == 'WEBVTT':
            continue
        if line.startswith('Kind:') or line.startswith('Language:'):
            continue
        if 'Translator:' in line or 'Reviewer:' in line:
            continue
        if '-->' in line:
            # Only timing lines get the '.' -> ',' rewrite; caption text such
            # as "3.14" must not be turned into "3,14".
            line = re.sub(r'([\d]+)\.([\d]+)', r'\1,\2', line)
        outlines.append(line)
        if bare == '':
            # A blank line closes the header or a cue: number the next cue.
            outlines.append(str(index) + '\n')
            index += 1

    # The first collected line is expected to be the blank separator that
    # follows the header block, so it is not written out.
    with codecs.open(output, 'w', 'utf-8') as outfile:
        for line in outlines[1:]:
            outfile.write(line)

In [8]:
def fix_srt_overlap(subtitles):
    """Remove overlap between consecutive subtitles, modifying them in place.

    The first two subtitles act as a probe: if the first one ends after the
    second one starts, every subtitle except the last gets its ``end`` set to
    the ``start`` of the subtitle that follows it.  If the probe pair does not
    overlap, nothing is changed.

    Args:
        subtitles: list of objects with comparable ``start`` / ``end``
            attributes.

    Returns:
        The same list object that was passed in.
    """
    # With fewer than two subtitles there is no pair to probe; indexing
    # subtitles[1] would raise IndexError.
    if len(subtitles) < 2:
        return subtitles
    if subtitles[0].end > subtitles[1].start:
        for current, following in zip(subtitles, subtitles[1:]):
            current.end = following.start
    return subtitles

def split_subtitles(subtitles, target_length = 30):
    """Choose the subtitle indices at which to cut the talk into chunks.

    Subtitle durations (``end - start``, in seconds) are accumulated; when the
    running total reaches ``target_length`` at subtitle ``i``, ``i`` is
    recorded as a split point and the total restarts from zero.  Consumers
    slice ``subtitles[previous:i]``, so subtitle ``i`` itself opens the next
    chunk (its duration is not carried over into the new running total).

    Args:
        subtitles: list of objects whose ``start`` / ``end`` difference
            supports ``total_seconds()``.
        target_length: accumulated duration in seconds that triggers a split.

    Returns:
        Increasing list of exclusive chunk end indices; the last entry is
        ``len(subtitles)``.  An empty input yields an empty list (no chunks).
    """
    indices_to_split = []
    if not subtitles:
        # No subtitles means no chunks; returning [0] would describe one
        # empty chunk that downstream code cannot slice.
        return indices_to_split

    running_length = 0
    for i, subtitle in enumerate(subtitles):
        running_length += (subtitle.end - subtitle.start).total_seconds()
        # Never split at index 0: that would create an empty first chunk,
        # whose "last subtitle" lookup (index -1) wraps to the end of the talk.
        if running_length >= target_length and i > 0:
            indices_to_split.append(i)
            running_length = 0
    # add final index so the remaining subtitles form the last chunk
    indices_to_split.append(len(subtitles))
    return indices_to_split

def prepare_subtitle_chunks(subtitles, indices_to_split):
    """Build one cleaned transcript string per chunk.

    For each exclusive end index in ``indices_to_split`` the contents of the
    subtitles since the previous end index are joined with spaces (newlines
    inside a subtitle become spaces), ``<...>`` markup is removed, the text is
    passed through ``textprocessing.replace_numbers`` and every
    whitespace-separated word has leading/trailing punctuation stripped.

    Returns:
        List of transcript strings, one per entry of ``indices_to_split``.
    """
    chunk_texts = []
    chunk_begin = 0
    for chunk_end in indices_to_split:
        # collect the subtitle texts of this chunk, flattening line breaks
        pieces = []
        for sub in subtitles[chunk_begin:chunk_end]:
            pieces.append(str(sub.content.encode('utf-8').replace('\n',' ')))
        joined = ' '.join(pieces)

        # strip markup such as HTML colour tags
        without_tags = re.sub('<[^<]+?>', '', joined)
        # spell out numbers
        spelled_out = textprocessing.replace_numbers(without_tags)
        # trim punctuation from both ends of every word
        trimmed_words = [token.strip(string.punctuation) for token in spelled_out.split()]

        chunk_texts.append(' '.join(trimmed_words))
        chunk_begin = chunk_end
    return chunk_texts

def prepare_audio_chunks(audio, indices_to_split, subtitles=None, output_dir=None):
    """Cut the audio into one WAV file per subtitle chunk.

    Chunk ``k`` spans from the start of its first subtitle to the end of its
    last subtitle (both converted to milliseconds) and is exported as
    ``<output_dir>audio_chunk<k>.wav``.

    Args:
        audio: object that can be sliced by millisecond positions and whose
            slices offer ``export(path, format=...)``.
        indices_to_split: exclusive chunk end indices into ``subtitles``.
        subtitles: subtitles the indices refer to.  When omitted, the
            notebook-level ``subtitles`` variable is used, which is what this
            function always did before the parameter existed.
        output_dir: directory prefix (including the trailing separator) for
            the exported files.  When omitted, the notebook-level ``data_dir``
            is used.

    Returns:
        List of the exported file paths, in chunk order.

    Raises:
        KeyError: if an argument is omitted and the matching notebook-level
            variable does not exist.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working; passing the values explicitly avoids the hidden dependency.
    if subtitles is None:
        subtitles = globals()['subtitles']
    if output_dir is None:
        output_dir = globals()['data_dir']

    audio_for_speechace = []
    start_chunk = 0
    for index, end_chunk in enumerate(indices_to_split):
        # subtitle times are timedeltas; the audio is sliced in milliseconds
        start = 1000*subtitles[start_chunk].start.total_seconds()
        end = 1000*subtitles[end_chunk-1].end.total_seconds()
        chunk_path = output_dir + "audio_chunk{0}.wav".format(index)
        audio[start:end].export(chunk_path, format="wav")
        audio_for_speechace.append(chunk_path)
        start_chunk = end_chunk
    return audio_for_speechace


def activate_speechace(subs_for_speechace, audio_for_speechace, max_calls = 999):
    """Send every (audio chunk, transcript) pair to speechace and collect words.

    Each chunk is submitted with ``speechace.query``.  When the service
    rejects a chunk with ``error_unknown_words``, the reported words are
    removed from the transcript and the chunk is submitted once more.  Chunks
    that still do not succeed are recorded as bad.  Progress and timing are
    printed for every call.

    Args:
        subs_for_speechace: transcript strings, one per chunk.
        audio_for_speechace: audio file paths, paired positionally with the
            transcripts.
        max_calls: maximum number of chunks to process.

    Returns:
        Tuple ``(word_list, bad_chunks, unknown_word_list)``: the concatenated
        results of ``speechace.create_word_list`` for successful chunks, the
        indices of chunks that failed, and the unknown words reported by the
        service.
    """
    # list() so that len() works whether zip returns a list or an iterator
    data_for_speechace = list(zip(audio_for_speechace, subs_for_speechace))
    word_list = []
    bad_chunks = []
    unknown_word_list = []
    print(len(data_for_speechace))
    start_time = datetime.datetime.now()
    last_time = start_time
    for i, (audio_path, text) in enumerate(data_for_speechace):
        # check the budget first so no "call #" line is printed for a call
        # that is never made
        if i >= max_calls:
            break
        print("call #" + str(i))
        speechace_data = speechace.query(audio_path, text)
        if 'status' not in speechace_data:
            print(speechace_data)
            print("NO GO #1")
            bad_chunks.append(i)
        elif speechace_data['status'] == 'success':
            word_list.extend(speechace.create_word_list(speechace_data))
        elif speechace_data.get('short_message') == 'error_unknown_words':
            print("found unknown_words!")
            detail = speechace_data['detail_message']
            if not isinstance(detail, str):
                # Python 2: unicode from the response -> byte string, to match
                # the byte-string transcript
                detail = detail.encode('utf-8')
            # the words are the comma-separated list after the last ':'
            unknown_words = [w.strip() for w in detail.split(':')[-1].split(',')]
            unknown_words = [w for w in unknown_words if w]
            unknown_word_list.extend(unknown_words)
            # Drop whole words only.  A substring replace of the unstripped
            # " word" misses a word at the very start of the transcript and
            # corrupts longer words that merely contain it.
            rejected = set(w.lower() for w in unknown_words)
            new_text = ' '.join(token for token in text.split()
                                if token.lower() not in rejected)
            speechace_data = speechace.query(audio_path, new_text)
            if 'status' not in speechace_data:
                print("NO GO")
                bad_chunks.append(i)
            elif speechace_data['status'] == 'success':
                word_list.extend(speechace.create_word_list(speechace_data))
            else:
                print(speechace_data.get('short_message'))
                print("NO GO")
                bad_chunks.append(i)
        else:
            print(speechace_data.get('short_message'))
            print("NO GO")
            bad_chunks.append(i)

        # Timing is reported for failed chunks too, so that the next "took"
        # figure never includes the duration of an earlier call.
        current_time = datetime.datetime.now()
        print("took " + str((current_time-last_time).total_seconds()))
        print("total time is " + str((current_time-start_time).total_seconds()))
        print("")
        last_time = current_time
    return word_list, bad_chunks, unknown_word_list


In [9]:
# data_dir receives the intermediate audio chunks; input_dir holds the talks
# (<id>.en.vtt captions, <id>.m4a audio) and also receives the result files.
data_dir = '/Users/lantonel/SilverTongue/data/'
input_dir = '/Users/lantonel/SilverTongue/training_dataset/'

# A talk ID is the file name up to its first '.'.
all_inputs = [path.split('/')[-1].split('.')[0] for path in glob.glob(input_dir+"*vtt")]
print("%d total talks" % len(all_inputs))

# A talk counts as done once a .json file carrying its ID exists in input_dir.
done_list = [path.split('/')[-1].split('.')[0] for path in glob.glob(input_dir+"*.json")]
done_ids = set(done_list)  # set membership instead of scanning the list per talk

# The loop variable is deliberately not called `input`: on Python 2 a list
# comprehension variable leaks into the notebook namespace and would shadow
# the builtin of that name.
inputs = [talk_id for talk_id in all_inputs if talk_id not in done_ids]

# To (re)process a single talk by hand, override the list here, e.g.
# inputs = ['NqOjj1FCcVY']
print("%d to process" % len(inputs))

print("%d already processed!!" % (len(all_inputs) - len(inputs)))

355 total talks
0 to process
355 already processed!!


In [5]:
%%notify
# Cell magic registered from jupyternotify in the imports cell (presumably it
# notifies when this long-running cell finishes -- confirm in jupyternotify's
# documentation).  It has to stay on the first line of the cell.
#
# For every talk still to be processed: convert its captions to SRT, cut the
# captions and the audio into matching chunks, score the chunks through
# activate_speechace, write the results as JSON and save two diagnostic plots.
for input_name in inputs:
    print "processing input with ID", input_name
    audio_file = input_dir + input_name + ".m4a"
    vtt_file = input_dir + input_name + ".en.vtt"
    srt_file = input_dir + input_name + ".en.srt"

    # fix subtitle file and convert to srt (written next to the .vtt file)
    vtt_to_srt(vtt_file, srt_file)
    
    # process subtitle file and split into chunks
    with codecs.open(srt_file,'r', 'utf-8') as myfile:
        data = myfile.read()
    subtitle_generator = srt.parse(data)
    # NOTE: `subtitles` has to remain a notebook-level name: the call to
    # prepare_audio_chunks below passes no subtitles, so that function picks
    # them up from the notebook namespace.
    subtitles = list(subtitle_generator)
    subtitles = fix_srt_overlap(subtitles)
    indices_to_split = split_subtitles(subtitles)
    subs_for_speechace = prepare_subtitle_chunks(subtitles, indices_to_split)


    # process audio file and split into chunks
    # (first element of split_to_mono() only, frame rate set to 16000)
    audio = AudioSegment.from_file(audio_file).split_to_mono()[0].set_frame_rate(16000)
    # len(audio) is treated as milliseconds here (divided by 1000 for seconds)
    print "file is ",len(audio)/1000.,"seconds long (",len(audio)/60000.,") minutes"
    audio_for_speechace = prepare_audio_chunks(audio, indices_to_split)
    
    word_list, bad_chunks, unknown_word_list = activate_speechace(subs_for_speechace, 
                                                                  audio_for_speechace, 
                                                                  max_calls = 999)
    print "unknown_word_list:",unknown_word_list
    # <id>.json is also the marker the work-list cell globs for to decide that
    # a talk is already processed.
    # NOTE(review): bad_chunks is returned above but never written or printed
    # in this cell, so failed chunks leave no trace in the saved results.
    with open(input_dir+input_name+'.json', mode='w') as outputFile:
        json.dump(word_list, outputFile, indent=4)
    with open(input_dir+input_name+'_unknown_words.json', mode='w') as outputFile:
        json.dump(unknown_word_list, outputFile, indent=4)
    # one "quality" value per scored word
    word_scores = [word["quality"] for word in word_list]

    
    # Histogram of the word scores: 100 bins over the range 0-100.
    plot = plt.hist(word_scores, 
                    100, 
                    range=[0, 100], 
                    histtype='step'
                   )

    axes = plt.gca()
    axes.set_xlabel("Word Score")
    # NOTE(review): the label says "unit norm", but plt.hist above is called
    # without any normalisation argument -- confirm which one is intended.
    axes.set_ylabel("A.U. (unit norm)")
    axes.set_xlim(0,100)
    # The PDFs are saved under a bare file name, i.e. relative to the current
    # working directory rather than to data_dir or input_dir.
    plt.savefig('word_scores_'+input_name+'.pdf', transparent=True,bbox_inches='tight')
    # clear the current figure so the next plot starts empty
    plt.clf()
    indices = [i for i in range(len(word_list))]

    # Scatter plot of word score against the word's position in word_list.
    plot = plt.scatter(indices,word_scores)
    plt.savefig('word_scores_vs_index_'+input_name+'.pdf', transparent=True,bbox_inches='tight')
    plt.clf()


processing input with ID o8NPllzkFhE
file is  1290.027 seconds long ( 21.50045 ) minutes
39
call #0
took 13.894299
total time is 13.894299

call #1
found unknown_words!
took 21.182864
total time is 35.077163

call #2
took 14.43843
total time is 49.515593

call #3
took 11.097227
total time is 60.61282

call #4
found unknown_words!
took 13.943384
total time is 74.556204

call #5
took 11.795466
total time is 86.35167

call #6
took 13.094896
total time is 99.446566

call #7
took 12.641454
total time is 112.08802

call #8
found unknown_words!
took 16.521135
total time is 128.609155

call #9
took 12.93163
total time is 141.540785

call #10
took 13.13819
total time is 154.678975

call #11
found unknown_words!
took 15.322184
total time is 170.001159

call #12
took 12.2672
total time is 182.268359

call #13
found unknown_words!
took 15.189717
total time is 197.458076

call #14
found unknown_words!
took 15.054787
total time is 212.512863

call #15
found unknown_words!
error_unknown_words
NO GO
t

took 14.977582
total time is 54.560128

call #4
took 12.710638
total time is 67.270766

call #5
took 14.399137
total time is 81.669903

call #6
took 14.202445
total time is 95.872348

call #7
took 14.165459
total time is 110.037807

call #8
took 14.203114
total time is 124.240921

call #9
took 14.641836
total time is 138.882757

call #10
took 6.811929
total time is 145.694686

unknown_word_list: []
processing input with ID TvcNw4F0Y4Y
file is  449.771 seconds long ( 7.49618333333 ) minutes
13
call #0
took 12.099443
total time is 12.099443

call #1
took 13.107484
total time is 25.206927

call #2
took 13.55629
total time is 38.763217

call #3
took 14.29309
total time is 53.056307

call #4
took 12.525386
total time is 65.581693

call #5
took 13.482778
total time is 79.064471

call #6
took 13.637647
total time is 92.702118

call #7
took 13.249725
total time is 105.951843

call #8
took 12.649332
total time is 118.601175

call #9
took 13.529238
total time is 132.130413

call #10
found unknow

<IPython.core.display.Javascript object>