In [2]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (rapport model)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
# Plot the outputs from the Amazon speech-to-text
#     Supports channel identification (input was stereo audio)
#######################################################################################################################


import json
import glob
import numpy as np
import pandas as pd
import soundfile as sf
from matplotlib import pyplot as plt

# Path to STT outputs
# The trailing '/' matters: the JSON file name is appended by plain string concatenation below.
stt_outputs_dir = '/home/ICT2000/jondras/dvra_datasets/mimicry/voice_activity_detection/speech_to_text_amazon/'
mono_audio_dir = '/home/ICT2000/jondras/dvra_datasets/mimicry/audio/audio_separated_16kHz_flac'


def pair_audio_files(filenames):
    """Group a sorted list of audio file paths into consecutive (first, second) pairs.

    Elements 0 and 1 form the first pair, elements 2 and 3 the second, and so on.

    Args:
        filenames: Sequence of audio file paths, already sorted.

    Returns:
        List of (first, second) tuples; empty if `filenames` is empty.

    Raises:
        ValueError: If the number of files is odd, since the pairing would then be
            incomplete (the original index-based loop failed with IndexError at the
            very end of the run in this case).
    """
    if len(filenames) % 2 != 0:
        raise ValueError(
            f'Expected an even number of audio files (two per session), got {len(filenames)}.')
    return list(zip(filenames[0::2], filenames[1::2]))


# Iterate over audio files, two at a time
# (presumably the two files of a pair are the two speakers of one session -- verify against the dataset)
audio_filenames = sorted(glob.glob(f'{mono_audio_dir}/*.flac'))
audio_file_pairs = pair_audio_files(audio_filenames)
for audio_filename_1, audio_filename_2 in audio_file_pairs:

    # NOTE(review): the first 10 characters of e.g. "sessid_01_P1_sid_09.flac" are "sessid_01_",
    # so the resulting name is "sessid_01_.json" (with a trailing underscore) -- confirm that this
    # matches the actual STT output file names.
    stt_output_filename = stt_outputs_dir + audio_filename_1.split('/')[-1][:10] + '.json'
    print(audio_filename_1, audio_filename_2, stt_output_filename)

    # Load both audio files
    audio_signal_1, samplerate_1 = sf.read(audio_filename_1)
    print(f'\tSampling rate: {samplerate_1} Length: {len(audio_signal_1)}')
    audio_signal_2, samplerate_2 = sf.read(audio_filename_2)
    print(f'\tSampling rate: {samplerate_2} Length: {len(audio_signal_2)}')

    # Time axis in seconds, one entry per sample of the first signal.
    # Built from integer sample indices so its length always equals the signal length
    # (a float-step arange needs to be truncated afterwards).
    x_audio_signal = np.arange(len(audio_signal_1)) / samplerate_1

    # Load json outputs from STT and generate binary voice activity for each speaker
    # Speaker tags are 1 and 2 only
    speaker_tags = [1, 2]
    # TODO

    print()

# One STT output file corresponds to one pair of audio files, so report the number of pairs.
print(f'Processed {len(audio_file_pairs)} Amazon speech2text output files.')

/media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_01_P1_sid_09.flac /media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_01_P2_sid_02.flac /media/DataDrive/MimicryDB/speech_to_text_amazon/sessid_01_.json
	Sampling rate: 16000 Length: 9715665
	Sampling rate: 16000 Length: 9715665
/media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_02_P1_sid_09.flac /media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_02_P2_sid_17.flac /media/DataDrive/MimicryDB/speech_to_text_amazon/sessid_02_.json
	Sampling rate: 16000 Length: 15785457
	Sampling rate: 16000 Length: 15785457
/media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_03_P1_sid_17.flac /media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_03_P2_sid_02.flac /media/DataDrive/MimicryDB/speech_to_text_amazon/sessid_03_.json
	Sampling rate: 16000 Length: 7112811
	Sampling rate: 16000 Length: 7112811
/media/DataDrive/MimicryDB/audio_separated_16kHz_flac/sessid_04_P1_sid_12.flac /media/DataDrive/Mimic

KeyboardInterrupt: 

In [1]:
# Plot the outputs from the Google Speech-2-Text
# Speaker identification (input was mono audio)

import json
import glob
import numpy as np
import pandas as pd
import soundfile as sf
from matplotlib import pyplot as plt

# Path to STT outputs
# NOTE(review): the directory name says "amazon" while this cell is titled as plotting Google
# Speech-2-Text outputs and saves into "speaker_diarization_plots_google" -- confirm the directory.
stt_outputs_dir = '/media/DataDrive/MimicryDB/speech_to_text_amazon'
mono_audio_dir = '/media/DataDrive/MimicryDB/audio_separated_16kHz_flac/'


def audio_basename_from_stt_path(stt_output_path):
    """Derive the audio base name from the path of an STT output file.

    The extension is removed (whatever its length) and the first five '_'-separated
    fields of the file name are kept, e.g.
    '/x/sessid_01_P1_sid_09.json' -> 'sessid_01_P1_sid_09'.

    Args:
        stt_output_path: Path of the STT output file, using '/' as separator.

    Returns:
        The audio base name, without directory and without extension.
    """
    file_stem = stt_output_path.split('/')[-1].rsplit('.', 1)[0]
    return '_'.join(file_stem.split('_')[:5])


def mark_voice_activity(voice_activity, time_axis, words, speaker_tag=None):
    """Set the voice activity to 1 for all samples that fall inside the given words.

    For every word, the samples whose time t satisfies startTime <= t <= endTime are set
    to 1 in the array of the corresponding speaker. The arrays are modified in place.

    Args:
        voice_activity: Dict mapping speaker tag -> 1-D array with one entry per sample.
        time_axis: 1-D array of sample times in seconds, sorted in ascending order.
        words: Iterable of dicts with 'startTime' and 'endTime' strings whose last
            character is a unit suffix (e.g. '1.300s'), and a 'speakerTag' entry
            when `speaker_tag` is None.
        speaker_tag: If given, all words are assigned to this speaker; otherwise each
            word's own 'speakerTag' is used.

    Returns:
        The same `voice_activity` dict.

    Raises:
        KeyError: If a word's speaker tag has no entry in `voice_activity`.
    """
    for word in words:
        # Skip "s" at the end of the string
        start_time = float(word['startTime'][:-1])
        end_time = float(word['endTime'][:-1])
        tag = word['speakerTag'] if speaker_tag is None else speaker_tag
        # time_axis is sorted, so the samples with start_time <= t <= end_time form one
        # contiguous slice; a binary search finds it without scanning the whole signal.
        first = np.searchsorted(time_axis, start_time, side='left')
        stop = np.searchsorted(time_axis, end_time, side='right')
        voice_activity[tag][first:stop] = 1
    return voice_activity


# Iterate over json output files and the corresponding audio files
num_processed = 0
for stt_output_file in sorted(glob.glob(f'{stt_outputs_dir}/*.json')):

    audio_basename = audio_basename_from_stt_path(stt_output_file)
    print(audio_basename)

    # Load audio signal
    audio_signal, samplerate = sf.read(f'{mono_audio_dir}/{audio_basename}.flac')
    print(f'\tSampling rate: {samplerate}')
    # Time axis in seconds, one entry per sample
    x_audio_signal = np.arange(len(audio_signal)) / samplerate

    # Load json dump outputs from STT and generate binary voice activity for each speaker
    # Speaker tags are 1 and 2 only
    speaker_tags = [1, 2]
    bin_voice_activity = dict()
    for speaker_tag in speaker_tags:
        bin_voice_activity[speaker_tag] = np.zeros(len(audio_signal), dtype=int)

    with open(stt_output_file, "r") as json_file:
        data = json.load(json_file)

    results = data.get('results', [])
    last_result_words = results[-1]['alternatives'][0].get('words', []) if results else []

    # Check if speaker tags were generated (if yes, take words from the last result only)
    if last_result_words and 'speakerTag' in last_result_words[0]:
        mark_voice_activity(bin_voice_activity, x_audio_signal, last_result_words)

    # If no speaker tags were assigned, take words from all results and assign speaker tag 1
    else:
        for result in results:
            mark_voice_activity(bin_voice_activity, x_audio_signal,
                                result['alternatives'][0].get('words', []), speaker_tag=1)

    # Plot raw audio signal and bin_voice_activity
    fig, ax = plt.subplots(figsize=[25, 6])
    ax.set_title(audio_basename)
    ax.plot(x_audio_signal, audio_signal, 'g-', alpha=0.3, label='audio signal')
    for speaker_tag in speaker_tags:
        # Vertical offset so that the traces of the two speakers do not overlap
        # (speaker 1 spans 0..1, speaker 2 spans -1.5..-0.5)
        ax.plot(x_audio_signal, bin_voice_activity[speaker_tag] - speaker_tag*1.5 + 1.5,
                label=f'bin_voice_activity_{speaker_tag}')
    ax.set_ylim(-1.6, 1.6)
    ax.set_xlabel('Time (s)')
    ax.legend()
    fig.savefig(f'./speaker_diarization_plots_google/{audio_basename}.png')
    # Close the figure; otherwise every figure created in this loop stays in memory
    plt.close(fig)

    num_processed += 1

print(f'Processed {num_processed} Google speech2text output files.')

sessid_01_P1_sid_09
sessid_01_P2_sid_02
sessid_02_P1_sid_09
sessid_02_P2_sid_17
sessid_03_P1_sid_17
sessid_03_P2_sid_02
sessid_04_P1_sid_12
sessid_04_P2_sid_23
sessid_05_P1_sid_12
sessid_05_P2_sid_21
sessid_06_P1_sid_23
sessid_06_P2_sid_21
sessid_07_P1_sid_09
sessid_07_P2_sid_01
sessid_08_P1_sid_09
sessid_08_P2_sid_04
sessid_09_P1_sid_01
sessid_09_P2_sid_04
sessid_10_P1_sid_09
sessid_10_P2_sid_34
sessid_11_P1_sid_09
sessid_11_P2_sid_15
sessid_12_P1_sid_15
sessid_12_P2_sid_11
sessid_13_P1_sid_09
sessid_13_P2_sid_19
sessid_14_P1_sid_19
sessid_14_P2_sid_06
sessid_15_P1_sid_09
sessid_15_P2_sid_16
sessid_16_P1_sid_24
sessid_16_P2_sid_16
sessid_17_P1_sid_09
sessid_17_P2_sid_43
sessid_18_P1_sid_03
sessid_18_P2_sid_43
sessid_19_P1_sid_09
sessid_19_P2_sid_22
sessid_20_P1_sid_09
sessid_20_P2_sid_50
sessid_21_P1_sid_50
sessid_21_P2_sid_22
sessid_22_P1_sid_12
sessid_22_P2_sid_18
sessid_23_P1_sid_12
sessid_23_P2_sid_39
sessid_24_P1_sid_18
sessid_24_P2_sid_39
sessid_25_P1_sid_12
sessid_25_P2_sid_27
