In [None]:
# Parameter settings and library importation

# Used to move files around and other file system related tasks
import os
import shutil

# Used to record audio streams
import pyaudio
import wave
import datetime

# Used to process audio data
import contextlib
from pydub import AudioSegment

# Used to start a MATLAB engine and call MATLAB functions from Python
import matlab.engine

import speech_recognition as sr


In [None]:
# Used to produce a less messy formatting for the current time
def replace_special_chars(z, special_chars, new_char):
    """Return a copy of ``z`` with every character of ``special_chars`` replaced.

    Args:
        z: String to clean up.
        special_chars: Iterable of single characters to look for.
        new_char: Replacement for each match; it is handed to
            ``str.translate`` unchanged.

    Returns:
        The translated string.
    """
    # One table entry per code point, all mapping to the same replacement.
    translation_table = dict.fromkeys(map(ord, special_chars), new_char)
    return z.translate(translation_table)

In [None]:
def slice_into_smaller_windows(fname, window_size):
    """Split a WAV file into consecutive fixed-length clips, then delete it.

    Clips are written next to the input as ``<stem>-0.wav``, ``<stem>-1.wav``,
    and so on.  Only whole windows are exported; a trailing remainder shorter
    than ``window_size`` is not written.  The source file is removed only after
    at least one clip has been exported, so a recording shorter than one
    window is left in place instead of being deleted with nothing to replace
    it.

    Args:
        fname: Path of the WAV file to split.
        window_size: Clip length in seconds; must be positive.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        # A zero window divides by zero below; a negative one would export
        # nothing and still delete the recording.
        raise ValueError('window_size must be positive, got ' + str(window_size))

    # pydub slices are indexed in milliseconds.
    window_ms = window_size * 1000

    with contextlib.closing(wave.open(fname, 'r')) as f:
        duration = f.getnframes() / float(f.getframerate())

    print(duration)

    n_windows = int(duration / window_size)
    if n_windows == 0:
        # Shorter than one window: nothing to export, keep the original.
        return

    audio = AudioSegment.from_wav(fname)
    # splitext() instead of chopping four characters, so the stem is right
    # whatever the length of the extension.
    stem = os.path.splitext(fname)[0]

    exported = 0
    for i in range(n_windows):
        begin = i * window_ms
        end = (i + 1) * window_ms
        if end > duration * 1000:
            # Guards against float rounding in the window count.
            break
        audio[begin:end].export(stem + '-' + str(i) + '.wav', format='wav')
        exported += 1

    if exported:
        os.remove(fname)

In [None]:
# Example run: split the test recording into 5-second clips.
# slice_into_smaller_windows() deletes its input after slicing, so the call is
# guarded to keep this cell from raising when the notebook is run again.
fname = 'C://Users//yg9ca//Desktop//PCR_pipeline//test_audios//test_wav.wav'
if os.path.isfile(fname):
    slice_into_smaller_windows(fname, 5)
else:
    print('Skipping slicing, file not found: ' + fname)

In [None]:
# Microphone-capture settings. Their names match the parameters of
# record_single_session(), which is where they are meant to be passed.

# Number of frames per buffer / per stream.read() call.
CHUNK = 1024
# PyAudio sample-format constant.
FORMAT = pyaudio.paInt16
# Number of input channels.
CHANNELS = 1
# Sampling rate, used for both the input stream and the WAV header.
RATE = 44100
# Length of one recording session in seconds.
RECORD_SECONDS = 10

# Used to receive a single session of audio input from the microphone
def record_single_session(CHUNK, FORMAT, CHANNELS, RATE, RECORD_SECONDS):
    """Record one fixed-length clip from the microphone into ``./Recordings``.

    The file is named after the time taken just before capture starts,
    formatted as ``YYYY-MM-DD-HH-MM-SS.wav``.

    Args:
        CHUNK: Frames per buffer and per ``stream.read()`` call.
        FORMAT: PyAudio sample-format constant.
        CHANNELS: Number of input channels.
        RATE: Sampling rate of the stream and of the written file.
        RECORD_SECONDS: Length of the recording in seconds.

    Returns:
        Path of the written file, ``'.//Recordings//' + <file name>``.
    """
    output_dir = './/Recordings//'
    # Create the target folder up front so a missing folder cannot lose a
    # finished recording at the very end.
    os.makedirs(output_dir, exist_ok=True)

    p = pyaudio.PyAudio()
    try:
        # Queried while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)

        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("Recording in process...")

            CURRENT_TIME = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            WAVE_OUTPUT_FILENAME = replace_special_chars(CURRENT_TIME, ': ', '-') + '.wav'

            frames = [stream.read(CHUNK)
                      for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS))]

            print("Recording finished...")
        finally:
            # Release the device even when a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Write straight into the target folder instead of writing to the working
    # directory and renaming afterwards.
    output_path = output_dir + WAVE_OUTPUT_FILENAME
    with contextlib.closing(wave.open(output_path, 'wb')) as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    print("Generated audio file " + WAVE_OUTPUT_FILENAME)

    return output_path

In [None]:
# Arguments for the rootDirName / nMixtures parameters of speakerID(); the
# speakerID() body in this notebook does not read either of them.
# NOTE(review): presumably the working directory and the number of mixture
# components of the speaker model on the MATLAB side -- TODO confirm.
rootDirName = '.'
nMixtures = 1024

# Get the result for speaker ID
def speakerID(fname, rootDirName, nMixtures, percent_of_speech):
    """Run the MATLAB routine ``PCR_main`` on one audio file.

    Args:
        fname: Path of the audio file handed to ``PCR_main``.
        rootDirName: Not used; kept so existing calls keep working.
        nMixtures: Not used; kept so existing calls keep working.
        percent_of_speech: Speech proportion of the clip.  Below 0.5 no
            identification is attempted.

    Returns:
        The value returned by ``PCR_main``, or 0 when the clip holds too
        little speech or the MATLAB call fails.
    """
    sid = 0

    if percent_of_speech < 0.5:
        # Not enough speech in the segment to perform speaker ID; return
        # before paying for a MATLAB engine start that would go unused.
        return sid

    eng = matlab.engine.start_matlab()
    try:
        # Get speaker ID result
        sid = eng.PCR_main(fname)
    except Exception as exc:
        # Best effort: report the failure and fall back to 0, but let
        # KeyboardInterrupt / SystemExit through.
        print('speaker ID failed for ' + fname + ': ' + str(exc))
    finally:
        eng.quit()

    print('speaker ID result for ' + fname + ' is ' + str(sid))
    return sid


In [None]:
def percent_of_speech(fname):
    """Return the value reported by the MATLAB routine ``absolute_silence``.

    ``fname`` is passed as both arguments of ``absolute_silence``.
    NOTE(review): presumably these are the input and output paths, meaning the
    file is cleaned of background noise in place -- confirm against the
    MATLAB source.

    Args:
        fname: Path of the audio file to analyse.

    Returns:
        Whatever ``absolute_silence`` returns; callers compare it against
        fractions such as 0.25 and 0.5.
    """
    eng = matlab.engine.start_matlab()
    try:
        # Get rid of background noises
        return eng.absolute_silence(fname, fname)
    finally:
        # Shut the engine down even when the MATLAB call raises.
        eng.quit()

In [None]:
# Disabled driver loop, kept inside a bare string so it never executes.
# As written (recording line commented out) it would call speakerID() forever
# on the module-level `fname` with a fixed speech percentage of 0.5.
'''
while(True):
    # fname = record_single_session(CHUNK, FORMAT, CHANNELS, RATE, RECORD_SECONDS)
    print(speakerID(fname, rootDirName, nMixtures, 0.5))
'''


In [None]:
#eng.plot_VAD('.//Recordings//2019-06-12-13-23-13.wav', nargout=0)

# this is the main() of the pipeline
def speech_recognition():
    """Listen on the default microphone until a phrase is transcribed, then save it.

    Each captured phrase is sent to Google Speech Recognition.  Phrases that
    cannot be transcribed are discarded and listening resumes.  The first
    phrase that is transcribed is printed and written to
    ``.//Recordings//<YYYY-MM-DD-HH-MM-SS>.wav``, after which the function
    returns.

    Only recognition failures are retried.  Any other error, such as a
    failure to write the file, propagates instead of being swallowed and
    retried forever.
    """
    output_dir = './/Recordings//'

    while True:
        r = sr.Recognizer()
        with sr.Microphone() as source:                # use the default microphone as the audio source
            audio = r.listen(source)                   # listen for the first phrase and extract it into audio data

        try:
            transcription = r.recognize_google(audio)  # recognize speech using Google Speech Recognition
        except sr.UnknownValueError:
            # speech is unintelligible: listen again
            continue
        except sr.RequestError as exc:
            # Service unreachable or request rejected: report it and keep
            # listening, as before, but no longer silently.
            print('Speech recognition request failed: ' + str(exc))
            continue

        print(transcription)
        CURRENT_TIME = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        WAVE_OUTPUT_FILENAME = replace_special_chars(CURRENT_TIME, ': ', '-') + '.wav'
        fname = output_dir + WAVE_OUTPUT_FILENAME

        # A missing folder used to raise inside the old catch-all handler and
        # send the loop round again without a trace.
        os.makedirs(output_dir, exist_ok=True)
        with open(fname, "wb") as f:
            f.write(audio.get_wav_data())
        print('Recognizable voice detected. Saved as ' + WAVE_OUTPUT_FILENAME)

        # Downstream steps, currently disabled:
        #id = speakerID(fname, rootDirName, nMixtures, percent_of_speech(fname))
        #print(id)
        #slice_into_smaller_windows(fname, 5)

        break

In [None]:
# speech_recognition()

In [None]:
# Sort the .wav files of a folder into silence / speech by their measured speech percentage
def speech_recognition_evaluation(folder, silence_dir=None, speech_dir=None, threshold=0.25):
    """Sort the ``.wav`` files of ``folder`` into silence and speech folders.

    Every file whose name ends in ``.wav`` is measured with
    ``percent_of_speech()`` and moved to ``silence_dir`` when the result is
    below ``threshold``, otherwise to ``speech_dir``.  The destination folders
    are created on demand.

    Args:
        folder: Folder to scan (not recursive).
        silence_dir: Destination for low-speech files; defaults to
            ``folder + '//silence//'``.
        speech_dir: Destination for the remaining files; defaults to
            ``folder + '//speech//'``.
        threshold: Speech proportion below which a file counts as silence.

    Returns:
        ``(silence, speech)``: two lists holding the paths the files had
        *before* they were moved.
    """
    # Destinations follow the scanned folder rather than one fixed absolute
    # path, so the function works on any machine and any folder.
    if silence_dir is None:
        silence_dir = folder + '//silence//'
    if speech_dir is None:
        speech_dir = folder + '//speech//'

    silence = []
    speech = []

    for filename in os.listdir(folder):
        if not filename.endswith('.wav'):
            continue

        audio = folder + '//' + filename
        percent = percent_of_speech(audio)

        if percent < threshold:
            bucket, target_dir = silence, silence_dir
        else:
            bucket, target_dir = speech, speech_dir

        bucket.append(audio)
        os.makedirs(target_dir, exist_ok=True)
        os.rename(audio, os.path.join(target_dir, filename))

        print(filename + ' ' + str(percent))

    return silence, speech

In [None]:

# Classify every .wav file directly inside test_audios.
# speech_recognition_evaluation() moves the files on disk as a side effect, and
# the two printed lists hold the paths the files had before they were moved.
silence, speech = speech_recognition_evaluation('C://Users//yg9ca//Desktop//PCR_pipeline//test_audios')

print(silence)
print(speech)



In [None]:

def single_audio_processing(fname):
    """Measure the speech percentage of one file, run speaker ID, print both.

    Args:
        fname: Path of the audio file to process.
    """
    speech_percentage = percent_of_speech(fname)
    # Reuse the measurement instead of calling percent_of_speech() a second
    # time: each call starts a MATLAB engine and processes the file again, so
    # the value used for speaker ID could differ from the one printed below.
    sid = speakerID(fname, rootDirName, nMixtures, speech_percentage)

    print('speech percentage = ' + str(speech_percentage))
    print('sid = ' + str(sid))
    

# single_audio_processing('C://Users//yg9ca//Desktop//PCR_pipeline//test//test_wav.wav')