## Speech to Text
---

#### Import Libraries

In [1]:
import os
import io
import time
import pandas as pd
import numpy as np
from pydub import AudioSegment
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

In [2]:
# Let long transcript strings render in full instead of being truncated
pd.set_option('display.max_colwidth', 1000)

#### Set Google Speech API Key

In [3]:
# Point the Google client library at a service-account key file.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the placeholder path below only applies
# when nothing has been configured outside the notebook.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/YOUR-USER/Documents/Future/GOOGLE/YOUR-PROJECT.json',
)
client = speech.SpeechClient()

#### Check Number of Files Under File Path

In [4]:
# Collect the '.wav' files in the conversion folder. os.listdir returns
# entries in arbitrary order, so the names are sorted to keep wav_files[0]
# (and any other positional access) stable between runs.
path = '../testing/wav_converted_files/'
wav_files = sorted(name for name in os.listdir(path) if name.endswith('.wav'))
print(f'Number of Files: {len(wav_files)}')

Number of Files: 79


#### Test a Specific Audio File

In [5]:
# Load the first listed recording and report its basic audio properties
sound_file = AudioSegment.from_file(
    f'../testing/wav_converted_files/{wav_files[0]}',
    format="wav",
)

print(
    f'Sample Width: {sound_file.sample_width}',
    f'Channel Count: {sound_file.channels}',
    f'Duration: {len(sound_file) / 1000}s',
    f'Sample Rate: {sound_file.frame_rate}',
    sep='\n',
)

Sample Width: 2
Channel Count: 1
Duration: 5.45s
Sample Rate: 22050


#### Speech to Text - Google API + Streets Context

In [6]:
# Restore every variable that previous notebooks persisted with %store.
# NOTE(review): `streets_list`, which google_speech_to_text reads, is not
# defined anywhere in this notebook, so it presumably comes from this
# restore - confirm it is stored upstream before running on a fresh kernel.
%store -r

In [7]:
def google_speech_to_text(path, phrases=None):
    '''Transcribe every '.wav' file in a folder with the Google Speech API.

    Parameters
    ----------
    path : str
        Folder to scan. A trailing path separator is optional.
    phrases : list of str, optional
        Phrase hints sent to the API as speech context. When omitted, the
        notebook-level ``streets_list`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per '.wav' file, processed in sorted filename order, with
        the columns ``transcripts`` (top alternative of every result,
        concatenated) and ``confidence`` (mean confidence of those
        alternatives, NaN when nothing was recognised). Both columns are
        present even when the folder holds no '.wav' file.
    '''

    # One dictionary per transcribed file
    list_results = []

    # Client and config are identical for every file, so they are built once,
    # and only when the first '.wav' file is actually found
    client = None
    config = None

    # Sorted so that row order and the printed file numbers are reproducible
    for n, file in enumerate(sorted(os.listdir(path))):

        # Select only the files with extension '.wav'
        if not file.endswith('.wav'):
            continue

        t1 = time.time()

        if client is None:
            client = speech.SpeechClient()
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=22050,
                language_code='en-US',
                audio_channel_count=1,
                model='video',
                speech_contexts=[
                    {'phrases': streets_list if phrases is None else phrases}
                ],
            )

        # Load the audio into memory; os.path.join copes with a missing
        # trailing separator on `path`
        with io.open(os.path.join(path, file), 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

        # Detect speech in the audio file
        response = client.recognize(config=config, audio=audio)

        # Keep the top alternative of every result that carries text
        transcript_parts = []
        list_confidence = []
        for result in response.results:

            # A result without alternatives has nothing to contribute
            if not result.alternatives:
                continue

            best = result.alternatives[0]

            # Only keep alternatives whose transcript length is greater than 0
            if len(best.transcript) > 0:
                transcript_parts.append(best.transcript)
                list_confidence.append(best.confidence)

        list_results.append({
            'transcripts': ''.join(transcript_parts),
            # Explicit NaN avoids numpy's empty-slice RuntimeWarning
            'confidence': np.mean(list_confidence) if list_confidence else np.nan,
        })

        # Print RunTime
        print(f'File {n} RunTime: {round(time.time() - t1, 2)}s')

    # Fixed column list keeps the schema stable even with zero rows
    return pd.DataFrame(list_results, columns=['transcripts', 'confidence'])

#### Create DataFrame from Speech To Text

In [8]:
# Transcribe every converted .wav file (one API request per file)
df = google_speech_to_text(path='../testing/wav_converted_files/')

File 0 RunTime: 8.68s
File 1 RunTime: 4.26s
File 2 RunTime: 5.84s
File 3 RunTime: 4.22s
File 4 RunTime: 5.44s
File 5 RunTime: 6.09s
File 6 RunTime: 7.92s
File 7 RunTime: 27.17s
File 8 RunTime: 5.91s
File 9 RunTime: 16.46s
File 10 RunTime: 7.4s
File 11 RunTime: 23.72s
File 12 RunTime: 9.09s
File 13 RunTime: 38.26s
File 15 RunTime: 10.95s
File 16 RunTime: 6.15s
File 17 RunTime: 5.05s
File 18 RunTime: 10.27s
File 19 RunTime: 5.3s
File 20 RunTime: 5.75s
File 21 RunTime: 7.47s
File 22 RunTime: 4.81s
File 23 RunTime: 9.08s
File 24 RunTime: 5.09s
File 25 RunTime: 8.04s
File 26 RunTime: 6.36s
File 27 RunTime: 5.33s
File 28 RunTime: 11.47s
File 29 RunTime: 7.03s
File 30 RunTime: 13.3s
File 31 RunTime: 5.56s
File 32 RunTime: 7.95s
File 33 RunTime: 7.1s
File 34 RunTime: 5.89s
File 35 RunTime: 12.9s
File 36 RunTime: 25.9s
File 37 RunTime: 31.24s
File 38 RunTime: 9.7s
File 39 RunTime: 8.45s
File 40 RunTime: 11.4s
File 41 RunTime: 6.86s
File 42 RunTime: 12.88s
File 43 RunTime: 6.11s
File 44 RunTime:

#### Drop NaN Values / Blank Transcripts

In [9]:
# Discard rows that contain missing values and renumber the index from zero
df = df.dropna().reset_index(drop=True)

In [10]:
# Remove colons, then trim surrounding whitespace from each transcript
df['transcripts'] = df['transcripts'].str.replace(':', '', regex=False).str.strip()

#### Check Mean Confidence on Speech to Text

In [11]:
# Average recognition confidence across all remaining transcripts
round(df['confidence'].mean(), 5)

0.78996

#### Tokenize Transcripts

In [12]:
# Lower-case each transcript and split it on runs of whitespace. Splitting on
# a single ' ' would emit empty-string tokens wherever two spaces meet (for
# example where a ':' was stripped out) and for blank transcripts.
df['tokens'] = df['transcripts'].str.lower().str.split()

In [14]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,transcripts,confidence,tokens
0,we going to call Joe at the corner of BP's and Taraval,0.807508,"[we, going, to, call, joe, at, the, corner, of, bp's, and, taraval]"
1,Flora ways for a 14 year old male conscious breathing laceration to the inside of his,0.832525,"[flora, ways, for, a, 14, year, old, male, conscious, breathing, laceration, to, the, inside, of, his]"
2,40th the Safeway Taraval 730 Taraval top of seventies and eighties is possibly related and that because of EMA,0.79367,"[40th, the, safeway, taraval, 730, taraval, top, of, seventies, and, eighties, is, possibly, related, and, that, because, of, ema]"
3,for 390 building across the hide in marking the language,0.766427,"[for, 390, building, across, the, hide, in, marking, the, language]"
4,negative about we split up Partners it's my music Concourse I am at a company G work,0.827363,"[negative, about, we, split, up, partners, it's, my, music, concourse, i, am, at, a, company, g, work]"


#### Save Clean DataFrame to .csv

In [15]:
# Persist the cleaned transcripts. index_label=False still writes the index
# column but leaves its header field out.
# NOTE(review): if the index is not needed downstream, index=False may have
# been intended - confirm against the notebooks that read this file.
df.to_csv(path_or_buf='../data/transcripts.csv', index_label=False)