-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpeechRecognition.py
174 lines (143 loc) · 6.52 KB
/
SpeechRecognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import speech_recognition as sr
import logging
import time
import sys
import math
import pyaudio
from queue import Queue
from numpy import linspace,sin,pi,int16
from threading import Thread
import googleapiclient
def str2bool(v):
    """Interpret a config-style value as a boolean.

    Returns True when *v*, converted to text, stripped of surrounding
    whitespace and lower-cased, is one of "yes", "true", "t" or "1";
    returns False for anything else.

    Non-string input (for example a value that is already a ``bool``) is
    accepted by going through ``str()`` first instead of failing on the
    missing ``.lower()`` method.
    """
    return str(v).strip().lower() in ("yes", "true", "t", "1")
class SpeechRecognizer(object):
    """Listen on a named microphone and transcribe phrases on a worker thread.

    Audio captured by ``startRecognition`` is put on ``audio_queue``; the
    background ``recognize_worker`` turns each item into text and publishes
    it on both ``raw_results_queue`` and the caller-supplied
    ``rawTextQueue``. A ``None`` item on any of these queues is the
    end-of-stream marker.

    Settings are read from the ``speech-recognizer`` section of the config
    parser: ``pause_threshold``, ``phrase_threshold``,
    ``non_speaking_duration``, ``dynamic_energy_threshold``,
    ``adjust_ambient`` and ``phrase_limit``.
    """

    # Placeholders for the optional cloud engines; override on the class or
    # on an instance before using engine='google_cloud' / engine='bing'.
    GOOGLE_CLOUD_SPEECH_CREDENTIALS = "\nCONTENTS OF GOOGLE CREDENTIALS JSON HERE\n"
    BING_KEY = "BING_KEY_HERE"

    def __init__(self, sourceName, configParser, rawTextQueue):
        """Set up the recognizer, the microphone, the queues and the threads.

        Args:
            sourceName: device name used to look up the input device.
            configParser: object whose ``items('speech-recognizer')`` yields
                the (key, value) settings listed in the class docstring.
            rawTextQueue: queue that receives every recognized phrase and a
                final ``None`` when recognition stops.
        """
        super(SpeechRecognizer, self).__init__()
        self.source_name = sourceName
        self.config = dict(configParser.items('speech-recognizer'))
        self.thresholds = {
            'pause_threshold': float(self.config['pause_threshold']),
            'phrase_threshold': float(self.config['phrase_threshold']),
            'non_speaking_duration': float(self.config['non_speaking_duration']),
            'dynamic_energy_threshold': str2bool(self.config['dynamic_energy_threshold']),
        }
        self.raw_text_queue = rawTextQueue
        # Defined up front so stopRecognition() is safe before the first start.
        self.to_recognize = False

        # Create the logger. The handler is attached only once: the logger
        # is shared per module, so adding one per instance would duplicate
        # every log line.
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        if not self.logger.handlers:
            # StreamHandler() without arguments writes to stderr.
            log_handler_std = logging.StreamHandler()
            log_handler_std.setLevel(logging.DEBUG)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            log_handler_std.setFormatter(formatter)
            self.logger.addHandler(log_handler_std)
        self.logger.debug('config: %s', self.config)

        # initialize recognizer
        self.recognizer = sr.Recognizer()
        self.setThresholds(self.thresholds)

        # initialize audio source
        # NOTE(review): getAudioSourceIndex() returns -1 when the device is
        # not found and that value is passed straight to sr.Microphone --
        # confirm that failing there is the intended behavior.
        self.source = sr.Microphone(self.getAudioSourceIndex())

        # audio jobs waiting for recognition, and the recognized phrases
        self.audio_queue = Queue()
        self.raw_results_queue = Queue()

        # initialize the recognizer thread
        self.recognize_thread = Thread(target=self.recognize_worker)
        self.recognize_thread.daemon = True

        # initialize the printer thread (not started by startRecognition)
        self.print_thread = Thread(target=self.printResults)
        self.print_thread.daemon = True

    def recognize(self, audio, engine='google'):
        """Transcribe *audio* with the selected engine and return the result.

        Args:
            audio: audio data as produced by ``self.recognizer.listen``.
            engine: 'google' (default), 'google_cloud' or 'bing'; any other
                value selects Sphinx.

        Exceptions raised by the underlying recognizer call are propagated.
        """
        if engine == 'google':
            return self.recognizer.recognize_google(audio)
        if engine == 'google_cloud':
            return self.recognizer.recognize_google_cloud(
                audio, credentials_json=self.GOOGLE_CLOUD_SPEECH_CREDENTIALS)
        if engine == 'bing':
            return self.recognizer.recognize_bing(audio, key=self.BING_KEY)
        return self.recognizer.recognize_sphinx(audio)

    def _publish(self, text):
        """Put one recognized phrase on both output queues."""
        self.raw_results_queue.put(text)
        self.raw_text_queue.put(text)

    def _transcribe(self, audio):
        """Recognize one audio job, falling back to Sphinx on a request error."""
        try:
            self._publish(self.recognize(audio))
        except sr.UnknownValueError:
            # unintelligible audio: nothing to publish
            pass
        except sr.RequestError as e:
            self.logger.error(
                "Could not request results from service; %s, retrying with sphinx", e)
            # try a different service
            try:
                self._publish(self.recognize(audio, 'sphinx'))
            except sr.UnknownValueError:
                pass
            except sr.RequestError as fallback_error:
                # The fallback can fail too; drop this phrase instead of
                # letting the exception kill the worker thread.
                self.logger.error("Sphinx fallback failed; %s", fallback_error)

    def recognize_worker(self):
        """Consume ``audio_queue`` until a ``None`` item arrives.

        Runs in the background thread. Every item taken from the queue is
        marked done, even when recognition fails, so that
        ``audio_queue.join()`` in ``startRecognition`` cannot block forever.
        """
        while True:
            audio = self.audio_queue.get()  # next job from the listening thread
            try:
                if audio is None:
                    break  # the listening thread is done
                self._transcribe(audio)
            except Exception:
                # Thread boundary: log and keep serving later jobs.
                self.logger.exception('unexpected error while recognizing audio')
            finally:
                self.audio_queue.task_done()

    def startRecognition(self):
        """Listen for phrases until ``stopRecognition`` is called, then shut down.

        Blocks the calling thread. On exit it waits for the queued audio to
        be processed, puts ``None`` on ``audio_queue``, ``raw_results_queue``
        and ``raw_text_queue``, and joins the worker thread.
        """
        self.to_recognize = True
        # start the thread
        self.recognize_thread.start()
        self.logger.info('recognition started')
        # parsed once instead of on every loop iteration
        phrase_limit = int(self.config['phrase_limit'])
        # start listening
        with self.source as source:
            if str2bool(self.config['adjust_ambient']):
                self.logger.debug('adjust for ambient noise')
                self.recognizer.adjust_for_ambient_noise(source)
            # repeatedly listen for phrases and queue the resulting audio
            while self.to_recognize:
                self.audio_queue.put(
                    self.recognizer.listen(source, phrase_time_limit=phrase_limit))
        # gracefully stop the recognition
        self.audio_queue.join()  # block until all current audio jobs are done
        self.audio_queue.put(None)  # tell the recognize_thread to stop
        self.raw_results_queue.put(None)
        self.raw_text_queue.put(None)
        self.recognize_thread.join()  # wait for the recognize_thread to stop
        # The printer thread is optional; joining a thread that was never
        # started raises RuntimeError, so only wait for it when it runs.
        if self.print_thread.is_alive():
            self.print_thread.join()

    def stopRecognition(self):
        """Ask the listening loop in ``startRecognition`` to exit."""
        self.to_recognize = False

    def printResults(self):
        """Print phrases from ``raw_results_queue`` until ``None`` arrives."""
        self.logger.info('printing the raw results')
        while True:
            result = self.raw_results_queue.get()
            self.raw_results_queue.task_done()
            if result is None:
                break
            print(result, end=' ')
            sys.stdout.flush()

    def setThresholds(self, thresholds):
        """Copy each (name, value) pair of *thresholds* onto the recognizer.

        Args:
            thresholds: mapping of recognizer attribute name to its value.
        """
        for key, value in thresholds.items():
            # setattr, not ``self.recognizer.key = value``: the latter sets
            # an attribute literally called "key" and applies nothing.
            setattr(self.recognizer, key, value)
            self.logger.debug('%s = %s', key, value)

    def getAudioSourceIndex(self, name=''):
        """Return the index of the input device called *name*, or -1.

        Args:
            name: device name to search for; an empty string (the default)
                means ``self.source_name``.

        Only devices of host API 0 that report at least one input channel
        are considered. If several devices share the name, the last one
        wins.
        """
        if name == '':
            name = self.source_name
        p = pyaudio.PyAudio()
        try:
            numdevices = p.get_host_api_info_by_index(0).get('deviceCount')
            self.logger.debug('number of devices: %s', numdevices)
            idx = -1
            for i in range(numdevices):
                device = p.get_device_info_by_host_api_device_index(0, i)
                if device.get('maxInputChannels') > 0:
                    if device.get('name') == name:
                        # NOTE(review): i is the index within host API 0;
                        # confirm it matches the index sr.Microphone expects.
                        idx = i
                    self.logger.debug(
                        "Input Device id " + str(i) + " - " + device.get('name'))
        finally:
            # release the PortAudio resources held by this instance
            p.terminate()
        if idx != -1:
            self.logger.info('%s found', name)
        else:
            self.logger.error('%s source not found', name)
        return idx