# "Azure speech recognition for Irish"
> "Authentication is a bit of a pain; recognition is fantastic, but let down by the inverse text normalisation of numbers"

- toc: false
- branch: master
- comments: true
- categories: [azure, irish, asr]

In [67]:
%%capture
!pip install azure-cognitiveservices-speech
!pip install youtube-dl

In [None]:
%%capture
!youtube-dl https://www.youtube.com/watch?v=cfjdfaqWY3Y

In [1]:
import azure.cognitiveservices.speech as speechsdk

Use either Key 1 or Key 2 of your Speech resource: on the [Azure Portal](https://portal.azure.com/), open "Keys and Endpoint" from the menu on the left-hand side of the screen.

In [None]:
from getpass import getpass

# getpass() does not echo what is typed, so the subscription key does not end
# up in the saved cell output the way it does with input().
_SUBS = getpass('put your subscription key here: ')

In [3]:
# Azure region name; passed as `region=` to SpeechConfig and interpolated into
# the REST hostnames used by the curl cells.
_LOC='westeurope'

In [4]:
# Shared SDK configuration, built from the subscription key and the region.
speech_config = speechsdk.SpeechConfig(
    subscription=_SUBS,
    region=_LOC,
)


In [None]:
!wget https://upload.wikimedia.org/wikipedia/commons/6/60/MSF_chapter_3.ogg https://upload.wikimedia.org/wikipedia/commons/e/ee/MSF_chapter_4.ogg https://upload.wikimedia.org/wikipedia/commons/b/b3/MSF_chapter_5.ogg https://upload.wikimedia.org/wikipedia/commons/2/21/MSF_chapter_6.ogg https://upload.wikimedia.org/wikipedia/commons/7/71/MSF_chapter_7.ogg https://upload.wikimedia.org/wikipedia/commons/d/d5/MSF_chapter_8.ogg

In [None]:
!ffmpeg -i MSF_chapter_5.ogg -acodec pcm_s16le -ac 1 -ar 16000 MSF_chapter_5.wav

In [20]:
# Recognise Irish and request word-level timestamps in the results.
speech_config.speech_recognition_language = 'ga-IE'
speech_config.request_word_level_timestamps()
# Named enum member for the value 1 (the detailed output format).
speech_config.output_format = speechsdk.OutputFormat.Detailed
# NOTE(review): endpoint_id is being given the token-issuing URL here; confirm
# against the SDK documentation that this is what the property expects.
speech_config.endpoint_id = f'https://{_LOC}.api.cognitive.microsoft.com/sts/v1.0/issuetoken'

In [40]:
# https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/python/console/speech_sample.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
import time
import json
def speech_recognize_continuous_from_file(speech_config, filename):
    """Run continuous speech recognition over an audio file and save the results.

    Every final recognition result is appended, as one line of JSON, to a file
    named after the input with its extension replaced by ``.json``.
    Intermediate hypotheses, session events and cancellation details are
    printed. The call blocks until the session stops or is cancelled.

    Args:
        speech_config: ``speechsdk.SpeechConfig`` used to build the recogniser.
        filename: path of the audio file to recognise.
    """
    import os  # local import so this cell does not depend on another cell's imports

    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    # splitext() swaps only the final extension, so an input that does not end
    # in ".wav" can never map onto itself and get JSON appended to the audio.
    outfilename = os.path.splitext(filename)[0] + '.json'

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language='ga-IE', audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def cancelled(evt):
        """callback that prints why recognition was cancelled, plus error details if any"""
        cancellation_details = evt.result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # The context manager closes the output file even if recognition raises.
    # UTF-8 is explicit because the transcripts contain accented Irish letters.
    with open(outfilename, 'a', encoding='utf-8') as outfile:

        def recognised(evt):
            """callback that appends the raw JSON of a final result as one line"""
            outfile.write('{}\n'.format(evt.result.json))

        # Connect callbacks to the events fired by the speech recognizer
        speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
        speech_recognizer.recognized.connect(recognised)
        speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
        speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
        speech_recognizer.canceled.connect(cancelled)
        # stop continuous recognition on either session stopped or canceled events
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        # Start continuous speech recognition and poll until a stop event arrives
        speech_recognizer.start_continuous_recognition()
        try:
            while not done:
                time.sleep(.5)
        finally:
            # Always stop the recogniser, including on KeyboardInterrupt.
            speech_recognizer.stop_continuous_recognition()

In [41]:
# Transcribe chapters 3-8, one WAV file at a time.
chapter_wavs = [f'MSF_chapter_{i}.wav' for i in "345678"]
for wav_path in chapter_wavs:
    speech_recognize_continuous_from_file(speech_config, wav_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
RECOGNIZING: SpeechRecognitionEventArgs(session_id=a2608c3e4eca400096948ca387ab593c, result=SpeechRecognitionResult(result_id=b6cfd91490274f34a4fb0a3db510af66, text="nuair a chonaic 6 cad é an gnó a bhí don", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=a2608c3e4eca400096948ca387ab593c, result=SpeechRecognitionResult(result_id=5258c6a2374f4e41ab1b2a40483bb573, text="nuair a chonaic 6 cad é an gnó a bhí don cad a bhí", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=a2608c3e4eca400096948ca387ab593c, result=SpeechRecognitionResult(result_id=fed59bd81f5845f69754ad270098b0fc, text="nuair a chonaic 6 cad é an gnó a bhí don cad a bhí bhí", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=a2608c3e4eca400096948ca387ab593c, result=SpeechRecognitionResult(result_id=0fb547ef62554b98b119281e383e2e01, text

Debugging with `curl`

In [None]:
 # Request an access token from the regional issueToken endpoint, authenticating with the subscription key.
 # NOTE(review): -v prints the request headers, including the subscription key, into the cell output; clear the output before sharing the notebook.
 !curl -v -X POST "https://{_LOC}.api.cognitive.microsoft.com/sts/v1.0/issueToken" -H "Ocp-Apim-Subscription-Key: {_SUBS}" -H "Content-type: application/x-www-form-urlencoded" -H "Content-Length: 0"

In [45]:
# Bearer token sent in the Authorization header of the recognition request below.
# Left empty here; presumably pasted in from the issueToken response (confirm).
_TOK=''

In [None]:
# POST cfjdfaqWY3Y.wav (declared as 16 kHz PCM WAV in the Content-type header) to the regional
# speech-to-text REST endpoint with language=ga-IE, authenticating with the bearer token in _TOK.
# The file must exist in the working directory.
!curl -v -X POST "https://{_LOC}.stt.speech.microsoft.com/speech/recognition/interactive/cognitiveservices/v1?language=ga-IE" -H "Authorization: Bearer {_TOK}" -H "Transfer-Encoding: chunked" -H "Content-type: audio/wav; codec=audio/pcm; samplerate=16000" --data-binary @cfjdfaqWY3Y.wav

Next step: get at the innards of the detailed JSON results, i.e. the display text, ITN text, confidence and per-word details (TODO).

In [None]:
import json
import logging

# Module-level accumulators filled by parse_azure_result(), one entry per
# successfully parsed result (words is a flat list across all results).
transcript_display_list = []
transcript_ITN_list = []
confidence_list = []
words = []

def parse_azure_result(evt):
    """Extract the best hypothesis from a recognition event into the accumulators.

    Parses ``evt.result.json``, picks the ``NBest`` entry with the highest
    ``Confidence`` (the first one on ties) and appends the top-level
    ``DisplayText`` plus that entry's ``Confidence`` and ``ITN`` to the
    module-level lists; the entry's ``Words`` (if present) extend ``words``.
    Results that carry no ``NBest`` entries are skipped, so the lists stay
    aligned with each other.

    Args:
        evt: event object exposing the result JSON string as ``evt.result.json``.
    """
    # The original referenced an undefined `logger`, so every call raised
    # NameError; fetch a logger from the standard library instead.
    logging.getLogger(__name__).debug(evt)

    response = json.loads(evt.result.json)
    nbest = response.get('NBest')
    if not nbest:
        # Nothing to record for results without hypotheses.
        return

    best = max(nbest, key=lambda item: item['Confidence'])
    # Read every field before appending so a malformed result cannot leave
    # the lists with different lengths.
    display_text = response['DisplayText']
    confidence = best['Confidence']
    itn_text = best['ITN']
    best_words = best.get('Words', [])

    transcript_display_list.append(display_text)
    confidence_list.append(confidence)
    transcript_ITN_list.append(itn_text)
    words.extend(best_words)
    
