fix: use sdk to complete azure text-to-speech

hahahumble · Apr 21, 2023 · 6fb0ae2 · 6fb0ae2 · vercel · Apr 21, 2023
2 parents 6e196db + 4f0801a
commit 6fb0ae2
Show file tree

Hide file tree

Showing 9 changed files with 76 additions and 49 deletions.
diff --git a/src/apis/amazonPolly.ts b/src/apis/amazonPolly.ts
@@ -10,7 +10,7 @@ const speechParams = {
   Engine: 'neural',
 };
 
-export default async function generateSpeechUrl(
+export default async function speechSynthesizeWithPolly(
   text: string,
   voiceId: string = 'Matthew',
   engine: string = 'neural',

diff --git a/src/apis/azureTTS.ts b/src/apis/azureTTS.ts
@@ -1,41 +1,32 @@
-import axios, { AxiosRequestConfig } from 'axios';
+import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
+import { azureSynthesisErrorNotify } from '../components/Notification';
 
+/**
+ * Starts Azure text-to-speech playback of `text` via the Speech SDK.
+ *
+ * Audio is routed to a `SpeakerAudioDestination`, which is returned right
+ * away (synthesis is started but not awaited) so the caller can attach
+ * playback handlers and pause/resume/close it.
+ *
+ * @param subscriptionKey Azure Speech subscription key.
+ * @param region Azure region passed to `SpeechConfig.fromSubscription`.
+ * @param text Plain text to speak.
+ * @param voiceName Voice assigned to `speechSynthesisVoiceName`.
+ * @param language Language assigned to `speechSynthesisLanguage`.
+ * @returns The speaker destination the synthesized audio is played through.
+ */
-const textToSpeech = async (
+const speechSynthesizeWithAzure = async (
   subscriptionKey: string,
   region: string,
   text: string,
   voiceName: string,
   language: string
 ) => {
-  const request: AxiosRequestConfig = {
-    method: 'POST',
-    url: `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`,
-    headers: {
-      'Content-Type': 'application/ssml+xml',
-      'X-Microsoft-OutputFormat': 'riff-16khz-16bit-mono-pcm',
-      Authorization: `Bearer ${await getAccessToken(subscriptionKey, region)}`,
+  console.time('Azure speech synthesis');
+  const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, region);
+  // This config is handed to a SpeechSynthesizer, so the language belongs on
+  // the synthesis property rather than the recognition one.
+  speechConfig.speechSynthesisLanguage = language;
+  speechConfig.speechSynthesisVoiceName = voiceName;
+  const player = new sdk.SpeakerAudioDestination();
+  const audioConfig = sdk.AudioConfig.fromSpeakerOutput(player);
+  const speechSynthesizer = new sdk.SpeechSynthesizer(speechConfig, audioConfig);
+  // Not awaited: completion and failure are reported through the callbacks,
+  // and the synthesizer is closed in both of them.
+  speechSynthesizer.speakTextAsync(
+    text,
+    () => {
+      console.timeEnd('Azure speech synthesis');
+      speechSynthesizer.close();
     },
-    data: `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='${language}'><voice name='${voiceName}'>${text}</voice></speak>`,
-    responseType: 'arraybuffer',
-  };
-
-  const response = await axios(request);
-
-  return new Blob([response.data], { type: 'audio/wav' });
-};
-
-const getAccessToken = async (subscriptionKey: string, region: string) => {
-  const request: AxiosRequestConfig = {
-    method: 'POST',
-    url: `https://${region}.api.cognitive.microsoft.com/sts/v1.0/issueToken`,
-    headers: {
-      'Ocp-Apim-Subscription-Key': subscriptionKey,
-    },
-  };
-
-  const response = await axios(request);
-
-  return response.data;
+    error => {
+      console.log(error);
+      azureSynthesisErrorNotify();
+      speechSynthesizer.close();
+    }
+  );
+  return player;
 };
-
-export default textToSpeech;
+export default speechSynthesizeWithAzure;
diff --git a/src/apis/azureToken.ts b/src/apis/azureToken.ts
@@ -0,0 +1,18 @@
+import axios from 'axios';
+
+/**
+ * Requests an access token from the regional Azure `issueToken` endpoint.
+ *
+ * @param subscriptionKey Sent as the `Ocp-Apim-Subscription-Key` header.
+ * @param region Azure region used to build the endpoint host name.
+ * @returns The response body of the token request.
+ * @throws Error prefixed with "Error getting token: " when the request fails.
+ */
+export async function getAzureToken(subscriptionKey: string, region: string): Promise<string> {
+  const endpoint = `https://${region}.api.cognitive.microsoft.com/sts/v1.0/issueToken`;
+  const headers = {
+    'Ocp-Apim-Subscription-Key': subscriptionKey,
+    'Content-Type': 'application/x-www-form-urlencoded',
+  };
+
+  try {
+    // The endpoint takes an empty POST body; the key travels in the headers.
+    const { data } = await axios.post(endpoint, null, { headers });
+    return data;
+  } catch (failure) {
+    throw new Error(`Error getting token: ${failure}`);
+  }
+}
diff --git a/src/components/Notification.tsx b/src/components/Notification.tsx
@@ -111,6 +111,12 @@ export const azureSynthesisErrorNotify = () => {
   });
 };
 
+/** Shows an error toast with the localized `notification.invalid-azure-key` message. */
+export const invalidAzureKeyNotify = () => {
+  const message = i18next.t('notification.invalid-azure-key') as string;
+  toast.error(message, { style: notificationStyle });
+};
+
 // AWS
 export const awsErrorNotify = () => {
   toast.error(i18next.t('notification.polly-synthesis-error') as string, {

diff --git a/src/locales/en.json b/src/locales/en.json
@@ -157,6 +157,7 @@
     "azure-synthesis-error": "There was an error with Azure speech synthesis",
     "azure-recognition-error": "There was an error with Azure speech recognition",
     "polly-synthesis-error": "There was an error with Amazon Polly speech synthesis",
+    "invalid-azure-key": "Invalid Azure key or region",
     "cannot-be-empty": "This field cannot be empty",
     "invalid-access-code": "Invalid access code"
   }

diff --git a/src/locales/es.json b/src/locales/es.json
@@ -157,6 +157,7 @@
     "azure-synthesis-error": "Se ha producido un error con la síntesis de voz de Azure",
     "azure-recognition-error": "Se ha producido un error en el reconocimiento de voz de Azure",
     "polly-synthesis-error": "Se ha producido un error con la síntesis de voz de Amazon Polly",
+    "invalid-azure-key": "Clave o región de Azure no válida",
     "cannot-be-empty": "This field cannot be empty",
     "invalid-access-code": "Invalid access code"
   }

diff --git a/src/locales/zh-CN.json b/src/locales/zh-CN.json
@@ -157,6 +157,7 @@
     "azure-synthesis-error": "Azure 语音合成错误",
     "azure-recognition-error": "Azure 语音识别错误",
     "polly-synthesis-error": "Amazon Polly 语音合成错误",
+    "invalid-azure-key": "无效的 Azure Key 或 Region",
     "cannot-be-empty": "不能为空",
     "invalid-access-code": "无效的访问密码"
   }

diff --git a/src/pages/Home.tsx b/src/pages/Home.tsx
@@ -30,6 +30,7 @@ function Home() {
     azureRecognitionErrorNotify: Notify.azureRecognitionErrorNotify,
     awsErrorNotify: Notify.awsErrorNotify,
     emptyAzureKeyNotify: Notify.emptyAzureKeyNotify,
+    invalidAzureKeyNotify: Notify.invalidAzureKeyNotify,
     cannotBeEmptyNotify: Notify.cannotBeEmptyNotify,
     invalidAccessCodeNotify: Notify.invalidAccessCodeNotify,
   };

diff --git a/src/utils/speechSynthesis.ts b/src/utils/speechSynthesis.ts
@@ -1,5 +1,7 @@
-import generateSpeechUrl from '../apis/amazonPolly';
-import textToSpeech from '../apis/azureTTS';
+import speechSynthesizeWithPolly from '../apis/amazonPolly';
+import speechSynthesizeWithAzure from '../apis/azureTTS';
+import { SpeakerAudioDestination } from 'microsoft-cognitiveservices-speech-sdk';
+import { getAzureToken } from '../apis/azureToken';
 
 interface SpeechSynthesisOptions {
   text: string;
@@ -26,7 +28,7 @@ interface getPollyVoicesOptions {
 
 const synthesis = window.speechSynthesis;
 let pollyAudio: HTMLAudioElement | null = null;
-let azureAudio: HTMLAudioElement | null = null;
+let azureAudio: SpeakerAudioDestination | null = null;
 
 async function getPollyVoices({
   text,
@@ -36,7 +38,7 @@ async function getPollyVoices({
   accessKeyId,
   secretAccessKey,
 }: getPollyVoicesOptions) {
-  return await generateSpeechUrl(text, voiceName, engine, region, accessKeyId, secretAccessKey);
+  return await speechSynthesizeWithPolly(text, voiceName, engine, region, accessKeyId, secretAccessKey);
 }
 
 function pollyEngineName(engine: string | undefined) {
@@ -129,18 +131,24 @@ export function speechSynthesis({
           });
         break;
       case 'Azure TTS':
-        textToSpeech(secretAccessKey || '', region || 'eastus', text, voiceName, language)
-          .then(audioBlob => {
-            azureAudio = new Audio(URL.createObjectURL(audioBlob));
-            azureAudio.play().then(() => {
-              // resolve();
-            });
-            azureAudio.onended = () => {
+        if (secretAccessKey == '') {
+          reject('Azure access key is empty');
+          notify.emptyAzureKeyNotify();
+          return;
+        }
+        // Check if secret access key and region is valid
+        getAzureToken(secretAccessKey || '', region || 'eastus')
+          .then(token => {})
+          .catch(error => {
+            notify.invalidAzureKeyNotify();
+            reject(error);
+          });
+        speechSynthesizeWithAzure(secretAccessKey || '', region || 'eastus', text, voiceName, language)
+          .then(player => {
+            azureAudio = player;
+            player.onAudioEnd = () => {
               resolve();
             };
-            azureAudio.onerror = error => {
-              reject(error);
-            };
           })
           .catch(error => {
             console.error(error);
@@ -164,7 +172,7 @@ export function stopSpeechSynthesis() {
   }
   if (azureAudio) {
     azureAudio.pause();
-    azureAudio.currentTime = 0;
+    azureAudio.close();
   }
 }
 
@@ -192,6 +200,6 @@ export function resumeSpeechSynthesis() {
     pollyAudio.play();
   }
   if (azureAudio) {
-    azureAudio.play();
+    azureAudio.resume();
   }
 }