Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AWS Polly,Transcribe #62

Merged
merged 2 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,17 @@ xattr -rd com.apple.quarantine /path/to/Polyglot.app
+ 选择“F0”价格层,并单击“创建”
+ 创建完成后,转到新服务的“概述”页面,即可查看密钥和终结点

</details>
<details>
<summary>3. aws语音服务如何申请?</summary>

要申请 AWS 语音服务的 Identity Pool ID ,可以按照以下步骤进行:

+ 登录 AWS 门户 (https://console.aws.amazon.com/)
+ 转到“Cognito 认证服务”页面并单击“Create Identity Pool”按钮
+ 在“IAM Role”窗口中,创建Role, 添加Polly, Transcribe 权限即可
+ 创建完成后,转到“概述”页面,即可查看 Identity Pool ID

</details>

<!-- ## 捐赠
Expand Down
6 changes: 6 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,19 @@
"release:version": "npx standard-version && git push origin --follow-tags"
},
"dependencies": {
"@aws-sdk/client-cognito-identity": "^3.363.0",
"@aws-sdk/client-polly": "^3.363.0",
"@aws-sdk/client-transcribe-streaming": "^3.363.0",
"@aws-sdk/credential-provider-cognito-identity": "^3.363.0",
"@iconify-json/svg-spinners": "^1.1.1",
"@vueuse/core": "^9.13.0",
"api2d": "^0.1.18",
"aws-sdk": "^2.1409.0",
"dexie": "^3.2.3",
"electron-updater": "^5.3.0",
"element-plus": "^2.3.3",
"eventsource-parser": "^0.1.0",
"microphone-stream": "^6.0.1",
"microsoft-cognitiveservices-speech-sdk": "^1.26.0",
"pinia": "^2.0.33",
"pinia-plugin-persistedstate": "^3.1.0",
Expand Down
10 changes: 10 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,16 @@ export const supportLanguageMap = {
'zh-TW': '中文(台湾普通话)',
} as Record<string, string>

// AWS regions offered in the settings UI for the Polly / Transcribe services.
// NOTE(review): not an exhaustive list of regions that support both services —
// presumably chosen for coverage of common deployments; extend as needed.
export const awsRegions = [
  'us-east-1',
  'us-east-2',
  'us-west-1',
  'us-west-2',
  'ap-east-1',
  'ap-southeast-1',
  'eu-central-1',
]

export const azureRegions = [
'australiaeast',
'australiasoutheast',
Expand Down
2 changes: 2 additions & 0 deletions src/constant.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ export const OPEN_KEY = 'openKey'
export const OPEN_PROXY = 'openProxy'
export const AZURE_REGION = 'azureRegion'
export const AZURE_KEY = 'azureKey'
export const AWS_REGION = 'awsRegion'
export const AWS_COGNITO_IDENTITY_POOL_ID = 'awsCognitoIdentityPoolId'
export const AZURE_TRANSLATE_KEY = 'azureTranslateKey'
export const VOICE_API_NAME = 'voiceApiName'
export const IS_ALWAYS_RECOGNITION = 'isAlwaysRecognition'
Expand Down
7 changes: 6 additions & 1 deletion src/hooks/useGlobalSetting.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { AUTO_PLAY, AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant'
import { AUTO_PLAY,AWS_COGNITO_IDENTITY_POOL_ID, AWS_KEY,AWS_REGION,AWS_SECRET_KEY,AZURE_KEY, AZURE_REGION, AZURE_TRANSLATE_KEY, CHAT_API_NAME, CHAT_REMEMBER_COUNT, IS_ALWAYS_RECOGNITION, OPEN_KEY, OPEN_MAX_TOKEN, OPEN_MODEL, OPEN_PROXY, SELF_AVATAR_URL, TTS_PASSWORD, VOICE_API_NAME } from '@/constant'

import { getAvatarUrl } from '@/utils'

Expand All @@ -7,6 +7,9 @@ export const useGlobalSetting = () => {
const openProxy = useLocalStorage(OPEN_PROXY, '')
const azureRegion = useLocalStorage(AZURE_REGION, 'eastasia')
const azureKey = useLocalStorage(AZURE_KEY, '')
const awsRegion = useLocalStorage(AWS_REGION, 'us-east-1')
const awsCognitoIdentityId = useLocalStorage(AWS_COGNITO_IDENTITY_POOL_ID, '')

const openModel = useLocalStorage(OPEN_MODEL, 'gpt-3.5-turbo')
const selfAvatar = useLocalStorage(SELF_AVATAR_URL, getAvatarUrl('self.png'))
const chatApiName = useLocalStorage(CHAT_API_NAME, 'openAI')
Expand All @@ -22,6 +25,8 @@ export const useGlobalSetting = () => {
openKey,
openProxy,
openModel,
awsRegion,
awsCognitoIdentityId,
azureRegion,
azureKey,
selfAvatar,
Expand Down
176 changes: 169 additions & 7 deletions src/hooks/useSpeechService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'

import MicrophoneStream from 'microphone-stream';
import { CognitoIdentityClient } from "@aws-sdk/client-cognito-identity";
import {fromCognitoIdentityPool} from "@aws-sdk/credential-provider-cognito-identity";
import { Polly,SynthesizeSpeechInput,DescribeVoicesCommand } from "@aws-sdk/client-polly";
import {
TranscribeStreamingClient,
StartStreamTranscriptionCommand,
} from '@aws-sdk/client-transcribe-streaming';


const defaultAzureRegion = import.meta.env.VITE_REGION
const defaultAzureKey = import.meta.env.VITE_SCRIPTION_KEY
const accessPassword = import.meta.env.VITE_TTS_ACCESS_PASSWORD
Expand All @@ -17,8 +27,13 @@ interface Config {
isFetchAllVoice?: boolean
}
export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'zh-CN', 'zh-HK', 'ko-KR', 'de-DE'], isFetchAllVoice = true }: Config = {}) => {
const { azureKey, azureRegion, ttsPassword } = useGlobalSetting()
const { azureKey, azureRegion, ttsPassword,voiceApiName } = useGlobalSetting()
const { awsCognitoIdentityId, awsRegion, } = useGlobalSetting()


if(voiceApiName.value==="AWS"){
isFetchAllVoice=false;
}
const resultAzureKey = computed(() => {
if (!azureKey.value) {
if (accessPassword !== ttsPassword.value)
Expand Down Expand Up @@ -58,6 +73,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
const audioBlob = ref<Blob>(new Blob())

const allVoices = ref<VoiceInfo[]>([])
const allAWSVoices = ref<any[]>([])

const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
const synthesizer = ref<SpeechSynthesizer>(new SpeechSynthesizer(speechConfig.value))
Expand All @@ -74,8 +90,28 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
immediate: true,
})

// 语音识别

// AWS polly and transcribe SDK 初始化
const audioAWS = new Audio();
let micStream: MicrophoneStream | undefined = undefined
const polly = new Polly({
region: awsRegion.value ?? "us-east-1",
credentials: fromCognitoIdentityPool({
client: new CognitoIdentityClient({ region: awsRegion.value ?? "us-east-1" }),
identityPoolId: awsCognitoIdentityId.value
}),
});

const transcribe = new TranscribeStreamingClient({
region: awsRegion.value ?? "us-east-1",
credentials: fromCognitoIdentityPool({
client: new CognitoIdentityClient({ region: awsRegion.value ?? "us-east-1" }),
identityPoolId: awsCognitoIdentityId.value
}),
});


// AZure 语音识别
const audioRecorder = async () => {
// 暂时通过 mediaRecorder 方式实现录音保存,后续可能会改为直接通过 SpeechRecognizer 实现保存

Expand Down Expand Up @@ -250,16 +286,41 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
catch (error) {
allVoices.value = []
}
const res = await synthesizer.value.getVoicesAsync()
if (res.errorDetails) {
console.error(`获取语音列表失败:${res.errorDetails}, 请检查语音配置`)
return []
}
return res.voices
}else{
return []
}

const res = await synthesizer.value.getVoicesAsync()
if (res.errorDetails) {
console.error(`获取语音列表失败:${res.errorDetails}, 请检查语音配置`)
return []

}

// 获取AWS 语音列表
async function getAWSVoices() {
const params = {
LanguageCode: "en-US"
};

try {
const data = await polly.describeVoices(params)
if(data.Voices){
allAWSVoices.value=data.Voices.map((item)=>{
return {"id":item.Id,"gender":item.Gender}
})
}
return data.Voices??[];
} catch (error) {
console.error("Error retrieving AWS voices:", error);
return [];
}
return res.voices

}


function applySynthesizerConfiguration() {
// 通过playback结束事件来判断播放结束
player.value = new SpeakerAudioDestination()
Expand All @@ -279,6 +340,100 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
synthesizer.value = new SpeechSynthesizer(speechConfig.value, speakConfig)
}

  /* AWS voice service: streaming speech-to-text via Transcribe */
  // Streams microphone audio to AWS Transcribe and invokes `cb` with each
  // transcript update as it arrives. Resolves with the last transcript seen.
  // Side effects: toggles `isRecognizing`, (re)assigns the shared `micStream`.
  const startAWSRecognizeSpeech = async (cb?: (text: string) => void) => {

    micStream = new MicrophoneStream();

    // Attach the browser microphone (prompts the user for permission).
    micStream.setStream(
      await window.navigator.mediaDevices.getUserMedia({
        video: false,
        audio: true,
      })
    );


    // Build the async audio stream consumed by Transcribe.
    isRecognizing.value = true
    // Chunks larger than this are silently dropped (Transcribe event size limit).
    const MAX_AUDIO_CHUNK_SIZE = 48000

    const audioStream = async function* () {
      for await (const chunk of micStream as unknown as Iterable<Buffer>) {
        if (chunk.length <= MAX_AUDIO_CHUNK_SIZE) {
          yield {
            AudioEvent: {
              AudioChunk: pcmEncodeChunk(chunk),
            },
          }
        }
      }
    };

    // Encode a raw Float32 microphone chunk as 16-bit little-endian PCM,
    // clamping samples to [-1, 1] before scaling.
    const pcmEncodeChunk = (chunk: any) => {
      const input = MicrophoneStream.toRaw(chunk);
      var offset = 0;
      var buffer = new ArrayBuffer(input.length * 2);
      var view = new DataView(buffer);
      for (var i = 0; i < input.length; i++, offset += 2) {
        var s = Math.max(-1, Math.min(1, input[i]));
        view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7fff, true);
      }
      return Buffer.from(buffer);
    };
    // Start the streaming transcription request.
    // NOTE(review): assumes a 44100 Hz capture rate — confirm this matches the
    // browser's actual AudioContext sample rate, or transcripts may be garbled.
    const command = new StartStreamTranscriptionCommand({
      LanguageCode: language.value,
      MediaEncoding: "pcm",
      MediaSampleRateHertz: 44100,
      AudioStream: audioStream(),
    });

    const response = await transcribe.send(command);
    let resultText = ""
    if (response.TranscriptResultStream) {
      // Each event may carry several results/alternatives; the last transcript
      // observed wins and is also forwarded to the caller via `cb`.
      for await (const event of response.TranscriptResultStream) {
        if (event.TranscriptEvent) {
          const results = event.TranscriptEvent?.Transcript?.Results;
          results?.map((result: any) => {
            (result.Alternatives || []).map((alternative: any) => {
              const transcript = alternative.Items.map((item: any) => item.Content).join(" ");
              resultText = transcript;
              cb && cb(transcript)
            });
          });
        }
      }
      isRecognizing.value = false
    }
    return resultText

  }

const stopAWSRecognizeSpeech = () => {
micStream?.stop()
}


//语音合成
const awsTextToSpeak = async (text: string, voice?: string) => {
const params: SynthesizeSpeechInput = {
Text: text,
OutputFormat: 'mp3',
VoiceId: 'Joanna', // Replace with the desired voice ID (e.g., Joanna, Matthew, etc.)
};

const response = await polly.synthesizeSpeech(params);

if (response.AudioStream) {
const buffer = await response.AudioStream.transformToByteArray();
audioAWS.src = URL.createObjectURL(new Blob([buffer], { type: 'audio/mpeg' }));
audioAWS.play();
}
}


return {
languages,
language,
Expand All @@ -289,16 +444,23 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
isRecognizReadying,
startRecognizeSpeech,
stopRecognizeSpeech,
startAWSRecognizeSpeech,
stopAWSRecognizeSpeech,
recognizeSpeech,
textToSpeak,
awsTextToSpeak,
ssmlToSpeak,
stopTextToSpeak,
getVoices,
getAWSVoices,
allVoices,
allAWSVoices,
isSynthesizing,
rate,
style,
audioBlob,
player,
audioAWS,

}
}
Loading