feat: audio support record (#29)
liuxinqi authored and liou666 committed May 6, 2023
1 parent 0674a39 commit 1235b52
Showing 5 changed files with 126 additions and 16 deletions.
44 changes: 35 additions & 9 deletions src/hooks/useSpeechService.ts
@@ -2,6 +2,7 @@ import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
import {
AudioConfig,
CancellationErrorCode,
+ ResultReason, // CI warning (GitHub Actions build 16.x, ubuntu-latest & windows-latest): 'ResultReason' is defined but never used
SpeakerAudioDestination,
SpeechConfig,
SpeechRecognizer,
@@ -52,6 +53,10 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
// const isFetchAllVoices = ref(false) // whether the full voice list is being fetched
const rate = ref(1) // speech rate, range (0,2]

let mediaRecorder: MediaRecorder | null
const chunks: Blob[] = []
const audioBlob = ref<Blob>(new Blob())

const allVoices = ref<VoiceInfo[]>([])

const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
@@ -89,13 +94,29 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
immediate: true,
})

- // watch([azureKey, azureRegion], () => {
- //   if (isFetchAllVoice && allVoices.value.length === 0)
- //     getVoices()
- // })

// speech recognition
- const startRecognizeSpeech = (cb?: (text: string) => void) => {

const audioRecorder = async () => {
// Recording is saved via MediaRecorder for now; this may later change to saving directly through SpeechRecognizer

const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
mediaRecorder = new MediaRecorder(stream)

mediaRecorder.ondataavailable = (e) => {
chunks.push(e.data)
}

mediaRecorder.onstop = (e) => {
const blob = new Blob(chunks, { type: 'audio/wav' })
audioBlob.value = blob
mediaRecorder = null
chunks.length = 0
}

mediaRecorder.start()
}

+ const startRecognizeSpeech = async (cb?: (text: string) => void) => {
isRecognizReadying.value = true

recognizer.value.canceled = () => {
Expand All @@ -105,9 +126,10 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
console.log('Recognize result: ', e.result.text)
cb && cb(e.result.text)
}
- recognizer.value.recognizing = (s, e) => {
- console.log('Recognize recognizing', e.result.text)
+ recognizer.value.recognizing = (s, event) => {
+ console.log('Recognize recognizing', event.result.text)
}

recognizer.value.sessionStopped = (s, e) => {
console.log('\n Session stopped event.')
recognizer.value.stopContinuousRecognitionAsync()
@@ -122,7 +144,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
isRecognizing.value = false
}

- recognizer.value.startContinuousRecognitionAsync(() => {
+ recognizer.value.startContinuousRecognitionAsync(async () => {
+ await audioRecorder()
isRecognizing.value = true
isRecognizReadying.value = false
console.log('Recognize...')
@@ -137,6 +160,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z

// stop speech recognition
const stopRecognizeSpeech = (): Promise<void> => {
+ mediaRecorder!.stop()

isRecognizReadying.value = false
return new Promise((resolve, reject) => {
recognizer.value.stopContinuousRecognitionAsync(() => {
@@ -267,5 +292,6 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
allVoices,
isSynthesizing,
rate,
+ audioBlob,
}
}
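
For orientation, a minimal consumer-side sketch (not part of this commit) of the new recording flow, assuming the hook's existing call signature and the exported audioBlob ref:

    // Sketch only: starting recognition now also starts the MediaRecorder.
    const { startRecognizeSpeech, stopRecognizeSpeech, audioBlob } = useSpeechService({ isFetchAllVoice: false })

    await startRecognizeSpeech(text => console.log('recognized:', text))
    // ... user speaks ...
    await stopRecognizeSpeech() // calls mediaRecorder.stop(); the onstop handler then fills audioBlob
    // Note: onstop fires asynchronously, so audioBlob.value may populate a tick later.
    console.log('recorded bytes:', audioBlob.value.size)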
64 changes: 59 additions & 5 deletions src/pages/Home/components/Content.vue
@@ -1,7 +1,7 @@
<script setup lang="ts">
import Button from '@/components/Button.vue'
import { generatTranslate, generateText } from '@/server/api'
- import { verifyOpenKey } from '@/utils'
+ import { base64ToBlob, blobToBase64, verifyOpenKey } from '@/utils'
import { useConversationStore } from '@/stores'
interface Translates {
@@ -27,6 +27,7 @@ const {
stopRecognizeSpeech,
ssmlToSpeak,
isSynthesizing,
+ audioBlob,
} = useSpeechService({ langs: store.allLanguage as any, isFetchAllVoice: false })
// states
@@ -106,10 +107,11 @@ async function onSubmit() {
store.changeConversations([
...currentChatMessages.value,
- { content: message.value, role: 'user' },
+ { content: message.value, role: 'user', audioBlob: await blobToBase64(audioBlob.value) },
])
+ const tempCurrentChatMessages = currentChatMessages.value.map(x => ({ content: x.content, role: x.role })) // audioBlob must be stripped from the outgoing request
const systemMessage = currentChatMessages.value[0]
- const relativeMessage = [...chatMessages.value, { content: message.value, role: 'user' }].slice(-(Number(chatRememberCount.value))) // keep only the most recent messages
+ const relativeMessage = [...tempCurrentChatMessages, { content: message.value, role: 'user' }].slice(-(Number(chatRememberCount.value))) // keep only the most recent messages
const prompts = [systemMessage, ...relativeMessage] as ChatMessage[]
message.value = ''
@@ -134,13 +136,40 @@ async function onSubmit() {
store.changeLoading(false)
}
// assistant speak
function speak(content: string, index: number) {
restartAudio()
if (isPlaying.value || isSynthesizing.value) return
speakIndex.value = index
text.value = content
ssmlToSpeak(content)
}
// user speak
let audio = new Audio()
function restartAudio() {
audio.pause()
audio.currentTime = 0
isPlaying.value = false
// audio.play()
}
function userSpeak(audioData: string, index: number) {
if (isPlaying.value || isSynthesizing.value) return
speakIndex.value = index
audio = new Audio(URL.createObjectURL(base64ToBlob(audioData)))
audio.play()
audio.onplay = () => {
isPlaying.value = true
}
audio.onended = () => {
isPlaying.value = false
speakIndex.value = -1
}
}
const recognize = async () => {
try {
console.log('isRecognizing', isRecognizing.value)
@@ -194,15 +223,15 @@ const translate = async (text: string, i: number) => {
<div class="w-10 h-10">
<img w-full h-full object-fill rounded-full :src="item.role === 'user' ? selfAvatar : currentAvatar" alt="">
</div>

<div style="flex-basis:fit-content" mx-2>
<p p-2 my-2 chat-box>
{{ item.content }}
</p>
- <p v-show="item.role === 'assistant' && translates[item.content + i]?.isShow " p-2 my-2 chat-box>
+ <p v-show=" translates[item.content + i]?.isShow " p-2 my-2 chat-box>
{{ translates[item.content + i]?.result }}
</p>
<!-- assistant -->
<p v-if="item.role === 'assistant'" mt-2 flex>
<template v-if="speakIndex !== i">
<span class="chat-btn" @click="speak(item.content, i)">
Expand All @@ -224,6 +253,31 @@ const translate = async (text: string, i: number) => {
<i icon-btn i-eos-icons:bubble-loading />
</span>
</p>
<!-- user -->
<p v-else mt-2 flex>
<template v-if="item.audioBlob">
<template v-if="speakIndex !== i">
<span class="chat-btn" @click="userSpeak(item.audioBlob, i)">
<i icon-btn rotate-270 i-ic:sharp-wifi />
</span>
</template>
<template v-else>
<span v-if="isPlaying" class="chat-btn" @click="restartAudio()">
<i icon-btn rotate-270 i-svg-spinners:wifi-fade />
</span>
<span v-else class="chat-btn" @click="userSpeak(item.audioBlob, i)">
<i icon-btn rotate-270 i-ic:sharp-wifi />
</span>
</template>
</template>
<span v-if="!isTranslating || translateIndex !== i" ml-1 class="chat-btn" @click="translate(item.content, i)">
<i icon-btn i-carbon:ibm-watson-language-translator />
</span>
<span v-else ml-1 class="chat-btn">
<i icon-btn i-eos-icons:bubble-loading />
</span>
</p>
</div>
</div>
</template>
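A note on the playback path above: userSpeak rebuilds a Blob from the stored base64 string and plays it through an object URL. As a hedged refinement sketch (not in this commit), the object URL could be revoked once playback ends so blob references are not accumulated:

    // Sketch only: same flow as userSpeak, but releasing the object URL when done.
    const url = URL.createObjectURL(base64ToBlob(audioData))
    audio = new Audio(url)
    audio.play()
    audio.onended = () => {
      isPlaying.value = false
      speakIndex.value = -1
      URL.revokeObjectURL(url) // free the blob reference after playback
    }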
4 changes: 2 additions & 2 deletions src/stores/index.ts
@@ -24,7 +24,7 @@ export interface Conversation {
key: Key // unique identifier
name: string // display name
desc: string
- chatMessages: ChatMessage[] // chat messages
+ chatMessages: ChatMessageWithAudioUrl[] // chat messages
language: string // tts stt
voice: string // see https://aka.ms/speech/tts-languages
avatar: string // user avatar
@@ -66,7 +66,7 @@ export const useConversationStore = defineStore('conversation', {
},
},
actions: {
- changeConversations(chatMessages: ChatMessage[]) {
+ changeConversations(chatMessages: ChatMessageWithAudioUrl[]) {
this.chatMessages(this.currentKey)!.chatMessages = chatMessages
},
changeCurrentKey(key: Key) {
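A brief note on the design choice: storing the recording as a base64 string rather than a Blob keeps chatMessages JSON-serializable, which matters if the store is ever persisted; the trade-off is roughly 33% size overhead from base64 encoding. A tiny sketch, using only the store API visible in this diff:

    // Sketch only: messages survive a JSON round-trip because audioBlob is a plain string.
    const saved = JSON.stringify(store.chatMessages(store.currentKey)?.chatMessages)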
4 changes: 4 additions & 0 deletions src/types..d.ts
@@ -27,3 +27,7 @@ interface ImagePayload {
n?: number
size?: string
}

interface ChatMessageWithAudioUrl extends ChatMessage {
audioBlob?: string // base64 data URL produced by blobToBase64
}
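
For illustration, a hypothetical object of this shape (values are made up; it assumes ChatMessage carries the role/content fields used elsewhere in this diff):

    const msg: ChatMessageWithAudioUrl = {
      role: 'user',
      content: 'Bonjour !',
      audioBlob: 'data:audio/wav;base64,UklGRi...', // truncated; produced by blobToBase64()
    }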
26 changes: 26 additions & 0 deletions src/utils/tools.ts
@@ -25,3 +25,29 @@ export async function fetchWithTimeout(

return response
}

export function blobToBase64(blob: Blob) {
return new Promise<string>((resolve, reject) => {
const reader = new FileReader()
if (blob.size === 0)
return resolve('')

reader.readAsDataURL(blob)
reader.onload = function () {
const dataUrl = reader.result
resolve(dataUrl!.toString())
}
reader.onerror = reject
})
}

export function base64ToBlob(dataUrl: string) {
const arr = dataUrl.split(',')
const mime = arr[0].match(/:(.*?);/)![1]
const bstr = atob(arr[1])
let n = bstr.length
const u8arr = new Uint8Array(n)
while (n--)
u8arr[n] = bstr.charCodeAt(n)
return new Blob([u8arr], { type: mime })
}
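
The two helpers above are inverses. A quick round-trip sketch (assuming a non-empty recording Blob; this mirrors how Content.vue stores and replays audio):

    const wav: Blob = audioBlob.value                 // e.g. the recording captured by useSpeechService
    const dataUrl = await blobToBase64(wav)           // 'data:audio/wav;base64,...'
    const restored = base64ToBlob(dataUrl)            // Blob rebuilt with the original MIME type
    const playbackUrl = URL.createObjectURL(restored) // usable as `new Audio(playbackUrl)`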
