/
openai.go
102 lines (83 loc) · 2.79 KB
/
openai.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
package texttospeech
import (
"context"
"fmt"
"io"
"github.com/hupe1980/golc/schema"
"github.com/sashabaranov/go-openai"
)
// Compile-time check: the build fails if *OpenAI stops satisfying the
// schema.TextToSpeech interface.
var _ schema.TextToSpeech = (*OpenAI)(nil)
// OpenAIClient describes the single OpenAI API operation this package relies
// on. The client returned by openai.NewClientWithConfig is used as one by
// NewOpenAI; any other implementation (for example a test double) can be
// supplied through NewOpenAIFromClient.
type OpenAIClient interface {
	// CreateSpeech submits a speech request and returns the response as a
	// stream, or an error.
	CreateSpeech(ctx context.Context, req openai.CreateSpeechRequest) (io.ReadCloser, error)
}
// OpenAIOptions contains options for configuring the OpenAI transformer.
// Every field is copied into the speech request built by SynthesizeSpeech.
type OpenAIOptions struct {
// Model is sent as the Model field of the speech request.
Model openai.SpeechModel
// Voice is sent as the Voice field of the speech request.
Voice openai.SpeechVoice
// ResponseFormat is sent as the ResponseFormat field of the speech request
// and also selects the schema.OutputFormat of the returned audio stream.
// SynthesizeSpeech accepts only the Mp3, Aac, Opus and Flac formats and
// returns an error for any other value.
ResponseFormat openai.SpeechResponseFormat
// Speed is sent as the Speed field of the speech request.
Speed float64
}
// DefaultOpenAIOptions provides default values for OpenAIOptions.
// NewOpenAI and NewOpenAIFromClient start from a copy of this value and then
// apply the caller's option functions to that copy.
var DefaultOpenAIOptions = OpenAIOptions{
Model: openai.TTSModel1,
Voice: openai.VoiceAlloy,
ResponseFormat: openai.SpeechResponseFormatMp3,
Speed: 1.0,
}
// OpenAI is a transformer that uses the OpenAI Text-to-Speech API to synthesize speech from text.
// Instances are created with NewOpenAI or NewOpenAIFromClient.
type OpenAI struct {
// client performs the CreateSpeech call issued by SynthesizeSpeech.
client OpenAIClient
// opts holds the settings resolved at construction time.
opts OpenAIOptions
}
// NewOpenAI builds an OpenAI transformer backed by a client created from
// openai.DefaultConfig(apiKey).
//
// The option functions are applied, in order, to a copy of
// DefaultOpenAIOptions; each one is invoked exactly once.
func NewOpenAI(apiKey string, optFns ...func(o *OpenAIOptions)) *OpenAI {
	client := openai.NewClientWithConfig(openai.DefaultConfig(apiKey))

	// NewOpenAIFromClient starts from the defaults and applies optFns itself,
	// so they are passed through unchanged.
	return NewOpenAIFromClient(client, optFns...)
}
// NewOpenAIFromClient builds an OpenAI transformer around the supplied client.
//
// The option functions are applied, in order, to a copy of
// DefaultOpenAIOptions, and the resulting options are stored on the returned
// transformer.
func NewOpenAIFromClient(client OpenAIClient, optFns ...func(o *OpenAIOptions)) *OpenAI {
	t2s := &OpenAI{
		client: client,
		opts:   DefaultOpenAIOptions,
	}

	for i := range optFns {
		optFns[i](&t2s.opts)
	}

	return t2s
}
// SynthesizeSpeech sends text to the OpenAI Text-to-Speech API and wraps the
// response in an audio stream.
//
// The configured response format is first translated to the matching
// schema.OutputFormat. If it is not one of Mp3, Aac, Opus or Flac, an error is
// returned and the API is not called. An error from the client's CreateSpeech
// call is returned unchanged.
func (t2s *OpenAI) SynthesizeSpeech(ctx context.Context, text string) (schema.AudioStream, error) {
	opts := t2s.opts

	var format schema.OutputFormat

	// Resolve the output format up front so an unsupported setting fails
	// before any request is issued.
	switch requested := opts.ResponseFormat; requested {
	case openai.SpeechResponseFormatMp3:
		format = schema.OutputFormatMP3
	case openai.SpeechResponseFormatAac:
		format = schema.OutputFormatAAC
	case openai.SpeechResponseFormatOpus:
		format = schema.OutputFormatOpus
	case openai.SpeechResponseFormatFlac:
		format = schema.OutputFormatFlac
	default:
		return nil, fmt.Errorf("unsupported response format: %s", requested)
	}

	request := openai.CreateSpeechRequest{
		Model:          opts.Model,
		Input:          text,
		Voice:          opts.Voice,
		ResponseFormat: opts.ResponseFormat,
		Speed:          opts.Speed,
	}

	body, err := t2s.client.CreateSpeech(ctx, request)
	if err != nil {
		return nil, err
	}

	return NewAudioStream(body, format), nil
}