# riva-tts.py - send text to a TTS HTTP server and play the returned audio through aplay.
import sys
import requests
import av
import io
import subprocess
import itertools
# URL that synthesize_text() POSTs its synthesis requests to.
SERVER_URL = "http://192.168.70.168:5000/tts"
def chunked_response_reader(response, chunk_size=8192):
    """Yield the non-empty body chunks of a streamed HTTP response.

    Args:
        response: Object exposing ``iter_content(chunk_size)``, e.g. a
            ``requests.Response`` obtained with ``stream=True``.
        chunk_size: Number of bytes requested from ``iter_content`` per chunk.

    Yields:
        Every non-empty chunk, in the order ``iter_content`` produces them.

    A ``ChunkedEncodingError`` raised while reading ends the iteration early
    instead of propagating, so the caller can still use what was received.
    The truncation is reported on stderr rather than being hidden.
    """
    try:
        # filter(None, ...) drops empty chunks lazily, one chunk at a time.
        yield from filter(None, response.iter_content(chunk_size))
    except requests.exceptions.ChunkedEncodingError as exc:
        # Best effort: keep what arrived, but make the truncation visible.
        print(
            f"Warning: response stream ended prematurely; audio may be truncated ({exc})",
            file=sys.stderr,
        )
def _drain_to_pipe(buffer, pipe):
    """Write every byte currently held in *buffer* to *pipe*, then empty the buffer."""
    pending = buffer.getvalue()
    if pending:
        pipe.write(pending)
        buffer.seek(0)
        buffer.truncate()


def _encode_and_mux(output_container, output_stream, frame):
    """Encode *frame* (``None`` flushes the encoder) and mux the resulting packets."""
    for packet in output_stream.encode(frame):
        # Keep decode and presentation timestamps identical for the muxer.
        packet.dts = packet.pts
        output_container.mux(packet)


def play_audio_stream(response):
    """Decode the audio carried by *response* and play it through ``aplay``.

    The whole response body is read into memory, opened with PyAV,
    re-encoded to unsigned 8-bit PCM and piped to an ``aplay`` subprocess.

    Args:
        response: Streamed HTTP response, read via chunked_response_reader().

    Notes:
        * The output stream uses the source sample rate, and ``aplay`` is
          configured from the *output* stream. The declared rate and channel
          count therefore always match the bytes that are piped in.
        * The output uses the raw ``u8`` muxer, so only sample bytes reach
          ``aplay -t raw``; no container header is played as audio.
        * The ``[0]`` lookup fails when the response has no audio stream.
    """
    encoded_audio = b''.join(chunked_response_reader(response))

    with av.open(io.BytesIO(encoded_audio), mode='r') as container:
        # Get the first audio stream
        audio_stream = container.streams.audio[0]

        output_buffer = io.BytesIO()
        output_container = av.open(output_buffer, mode='w', format='u8')
        aplay_process = None
        try:
            try:
                output_stream = output_container.add_stream("pcm_u8", rate=audio_stream.rate)

                # Describe to aplay exactly what the encoder will produce.
                aplay_process = subprocess.Popen(
                    [
                        "aplay", "-t", "raw", "-f", "U8",
                        "-r", str(output_stream.rate),
                        "-c", str(output_stream.channels),
                    ],
                    stdin=subprocess.PIPE,
                )

                # Running sample count, used as the timestamp of the next frame.
                next_pts = 0
                for frame in container.decode(audio_stream):
                    frame.pts = next_pts
                    _encode_and_mux(output_container, output_stream, frame)
                    _drain_to_pipe(output_buffer, aplay_process.stdin)
                    next_pts += frame.samples

                # Flush samples the encoder may still be holding back.
                _encode_and_mux(output_container, output_stream, None)
            finally:
                output_container.close()

            # Closing may push out bytes that were still buffered by the muxer.
            _drain_to_pipe(output_buffer, aplay_process.stdin)
        finally:
            if aplay_process is not None:
                try:
                    aplay_process.stdin.close()
                finally:
                    # Always reap the child, even if closing the pipe failed.
                    aplay_process.wait()
def synthesize_text(text, voice_name="English-US.Female-1", timeout=None):
    """Ask the TTS server to synthesize *text* and play the resulting audio.

    Args:
        text: Text to be spoken; sent as the ``text`` field of the JSON body.
        voice_name: Value sent as the ``voice_name`` field. Defaults to the
            voice that used to be hard-coded.
        timeout: Forwarded to ``requests.post``. ``None`` (the default) keeps
            the previous behaviour of applying no timeout.

    Exits the process with status 1 when the server answers with any status
    code other than 200.
    """
    payload = {
        "voice_name": voice_name,
        "text": text,
    }
    headers = {"Content-Type": "application/json"}

    # The context manager releases the streamed connection on every exit path,
    # including the sys.exit() below.
    with requests.post(
        SERVER_URL, json=payload, headers=headers, stream=True, timeout=timeout
    ) as response:
        if response.status_code != 200:
            print(f"Request failed with status: {response.status_code}", file=sys.stderr)
            sys.exit(1)

        # Stream and play the audio
        play_audio_stream(response)
def main():
    """Speak the text given on the command line, or else each line of stdin.

    With exactly one command-line argument, that argument is synthesized.
    With any other argument count, stdin is read line by line and every line
    that is non-empty after stripping whitespace is synthesized in turn.
    """
    if len(sys.argv) != 2:
        # map/filter are lazy, so each line is spoken as soon as it is read.
        for sentence in filter(None, map(str.strip, sys.stdin)):
            synthesize_text(sentence)
        return

    synthesize_text(sys.argv[1])
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()