In [1]:
# install PyAudio http://people.csail.mit.edu/hubert/pyaudio/
# pip install pyaudio
import pyaudio
import wave
import sys

# Default settings shared by the recording and playback helpers below.
# Number of frames read/written per iteration (also passed as frames_per_buffer when recording).
CHUNK = 1024
# PyAudio sample format constant used when opening the input stream (16-bit integer samples).
FORMAT = pyaudio.paInt16
# Number of audio channels used for recording (1 = mono).
CHANNELS = 1
# Sampling rate passed to the input stream and written to the WAV header.
RATE = 22050

def play_stream(wave_stream, chunk=None):
    """Play an already opened wave reader through an audio output stream.

    Args:
        wave_stream: A reader such as the object returned by
            ``wave.open(path, 'rb')``. It must provide ``getsampwidth()``,
            ``getnchannels()``, ``getframerate()`` and ``readframes(n)``.
            Frames are consumed from its current position; the reader is
            NOT closed by this function.
        chunk: Number of frames to read and write per iteration. When
            ``None`` (the default) the module-level ``CHUNK`` is used.

    The PyAudio instance and the output stream are always released, even if
    reading or writing fails part-way through.
    """
    frames_per_write = CHUNK if chunk is None else chunk
    print("Playing...", end='')
    p = pyaudio.PyAudio()
    try:
        out_stream = p.open(format=p.get_format_from_width(wave_stream.getsampwidth()),
                            channels=wave_stream.getnchannels(),
                            rate=wave_stream.getframerate(),
                            output=True)
        try:
            data = wave_stream.readframes(frames_per_write)
            # readframes() returns an empty bytes object once the data is exhausted
            while data:
                out_stream.write(data)
                data = wave_stream.readframes(frames_per_write)
            out_stream.stop_stream()
        finally:
            out_stream.close()
    finally:
        p.terminate()
    print("played")


def record_to_file(filename, seconds=5):
    """Record audio from an input stream and save it as a WAV file.

    The recording uses the module-level ``FORMAT``, ``CHANNELS``, ``RATE``
    and ``CHUNK`` settings.

    Args:
        filename: Path of the WAV file to create (overwritten if it exists).
        seconds: Approximate recording length. The actual length is rounded
            down to a whole number of ``CHUNK``-sized reads.

    The input stream and the PyAudio instance are always released, and the
    WAV file is always closed, even when an error occurs.
    """
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive,
        # instead of calling into it after terminate().
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("Start recording... ")
            chunk_count = int(RATE / CHUNK * seconds)
            frames = [stream.read(CHUNK) for _ in range(chunk_count)]
            print("...recorded", seconds, "second(s)")
            stream.stop_stream()
        finally:
            stream.close()
    finally:
        p.terminate()

    # The context manager guarantees the header is finalized and the file closed.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

In [2]:
# Google Cloud API key (sent as the X-Goog-Api-Key header in the request cell).
# It is read from a local file so the secret is not hard-coded in the notebook.
with open("go.gcp.id") as f:
    # strip() removes the trailing newline that readline() keeps; a newline
    # inside an HTTP header value is rejected by the HTTP client.
    api_key = f.readline().strip()

In [5]:
# simple docs: https://cloud.google.com/text-to-speech/docs/quickstart-protocol
# when enabling the API, just create an API key

import requests
import base64

url = "https://texttospeech.googleapis.com/v1beta1/text:synthesize"

request_data = {
    'input':{
        'text':'Android is a mobile operating system developed by Google.'
    },
    'voice':{
        'languageCode':'en-gb',
        'name':'en-GB-Standard-A',
        'ssmlGender':'FEMALE'
    },
    'audioConfig':{
        'audioEncoding':'MP3'
    }
}

headers = {
    # strip() guards against a trailing newline left over from reading the key file
    "X-Goog-Api-Key": api_key.strip(),
    "Content-Type": "application/json"
}


def _post_synthesize(payload):
    """POST ``payload`` to the synthesize endpoint and return the response.

    Raises ``requests.HTTPError`` on a non-2xx status so that an API error
    (bad key, quota, invalid voice) is reported as such instead of surfacing
    later as ``KeyError: 'audioContent'``.
    """
    # Without a timeout a stalled connection would hang the kernel indefinitely.
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    response.raise_for_status()
    return response


# First request: British female voice, MP3 output.
resp = _post_synthesize(request_data)
sound = resp.json()['audioContent']  # base64-encoded audio
_bytes = base64.b64decode(sound)
with open('datasets/sound/google.t2s.mp3', 'wb') as f:
    f.write(_bytes)

# Second request: US male voice, LINEAR16 output saved as .wav and played back.
request_data['audioConfig']['audioEncoding'] = 'LINEAR16'
# https://cloud.google.com/text-to-speech/docs/voices
request_data['voice']['languageCode'] = 'en-US'
request_data['voice']['name'] = 'en-US-Standard-B'
request_data['voice']['ssmlGender'] = 'MALE'

resp = _post_synthesize(request_data)
sound = resp.json()['audioContent']
_bytes = base64.b64decode(sound)
with open('datasets/sound/google.t2s.wav', 'wb') as f:
    f.write(_bytes)
# Close the wave reader once playback finishes instead of leaking the handle.
with wave.open('datasets/sound/google.t2s.wav', 'rb') as wf:
    play_stream(wf)

Playing...played


In [4]:
# Alternative to the raw REST calls: the Google Cloud client library.
# pip install --upgrade google-cloud-texttospeech
from google.cloud import texttospeech

# NOTE: a credentials file has to be set up before the client code below can run.

# client = texttospeech.TextToSpeechClient()
# synthesis_input = texttospeech.types.SynthesisInput(text="Hello, World!")

# Names of the voice genders the library knows about.
gender_names = [member.name for member in texttospeech.enums.SsmlVoiceGender]
print("Genders:", gender_names)
# voice = texttospeech.types.VoiceSelectionParams(
#     language_code='en-US',
#     ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)

# Names of the audio encodings the library knows about.
encoding_names = [member.name for member in texttospeech.enums.AudioEncoding]
print("Encodings:", encoding_names)
# audio_config = texttospeech.types.AudioConfig(audio_encoding=texttospeech.enums.AudioEncoding.MP3)
# response = client.synthesize_speech(synthesis_input, voice, audio_config)

# with open('datasets/sound/google.t2s.api.mp3', 'wb') as out:
#     # Write the response to the output file.
#     out.write(response.audio_content)

Genders: ['SSML_VOICE_GENDER_UNSPECIFIED', 'MALE', 'FEMALE', 'NEUTRAL']
Encodings: ['AUDIO_ENCODING_UNSPECIFIED', 'LINEAR16', 'MP3', 'OGG_OPUS']
