In [2]:
import librosa
import os
import scipy
from tqdm import tqdm
import soundfile as sf
import numpy as np
import pandas as pd
from natsort import natsorted
from os.path import isfile,join
from os import listdir
import shutil
import parselmouth
from parselmouth.praat import call
from google.cloud import texttospeech
from scipy.io import wavfile
import sox
from google.cloud import speech

# Import & text-to-speech operation

In [2]:
# you need a proper activated google cloud account 
# then you'll have a .json file containing references to your account (it is for billing)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="**************.json"

# everything can be found here --> https://cloud.google.com/text-to-speech

In [2]:
#here you can choose diverse parameters for the voices 
# but you'll need credentials to be able to access it 
def list_voices():
    """Lists the available voices."""
    from google.cloud import texttospeech

    client = texttospeech.TextToSpeechClient()

    # Performs the list voices request
    voices = client.list_voices()

    for voice in voices.voices:
        # Display the voice's name. Example: tpc-vocoded
        print(f"Name: {voice.name}")

        # Display the supported language codes for this voice. Example: "en-US"
        for language_code in voice.language_codes:
            print(f"Supported language: {language_code}")

        ssml_gender = texttospeech.SsmlVoiceGender(voice.ssml_gender)

        # Display the SSML Voice Gender
        print(f"SSML Voice Gender: {ssml_gender.name}")

        # Display the natural sample rate hertz for this voice. Example: 24000
        print(f"Natural Sample Rate Hertz: {voice.natural_sample_rate_hertz}\n")

In [6]:
list_voices()

Name: ar-XA-Wavenet-A
Supported language: ar-XA
SSML Voice Gender: FEMALE
Natural Sample Rate Hertz: 24000

Name: ar-XA-Wavenet-B
Supported language: ar-XA
SSML Voice Gender: MALE
Natural Sample Rate Hertz: 24000

Name: ar-XA-Wavenet-C
Supported language: ar-XA
SSML Voice Gender: MALE
Natural Sample Rate Hertz: 24000

Name: ar-XA-Wavenet-D
Supported language: ar-XA
SSML Voice Gender: FEMALE
Natural Sample Rate Hertz: 24000

Name: bn-IN-Wavenet-A
Supported language: bn-IN
SSML Voice Gender: FEMALE
Natural Sample Rate Hertz: 24000

Name: bn-IN-Wavenet-B
Supported language: bn-IN
SSML Voice Gender: MALE
Natural Sample Rate Hertz: 24000

Name: en-GB-Wavenet-A
Supported language: en-GB
SSML Voice Gender: FEMALE
Natural Sample Rate Hertz: 24000

Name: en-GB-Wavenet-B
Supported language: en-GB
SSML Voice Gender: MALE
Natural Sample Rate Hertz: 24000

Name: en-GB-Wavenet-C
Supported language: en-GB
SSML Voice Gender: FEMALE
Natural Sample Rate Hertz: 24000

Name: en-GB-Wavenet-D
Supported lang

# text-to-speech 

In [8]:
text= "Neuroscience in Marseille is awesome !"
client = texttospeech.TextToSpeechClient()

# Set the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(text=text)

# Build the voice request, select the language code ("en-US") and the ssml
# voice gender ("neutral")
voice = texttospeech.VoiceSelectionParams(
    language_code ='en-US'# 'fr-FR'
    ,name = "en-US-Wavenet-G"  , #"fr-FR-Wavenet-C",
    ssml_gender = texttospeech.SsmlVoiceGender.FEMALE
    )
# Select the type of audio file you want returned
audio_config = texttospeech.AudioConfig(
    audio_encoding = texttospeech.AudioEncoding.LINEAR16,
    speaking_rate = 1)#,
    #sample_rate_hertz=22050 )

# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(input= synthesis_input, voice=voice, audio_config=audio_config)
file_name = "/home/jeremy/Desktop/"+ str(text) + ".wav"

# The response's audio_content is binary.
with open(file_name, 'wb') as out:
    # Write the response to the output file.
    out.write(response.audio_content)
    print('Audio content written to file'+ file_name)



Audio content written to file/home/jeremy/Desktop/Neuroscience in Marseille is awesome !.wav


# Trim audio files to get accurate durations

In [13]:
# trim audio files using pysox

mypath = os.getcwd()+'/audio/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
only_names = [str(i).split(".", 1)[0] for i in onlyfiles]


for num,f in enumerate(onlyfiles):
       name_in = mypath + f
       name_out =mypath+ '/soxed/' + only_names[num] + ".wav"
       tfm = sox.Transformer()
       tfm.silence(location = 0,silence_threshold = 0.1,buffer_around_silence = False)
       tfm.build(name_in,name_out)
       print(name_out)

/home/jeremy/Desktop/clean_github_channel_capacity/code/utils/audio//soxed/Neuroscience in Marseille is awesome !.wav


# Get duration of audio stimuli

In [14]:
sampling_freq = []
di = {}
mypath = os.getcwd()+'/audio/soxed/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
only_names = [i.split(".", 1)[0] for i in onlyfiles]
for num, i in enumerate (onlyfiles):
        fs, data = wavfile.read(mypath + i)
        sampling_freq.append(np.float(fs))
        size = np.round(np.float(len(data)) / np.float(fs),3)
        di[only_names[num]] = size
duration = pd.DataFrame.from_dict(di, orient = 'index', columns = ['duration_s'])


# Compress audio stimuli

In [8]:
factors = [0.1, 0.125, 0.179, 0.233, 0.286, 0.345, 0.4, 0.455, 0.5, 1.0]
speed = ["10","8","5.6","4.3","3.5","2.9","2.5", "2.2","2","1"]


mypath = os.getcwd()+'/audio/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
only_names = [str(i).split(".", 1)[0] for i in onlyfiles]

for val,fac in enumerate(factors):    
    for num,f in enumerate(onlyfiles):       
        name_in = mypath + f
        name_out = mypath+'time_compressed/' + only_names[num] + "x" + speed[val] + ".wav"
        sound = parselmouth.Sound(name_in)
        manipulation = call(sound, "To Manipulation", 0.01, 75, 600)
        duration_tier = call(manipulation, "Extract duration tier")
        call(duration_tier, "Add point", 0, fac)
        call([manipulation, duration_tier], "Replace duration tier")
        stretched_sound = call(manipulation, "Get resynthesis (overlap-add)")
        stretched_sound.save(name_out, "WAV")

NameError: name 'parselmouth' is not defined

# speech-to-text 

In [20]:
client = speech.SpeechClient()



mypath = os.getcwd()+'/audio/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
only_names = [str(i).split(".", 1)[0] for i in onlyfiles]




for num, f in enumerate(onlyfiles):
        with open(mypath + f, 'rb') as audio_file:
            content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            language_code='en-US')
        # Detects speech in the audio file
        response = client.recognize(config=config, audio=audio)
        g_rep = []
        for p in range(len(response.results)):
            g_rep.append(response.results[p].alternatives[0].transcript)

In [21]:
g_rep

['Neuroscience in Marseille is awesome']

Thanks: @Yannick Jadoul

references :

[Pysox] R. Bittner, E. Humphrey, and J. Bello, “Pysox: Leveraging the audio signal processing power of sox in python,” in 17th Int. Soc. for Music Info. Retrieval Conf., Late Breaking and Demo Papers, New York City, NY, USA, Aug. 2016.

[PSOLA] Moulines, E., and Charpentier, F. (1990). Pitch-synchronous waveform processing techniques for text-to-speech synthesis using diphones.Speech Commun. 9, 453–467.

[PRAAT] Boersma, P., & Weenink, D. (2018). Praat: doing phonetics by computer [Computer program]. Version 6.0.37, retrieved 3 February 2018 from http://www.praat.org/

[PARSELMOUTH] Jadoul, Y., Thompson, B., & de Boer, B. (2018). Introducing Parselmouth: A Python interface to Praat. Journal of Phonetics, 71, 1-15. https://doi.org/10.1016/j.wocn.2018.07.001
