In [1]:
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output
from google.cloud import texttospeech
from pydub import AudioSegment
from IPython.display import Audio
import tempfile



In [2]:
from IPython.display import HTML

# HTML + CSS markup for a scrollable reference table (IPA phoneme, example
# word, IPA transcription) followed by a short legend of stress symbols.
# This is only the markup string; nothing is rendered until it is passed to
# a display call.
phoneme_table_html = """
<style>
.phoneme-table {
    border-collapse: collapse;
    width: 100%;
    font-family: sans-serif;
    margin-top: 1em;
}
.phoneme-table th, .phoneme-table td {
    border: 1px solid #ccc;
    padding: 6px 10px;
    text-align: center;
}
.phoneme-table th {
    background-color: #f2f2f2;
}
.phoneme-table-container {
    max-height: 300px;
    overflow-y: scroll;
    border: 1px solid #ccc;
    margin-top: 10px;
}
</style>

<div class="phoneme-table-container">
<table class="phoneme-table">
    <tr>
        <th>IPA Phoneme</th>
        <th>Example Word</th>
        <th>IPA Transcription</th>
    </tr>
    <tr><td>p</td><td>popular</td><td>ˈpɑːpjəlɚ</td></tr>
    <tr><td>b</td><td>bubble</td><td>ˈbʌbəl</td></tr>
    <tr><td>t</td><td>tinker</td><td>ˈtɪŋkɚ</td></tr>
    <tr><td>d</td><td>dog</td><td>ˈdɑːg</td></tr>
    <tr><td>k</td><td>crown</td><td>ˈkɹaʊn</td></tr>
    <tr><td>ɡ</td><td>gravely</td><td>ˈgɹeɪˌvliː</td></tr>
    <tr><td>m</td><td>mapping</td><td>ˈmæpəŋ</td></tr>
    <tr><td>n</td><td>nine</td><td>ˈnaɪn</td></tr>
    <tr><td>ŋ</td><td>bank</td><td>ˈbæŋk</td></tr>
    <tr><td>f</td><td>frog</td><td>ˈfɹɑːg</td></tr>
    <tr><td>v</td><td>valve</td><td>ˈvælv</td></tr>
    <tr><td>s</td><td>massage</td><td>məˈsɑːʒ</td></tr>
    <tr><td>z</td><td>zoom</td><td>ˈzuːm</td></tr>
    <tr><td>θ</td><td>thigh</td><td>ˈθaɪ</td></tr>
    <tr><td>ð</td><td>mother</td><td>ˈmʌðɚ</td></tr>
    <tr><td>ʃ</td><td>shopping</td><td>ˈʃɑːpəŋ</td></tr>
    <tr><td>ʒ</td><td>leisure</td><td>ˈliːʒɚ</td></tr>
    <tr><td>h</td><td>mahogany</td><td>məˈhɑːgəˌniː</td></tr>
    <tr><td>l</td><td>lately</td><td>ˈleɪtˌliː</td></tr>
    <tr><td>ɹ</td><td>roaring</td><td>ˈɹɔːɹəŋ</td></tr>
    <tr><td>ʧ</td><td>changed</td><td>ˈʧeɪnʤd</td></tr>
    <tr><td>ʤ</td><td>magenta</td><td>məˈʤɛntə</td></tr>
    <tr><td>j</td><td>younger</td><td>ˈjʌŋgɚ</td></tr>
    <tr><td>w</td><td>whirlwind</td><td>ˈwɚlˌwɪnd</td></tr>
    <tr><td>æ</td><td>cat</td><td>ˈkæt</td></tr>
    <tr><td>ɑː</td><td>cot</td><td>ˈkɑːt</td></tr>
    <tr><td>ə</td><td>again</td><td>əˈgɛn</td></tr>
    <tr><td>ɚ</td><td>bird</td><td>ˈbɚd</td></tr>
    <tr><td>ɛ</td><td>bed</td><td>ˈbɛd</td></tr>
    <tr><td>ɪ</td><td>kit</td><td>ˈkɪt</td></tr>
    <tr><td>iː</td><td>unique</td><td>ˌjuːˈniːk</td></tr>
    <tr><td>ɔː</td><td>more</td><td>ˈmɔːɹ</td></tr>
    <tr><td>ʊ</td><td>could</td><td>ˈkʊd</td></tr>
    <tr><td>uː</td><td>school</td><td>ˈskuːl</td></tr>
    <tr><td>ʌ</td><td>pulse</td><td>ˈpʌls</td></tr>
    <tr><td>aɪ</td><td>price</td><td>ˈpɹaɪs</td></tr>
    <tr><td>aʊ</td><td>flower</td><td>ˈflaʊɚ</td></tr>
    <tr><td>eɪ</td><td>shade</td><td>ˈʃeɪd</td></tr>
    <tr><td>ɔɪ</td><td>choice</td><td>ˈʧɔɪs</td></tr>
    <tr><td>oʊ</td><td>boat</td><td>ˈboʊt</td></tr>
</table>
</div>

<h4>Stress Symbols</h4>
<ul>
    <li><strong>ˈ</strong> – Primary stress</li>
    <li><strong>ˌ</strong> – Secondary stress</li>
    <li><strong>.</strong> – Syllable boundary</li>
</ul>
"""



In [3]:

# Language selector: each option is a (visible label, language code) pair and
# the widget's value is the language code.
lang_dropdown = widgets.Dropdown(
    description='Language:',
    options=[
        ('English (US)', 'en-US'),
        ('Irish (IE)', 'ga-IE'),
    ],
    value='en-US',
)

# Voice gender selector: the chosen string is later resolved by name against
# texttospeech.SsmlVoiceGender.
gender_dropdown = widgets.Dropdown(
    description='Gender:',
    options=['FEMALE', 'MALE', 'NEUTRAL'],
    value='FEMALE',
)

# Free-form SSML editor, pre-filled with a minimal valid document.
ssml_input = widgets.Textarea(
    description='SSML:',
    placeholder='Paste SSML here (including IPA)',
    value="<speak>Hello world!</speak>",
    layout=widgets.Layout(height='150px', width='100%'),
)

# Container that receives the audio player (or an error message).
output_area = widgets.Output()

# Button that triggers synthesis and playback.
play_button = widgets.Button(
    icon='play',
    description='Play',
    tooltip='Generate and play TTS audio',
    button_style='success',
)

# TTS playback function
def synthesize_and_play_ssml(ssml_text, lang_code, gender):
    """Synthesize SSML with Google Cloud Text-to-Speech and return an audio player.

    Args:
        ssml_text: SSML markup to synthesize (e.g. ``<speak>...</speak>``).
        lang_code: Language code passed to the voice selection (e.g. ``'en-US'``).
        gender: Name of a ``texttospeech.SsmlVoiceGender`` member
            (e.g. ``'FEMALE'``).

    Returns:
        An ``IPython.display.Audio`` object embedding the synthesized audio.

    Raises:
        AttributeError: If ``gender`` is not a member name of
            ``texttospeech.SsmlVoiceGender``.
        Exception: Any error raised by the Text-to-Speech client propagates
            unchanged to the caller.
    """
    client = texttospeech.TextToSpeechClient()

    synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=lang_code,
        ssml_gender=getattr(texttospeech.SsmlVoiceGender, gender),
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    )

    response = client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=audio_config,
    )

    # LINEAR16 responses already carry a WAV header, so the bytes can be handed
    # to the player directly. This avoids writing a delete=False temporary
    # file on every call that was never removed afterwards.
    return Audio(data=response.audio_content)

# Click handler for the Play button
def on_play_clicked(b):
    """Synthesize the current SSML and show the player inside ``output_area``.

    The previous contents of the output area are cleared first. Any exception
    raised during synthesis or display is caught and reported as a printed
    ``Error: ...`` line in the same output area.

    Args:
        b: The button instance supplied by ipywidgets; not used.
    """
    with output_area:
        clear_output(wait=True)
        # Snapshot the widget state before attempting synthesis.
        request = (ssml_input.value, lang_dropdown.value, gender_dropdown.value)
        try:
            display(synthesize_and_play_ssml(*request))
        except Exception as err:
            print(f"Error: {err}")

# Wire the click handler to the Play button.
play_button.on_click(on_play_clicked)

# Assemble the layout: selectors side by side on the first row, then the SSML
# editor, the Play button and the output area stacked underneath.
ui = widgets.VBox(
    children=[
        widgets.HBox(children=[lang_dropdown, gender_dropdown]),
        ssml_input,
        play_button,
        output_area,
    ]
)

display(ui)

VBox(children=(HBox(children=(Dropdown(description='Language:', options=(('English (US)', 'en-US'), ('Irish (I…

In [None]:
# Reference table of valid phonemes for US-English TTS on Google Cloud.
# Source: https://cloud.google.com/text-to-speech/docs/phonemes
display(HTML(data=phoneme_table_html))

IPA Phoneme,Example Word,IPA Transcription
p,popular,ˈpɑːpjəlɚ
b,bubble,ˈbʌbəl
t,tinker,ˈtɪŋkɚ
d,dog,ˈdɑːg
k,crown,ˈkɹaʊn
ɡ,gravely,ˈgɹeɪˌvliː
m,mapping,ˈmæpəŋ
n,nine,ˈnaɪn
ŋ,bank,ˈbæŋk
f,frog,ˈfɹɑːg
