# Text to speech model using elevenlabs api
Objectives:
1. Output speech from text using elevenlabs api　✅
2. Combine the audio files ✅
3. Train particular voice　✅

In [1]:
from pathlib import Path
import os
from dotenv import load_dotenv

dotenv_path = Path("__file__").resolve().parents[1].parents[0] / '.local.env'
load_dotenv(dotenv_path)
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
data_path = Path("__file__").resolve().parents[1].parents[0] / "local_data"

In [2]:
audio_file_path = Path(data_path) / "rakugo_v1_short.mp3"

audio_file_paths = [str(audio_file_path)]

In [3]:
from elevenlabs import clone, generate, play

voice = clone(
    api_key=ELEVENLABS_API_KEY,
    name="rakugo_v1",
    files=audio_file_paths,
)

In [4]:
audio = generate(
    api_key=ELEVENLABS_API_KEY,
    text="おけです！あとで連絡するね！",
    voice=voice,
    model="eleven_multilingual_v2",
)

In [5]:
test_segments = [
    {'text': '練習したような小刻みの白書がございます 旬風て一ノ助と申しましてまぁ', 'start': 16.0, 'end': 23.0},
    {'text': '嘘つきは泥棒の始まりなんと まあねほんと猫立ちの悪い嘘つくよりはちょいと間抜けな泥棒の方が', 'start': 23.0, 'end': 30.8799991607666},
    {'text': 'かわいいがあるようでございますおしまいしまいこっち来いしまいはい親分なんかご用ですかよですかじゃないよね', 'start': 30.8799991607666, 'end': 38.08000183105469}
]

sentences = [
    'まるで訓練されているかのような詳細な報告があります。僕の名前は一ノ助と申します。',
    '嘘をつく人は、最終的には盗人になるものだ。だから、上手く嘘をつけない人よりも、ちょっとばかばかしい泥棒の方がそれなりに魅力があるのかもしれない。',
    'これで終わりです。こっちにこい、終わりだよ。はい、親分は何か用があるの？ただの用だけではなくて。'
]

In [6]:
audios = []
for sentence in sentences:
    audio = generate(
        api_key=ELEVENLABS_API_KEY,
        text=sentence,
        voice=voice,
        model="eleven_multilingual_v2",
        output_format="mp3_44100_128"
    )
    audios.append(audio)

In [7]:
play(audios[0])

In [8]:
play(audios[1])

In [9]:
play(audios[2])

In [10]:
temp_res_file_path = Path(data_path) / "temp_results"
os.makedirs(temp_res_file_path, exist_ok=True)

In [11]:
# save generated audio to mp3
for i, audio in enumerate(audios):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}.mp3"
    with open(audio_file_path, "wb") as f:
        f.write(audio)
    print(f"audio file saved to {audio_file_path}")

audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_0.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_1.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_2.mp3


In [12]:
# speed up audio according to segments start and end

from pydub import AudioSegment
from pydub.effects import speedup

for i, segment in enumerate(test_segments):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}.mp3"
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    audio_dur = len(audio) / 1000
    speed = round(audio_dur / (segment['end'] - segment['start']), 1)
    if 0.8 <= speed <= 1.1:
        speed = 1
        speeded_audio = audio
    else:
        speeded_audio = speedup(audio, speed)
    print(f"speeded audio with speed {speed}")
    speeded_audio.export(temp_res_file_path / f"rakugo_v1_{i}_speeded.mp3", format="mp3")
    print(f"audio file saved")

speeded audio with speed 1
audio file saved
speeded audio with speed 1.4
audio file saved
speeded audio with speed 1
audio file saved


In [13]:
# merge speeded audio according to segments start and end
# use the first segment start time as the start time of the merged audio
# fill first silence with original audio
for i, segment in enumerate(test_segments):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}_speeded.mp3"
    original_audio_file_path = data_path / "rakugo_v1_short.mp3"
    original_audio = AudioSegment.from_file(original_audio_file_path, format="mp3")
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    current_time = segment['start'] * 1000
    if i == 0:
        fill_dur = current_time
        fill = original_audio[:fill_dur]
        merged_audio = fill + audio
    else:
        silence_dur = segment['start'] * 1000 - current_time
        silence = AudioSegment.silent(duration=silence_dur)
        merged_audio = merged_audio + silence + audio
    current_time = len(merged_audio)

merged_audio.export(temp_res_file_path / f"rakugo_v1_merged.mp3", format="mp3")

<_io.BufferedRandom name='/Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_merged.mp3'>