# Speech to Speech Flow

## Speech to Text
Objectives:
1. Transcribe text ✅
2. Preserve timestamp ✅

In [1]:
from pathlib import Path
import pytube as pt
from openai import OpenAI
import os
from dotenv import load_dotenv

# The project root is taken to be the grandparent of the kernel's working
# directory; everything below is addressed relative to it.
project_root = Path.cwd().resolve().parents[1]

# Load API keys from the local, untracked env file into the process environment.
dotenv_path = project_root / '.local.env'
load_dotenv(dotenv_path)

# NOTE(review): `OpenAI()` is later constructed without arguments, so it
# presumably picks the key up from the OPENAI_API_KEY environment variable;
# confirm whether this class-level assignment is still needed.
OpenAI.api_key = os.getenv("OPENAI_API_KEY")
data_path = project_root / "local_data"

In [2]:
# Source video whose audio track is downloaded and transcribed in this notebook.
YOUTUBE_VIDEO_URL = "https://www.youtube.com/watch?v=g_ltie2ZGNY"

In [3]:
# Download the audio track of the source video into the local data directory.
yt = pt.YouTube(YOUTUBE_VIDEO_URL)

# `.first()` yields None when nothing matches, so a missing audio stream is
# reported explicitly instead of surfacing as a bare IndexError.
stream = yt.streams.filter(only_audio=True).first()
if stream is None:
    raise RuntimeError(f"No audio-only stream available for {YOUTUBE_VIDEO_URL}")

# Make sure the target directory exists before anything is written into it.
data_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): the first audio-only stream is not necessarily MP3-encoded, so
# the ".mp3" suffix may not match the real container; later cells read this
# file through pydub's `from_file` -- confirm the format is detected correctly.
audio_file_path = Path(data_path) / "rakugo_v1.mp3"
stream.download(filename=audio_file_path)

'/Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1.mp3'

In [4]:
# Keep only the first 45 seconds of the downloaded audio.
from pydub import AudioSegment

# AudioSegment slicing works in milliseconds.
CLIP_DURATION_MS = 45 * 1000
# Legacy alias: the value has always been 45 s despite the name. Kept only so
# that any cell still referring to `two_minutes` keeps working.
two_minutes = CLIP_DURATION_MS

audio = AudioSegment.from_file(audio_file_path)
audio = audio[:CLIP_DURATION_MS]

shortened_audio_file_path = Path(data_path) / "rakugo_v1_short.mp3"
# export() returns the opened output file; close it so the handle is not leaked.
audio.export(shortened_audio_file_path, format="mp3").close()

<_io.BufferedRandom name='/Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1_short.mp3'>

In [5]:
# Transcribe the shortened clip with Whisper, requesting segment-level timestamps.
client = OpenAI()

shortened_audio_file_path = Path(data_path) / "rakugo_v1_short.mp3"

# Use a context manager so the audio file is closed once the request is done
# (the original left the handle open for the lifetime of the kernel).
with open(shortened_audio_file_path, "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
        # verbose_json is the response format that carries per-segment metadata.
        response_format="verbose_json",
        timestamp_granularities=["segment"],
    )

In [6]:
# Sanity check: number of returned segments and a peek at the first two.
segment_count = len(transcript.segments)
print(segment_count, "segments")
print(transcript.segments[0:2])

5 segments
[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 15.5600004196167, 'text': 'me', 'tokens': [50364, 1398, 51142], 'temperature': 0.20000000298023224, 'avg_logprob': -0.6036391854286194, 'compression_ratio': 1.0190476179122925, 'no_speech_prob': 0.2711162269115448}, {'id': 1, 'seek': 0, 'start': 16.040000915527344, 'end': 23.0, 'text': 'a 練習したような小刻みの白書がございます 春風って市の助と申しましてまぁ', 'tokens': [51166, 64, 220, 47027, 34025, 8533, 17010, 3203, 7322, 45500, 11362, 2972, 13558, 29801, 5142, 43808, 220, 46953, 22713, 6102, 27261, 2972, 37618, 3193, 3526, 111, 45349, 8822, 37566, 51514], 'temperature': 0.20000000298023224, 'avg_logprob': -0.6036391854286194, 'compression_ratio': 1.0190476179122925, 'no_speech_prob': 0.2711162269115448}]


In [7]:
# Remove ASCII alphanumerics (and the music-note emoji) from each transcript
# segment, then keep only non-empty segments that last long enough.
import re

# Segments at or below this duration (seconds) are discarded.
MIN_SEGMENT_SECONDS = 2.5
# Characters stripped from the transcript text before it is used downstream.
NON_SPEECH_CHARS = re.compile(r'[a-zA-Z0-9🎶]')


def clean_segments(raw_segments, min_duration=MIN_SEGMENT_SECONDS):
    """Return cleaned copies of the transcript segments worth keeping.

    Args:
        raw_segments: iterable of mappings with "text", "start" and "end"
            keys ("start"/"end" in seconds).
        min_duration: a segment is kept only if it lasts strictly longer
            than this many seconds.

    Returns:
        A list of dicts with "text", "start" and "end" keys. The text has the
        characters matched by NON_SPEECH_CHARS removed and is stripped of
        surrounding whitespace; segments whose text ends up empty are dropped.
    """
    cleaned = []
    for raw in raw_segments:
        text = NON_SPEECH_CHARS.sub('', raw["text"]).strip()
        start, end = raw["start"], raw["end"]
        if text and (end - start) > min_duration:
            cleaned.append({"text": text, "start": start, "end": end})
    return cleaned


segments = clean_segments(transcript.segments)

# Preview at most the first five kept segments.
for segment in segments[:5]:
    print(segment)

{'text': '練習したような小刻みの白書がございます 春風って市の助と申しましてまぁ', 'start': 16.040000915527344, 'end': 23.0}
{'text': '嘘つきは泥棒の始まりなんとか言って まあねほんと猫立ちの悪い嘘つくよりはちょいと招けな泥棒の方が', 'start': 23.0, 'end': 30.8799991607666}
{'text': '買い代わりようでございますおしまいしまい こっち来いしまいはい親分なんかご用ですかよですかじゃないよね', 'start': 30.8799991607666, 'end': 38.08000183105469}
{'text': 'おめのこと仲間がなぁなんて言ってか知ってんのかねあら見込みがねからさっさと足 荒らして敵に戻したらどうだってみんな濡れると', 'start': 38.08000183105469, 'end': 47.15999984741211}


In [8]:
# # find the possible pause time between words (top 10% of pause times)
# pause_times = []
# for i in range(len(transcript.words) - 1):
#     pause_times.append(transcript.words[i+1]["start"] - transcript.words[i]["end"])

# # find the top 10% of pause times
# pause_times.sort()
# top_20_pause_times = pause_times[-int(len(pause_times) * 0.05):]
# average_pause_time = sum(top_20_pause_times) / len(top_20_pause_times)
# print(average_pause_time)


In [9]:
# # visualize the time between words, present the cut-off point
# import matplotlib.pyplot as plt
# import numpy as np

# timestamps = [word["start"] for word in transcript.words]
# durations = np.diff(timestamps)
# plt.hist(durations, bins=100)
# plt.axvline(2*average_pause_time, color="red", linestyle="--")
# plt.xlabel("Duration (s)")
# plt.ylabel("Count")
# plt.show()

In [10]:
# # group words into sentences (based on timestamps)
# # or if end in "ます" or "です"　but not "ですよ"
# sentences = []
# sentence = ""
# timestamps_start = []
# timestamps_end = []
# for word in transcript.words:
#     if sentence == "":
#         timestamps_start.append(word["start"])
#     sentence += word["word"]
#     if word["end"] - word["start"] > average_pause_time*2:
#         sentences.append(sentence)
#         sentence = ""
#         timestamps_end.append(word["end"])

# print(len(sentences), "sentences found")
# print(sentences[:5])
# print(timestamps_start[:5])
# print(timestamps_end[:5])

## Text to text
Objectives
1. Easify the text ✅
2. Try to preserve the length of text ✅

In [11]:
# Text of every kept segment, in segment order.
sentences = [entry["text"] for entry in segments]

In [12]:
# JLPT level interpolated into the simplification prompt.
JLPT_LEVEL = "N4"

# Join the segments into a numbered list, one entry per line:
# 1. segment1
# 2. segment2
# ...
numbered_lines = (
    f"{number}. {sentence}" for number, sentence in enumerate(sentences, start=1)
)
input_text = "\n".join(numbered_lines)
print(input_text)

1. 練習したような小刻みの白書がございます 春風って市の助と申しましてまぁ
2. 嘘つきは泥棒の始まりなんとか言って まあねほんと猫立ちの悪い嘘つくよりはちょいと招けな泥棒の方が
3. 買い代わりようでございますおしまいしまい こっち来いしまいはい親分なんかご用ですかよですかじゃないよね
4. おめのこと仲間がなぁなんて言ってか知ってんのかねあら見込みがねからさっさと足 荒らして敵に戻したらどうだってみんな濡れると


In [13]:
# System prompt (Japanese): a rakugo passage and a JLPT level are given; simplify
# the passage using vocabulary suited to that level, changing vocabulary only and
# leaving sentence structure and meaning untouched.
system_prompt = """
    日本の落語の一節と日本語能力試験（JLPT）のレベルが与えられます。\n
    その節を、指定されたJLPTレベルに適した語彙を使って簡単にしてください。\n
    語彙のみを変更してください。文の構造や意味は変更しないでください。\n
     """

# User prompt: the target JLPT level followed by the numbered passage.
user_prompt = f"""
    JLPTレベル: {JLPT_LEVEL}\n
     落語の節:\n
     {input_text}
     """

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

completion = client.chat.completions.create(model="gpt-4", messages=chat_messages)


In [14]:
# Display the full message object of the first returned choice.
completion.choices[0].message

ChatCompletionMessage(content='\n    1. 練習したような小さいメモがあります。春風って人と自分を紹介して、まあ\n2. 嘘つきは泥棒の始まりとか言って、まあでも本当に悪い嘘をつくよりは少し招けない泥棒の方が\n3. 買い替えようとしています。終わり、終わり、こっち来い終わり。はい、おやびん、何かご用ですか？はいですかじゃないよね。\n4. あなたのこと、友達がなぁと言って。知ってるのかな。あら、良い見込みがないから、早く足を運んで敵に戻したらどうだってみんな言っていると。', role='assistant', function_call=None, tool_calls=None)

In [15]:
# Extract the text of the first choice and print it with its line breaks rendered.
first_choice = completion.choices[0]
completion_output = first_choice.message.content
print(completion_output)


    1. 練習したような小さいメモがあります。春風って人と自分を紹介して、まあ
2. 嘘つきは泥棒の始まりとか言って、まあでも本当に悪い嘘をつくよりは少し招けない泥棒の方が
3. 買い替えようとしています。終わり、終わり、こっち来い終わり。はい、おやびん、何かご用ですか？はいですかじゃないよね。
4. あなたのこと、友達がなぁと言って。知ってるのかな。あら、良い見込みがないから、早く足を運んで敵に戻したらどうだってみんな言っていると。


In [16]:
# Recover the simplified sentences from the numbered list in the model output:
# "<number>. <sentence>" -> "<sentence>"
import re

# Anchored to line starts so a "<digits>. " sequence in the middle of a sentence
# cannot start a match; whitespace after the dot is optional and surrounding
# blanks (including a stray "\r") are not captured.
NUMBERED_LINE = re.compile(r"^[ \t]*\d+\.[ \t]*(.+?)[ \t\r]*$", re.MULTILINE)

easified_sentences = NUMBERED_LINE.findall(completion_output)
print(len(easified_sentences), " sentences found from ", len(segments), " sentences")
print(easified_sentences)

# Later cells pair easified sentence i with segment i, so a count mismatch would
# silently misalign audio and timestamps; fail here with a clear message instead.
if len(easified_sentences) != len(segments):
    raise ValueError(
        f"expected {len(segments)} easified sentences, "
        f"parsed {len(easified_sentences)}; inspect completion_output"
    )

4  sentences found from  4  sentences
['練習したような小さいメモがあります。春風って人と自分を紹介して、まあ', '嘘つきは泥棒の始まりとか言って、まあでも本当に悪い嘘をつくよりは少し招けない泥棒の方が', '買い替えようとしています。終わり、終わり、こっち来い終わり。はい、おやびん、何かご用ですか？はいですかじゃないよね。', 'あなたのこと、友達がなぁと言って。知ってるのかな。あら、良い見込みがないから、早く足を運んで敵に戻したらどうだってみんな言っていると。']


## Text to speech
Objectives
1. Synchronise with timestamps ✅
2. Use same voice ✅

In [17]:
# ElevenLabs API key read from the environment (None when the variable is unset).
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")

In [18]:
# Reference recording(s) handed to the voice-cloning call, as string paths:
# here just the shortened clip.
audio_file_paths = [str(shortened_audio_file_path)]

In [19]:
from elevenlabs import clone, generate, play

# Create a cloned voice named "rakugo_v1" from the reference recording(s).
clone_request = {
    "api_key": ELEVENLABS_API_KEY,
    "name": "rakugo_v1",
    "files": audio_file_paths,
}
voice = clone(**clone_request)

In [20]:
# Synthesize one clip per simplified sentence with the cloned voice,
# preserving sentence order.
audios = [
    generate(
        api_key=ELEVENLABS_API_KEY,
        text=easified_text,
        voice=voice,
        model="eleven_multilingual_v2",
        output_format="mp3_44100_128",
    )
    for easified_text in easified_sentences
]

In [21]:
# Listen to the first generated clip as a quick check.
play(audios[0])

In [22]:
# Directory that receives the per-sentence clips and the merged result.
temp_res_file_path = data_path / "temp_results"
temp_res_file_path.mkdir(parents=True, exist_ok=True)

In [23]:
# save generated audio to mp3
for i, audio in enumerate(audios):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}.mp3"
    with open(audio_file_path, "wb") as f:
        f.write(audio)
    print(f"audio file saved to {audio_file_path}")

audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_0.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_1.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_2.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_3.mp3


In [26]:
# Speed up each generated clip so that it better fits the time slot
# (end - start) of the segment it replaces.

from pydub import AudioSegment
from pydub.effects import speedup

for i, segment in enumerate(segments):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}.mp3"
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    audio_dur = len(audio) / 1000  # len() of an AudioSegment is in milliseconds
    # Ratio of the generated clip's length to the slot of the original segment.
    speed = audio_dur / (segment['end'] - segment['start'])
    # Quantize to a multiple of 0.25 (1, 1.25, 1.5, ...); there is no upper cap.
    speed = round(speed * 4) / 4
    if speed <= 1:
        # Never slow a clip down: clips that already fit are kept unchanged.
        speed = 1
        speeded_audio = audio
    else:
        speeded_audio = speedup(audio, speed)
    print(f"speeded audio with speed {speed}")
    # export() returns the opened output file; close it so the handle is not leaked.
    speeded_audio.export(temp_res_file_path / f"rakugo_v1_{i}_speeded.mp3", format="mp3").close()
    print("audio file saved")

speeded audio with speed 1
audio file saved
speeded audio with speed 1
audio file saved
speeded audio with speed 1
audio file saved
speeded audio with speed 1.25
audio file saved


In [27]:
# Merge the speed-adjusted clips onto the timeline of the original recording:
# - everything before the first segment is filled with the original audio
# - every later clip is placed at its segment's start time, with silence
#   padding the gap left after the previous clip
if not segments:
    raise ValueError("no segments to merge")

# Loop-invariant: load the original recording once instead of per segment.
original_audio_file_path = data_path / "rakugo_v1_short.mp3"
original_audio = AudioSegment.from_file(original_audio_file_path, format="mp3")

# Length (ms) of the merged track built so far. The original reset this to the
# segment start at the top of every iteration, which made the silence duration
# always 0 and let the clips drift away from their timestamps.
current_time = 0
for i, segment in enumerate(segments):
    audio_file_path = temp_res_file_path / f"rakugo_v1_{i}_speeded.mp3"
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    start_ms = int(round(segment['start'] * 1000))
    if i == 0:
        fill = original_audio[:start_ms]
        merged_audio = fill + audio
    else:
        # If the previous clip overran its slot there is no gap to fill; the
        # clip is then simply appended (and starts late) rather than overlapped.
        silence_dur = max(0, start_ms - current_time)
        silence = AudioSegment.silent(duration=silence_dur)
        merged_audio = merged_audio + silence + audio
    current_time = len(merged_audio)

# export() returns the opened output file; close it so the handle is not leaked.
merged_audio.export(temp_res_file_path / "rakugo_v1_merged.mp3", format="mp3").close()

<_io.BufferedRandom name='/Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_merged.mp3'>