# Speech to Speech Flow

## Speech to Text
Objectives:
1. Transcribe text　✅
2. Preserve timestamp　✅

In [1]:
from pathlib import Path
import pytube as pt
from openai import OpenAI
import os
from dotenv import load_dotenv

dotenv_path = Path("__file__").resolve().parents[1].parents[0] / '.local.env'
load_dotenv(dotenv_path)

OpenAI.api_key = os.getenv("OPENAI_API_KEY")
data_path = Path("__file__").resolve().parents[1].parents[0] / "local_data"

In [2]:
YOUTUBE_VIDEO_URL = "https://www.youtube.com/watch?v=g_ltie2ZGNY"

In [3]:
yt = pt.YouTube(YOUTUBE_VIDEO_URL)
stream = yt.streams.filter(only_audio=True)[0]
audio_file_path = os.path.join(data_path, "rakugo_v1.mp3")
stream.download(filename=audio_file_path)

'/Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1.mp3'

In [4]:
# download video as well
video_file_path = os.path.join(data_path, "rakugo_v1.mp4")
yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').asc().first().download(filename=video_file_path)

'/Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1.mp4'

In [5]:
# take first 1 minutes of the audio
from pydub import AudioSegment
audio = AudioSegment.from_file(audio_file_path)
duration = 45 * 1000

audio = audio[:duration]
shortened_audio_file_path = os.path.join(data_path, "rakugo_v1_shortened.mp3")
audio.export(shortened_audio_file_path, format="mp3")

<_io.BufferedRandom name='/Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1_shortened.mp3'>

In [6]:
# trim video as well
from moviepy.editor import VideoFileClip
video = VideoFileClip(video_file_path)
video = video.subclip(0, int(duration/1000))
shortened_video_file_path = os.path.join(data_path, "rakugo_v1_shortened.mp4")
video.write_videofile(shortened_video_file_path)

Moviepy - Building video /Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1_shortened.mp4.
MoviePy - Writing audio in rakugo_v1_shortenedTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video /Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1_shortened.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /Users/howardtangkulung/code/personal_projects/rakugo/local_data/rakugo_v1_shortened.mp4


In [7]:
client = OpenAI()

shortened_audio_file_path = os.path.join(data_path, "rakugo_v1_shortened.mp3")
audio_file= open(shortened_audio_file_path, "rb")

transcript = client.audio.transcriptions.create(
  file=audio_file,
  model="whisper-1",
  response_format="verbose_json",
  timestamp_granularities=["segment"],
#   prompt="落語の音声が与えられます。"
)

In [8]:
print(len(transcript.segments), "segments")
print(transcript.segments[:2])

14 segments
[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.0, 'text': '【音楽】', 'tokens': [50364, 15588, 18034, 35479, 15623, 50464], 'temperature': 0.0, 'avg_logprob': -0.31362104415893555, 'compression_ratio': 1.5315788984298706, 'no_speech_prob': 0.10512132197618484}, {'id': 1, 'seek': 0, 'start': 2.0, 'end': 16.0, 'text': '【拍手】', 'tokens': [50464, 15588, 31113, 11389, 15623, 51164], 'temperature': 0.0, 'avg_logprob': -0.31362104415893555, 'compression_ratio': 1.5315788984298706, 'no_speech_prob': 0.10512132197618484}]


In [9]:
# remove alphanumeric characters with regex
import re

segments = []
forbidden_words = ["拍手", "音楽", "笑い"]

for segment in transcript.segments:
    text = re.sub(r'[a-zA-Z0-9🎶]', '', segment["text"]).strip()
    start = segment["start"]
    end = segment["end"]

    if text and (end-start) > 2.5:
        if not any(word in text for word in forbidden_words):
            segments.append({
                "text": text,
                "start": start,
                "end": end
            })

for i in range(len(segments[:5])):
    print(segments[i])

{'text': 'まあ、えー、嘘つきは泥棒の始まりなんと。', 'start': 22.0, 'end': 25.0}
{'text': 'えー、まあね、ほんとね、こう、立ちの悪い嘘つくよりは', 'start': 25.0, 'end': 29.0}
{'text': 'ちょいと招けな泥棒のほうが、買い代わりようでございます。', 'start': 29.0, 'end': 32.0}
{'text': 'さっさと足荒らして、仇に戻したらどうだって。', 'start': 41.0, 'end': 44.0}


## Text to text
Objectives
1. Easify the text ✅
2. Try to preserve the length of text ✅

In [10]:
sentences = [segment["text"] for segment in segments]

In [11]:
# join segements in the format of
# 1. segment1
# 2. segment2
# ...
sentences_list = []
# sentences = [sentence for sentence in sentences if not any(word in sentence for word in forbidden_words)]

for i, sentence in enumerate(sentences):
    sentences_list.append(f"{i+1}. {sentence}")

input_text = "\n".join(sentences_list)
print(input_text)
JLPT_LEVEL = "N4"

1. まあ、えー、嘘つきは泥棒の始まりなんと。
2. えー、まあね、ほんとね、こう、立ちの悪い嘘つくよりは
3. ちょいと招けな泥棒のほうが、買い代わりようでございます。
4. さっさと足荒らして、仇に戻したらどうだって。


In [12]:
completion = client.chat.completions.create(
  model="gpt-4",
  messages=[
    {"role": "system", "content": """
    日本の落語の一節と日本語能力試験（JLPT）のレベルが与えられます。\n
    その節を、指定されたJLPTレベルに適した語彙を使って簡単にしてください。\n
    語彙のみを変更してください。文の構造や意味は変更しないでください。\n
     """},
    {"role": "user", "content": f"""
    JLPTレベル: {JLPT_LEVEL}\n
     落語の節:\n
     {input_text}
     """}
  ]
)


In [13]:
completion.choices[0].message

ChatCompletionMessage(content='    JLPTレベル: N4\n\n     落語の節:\n\n     1. まあ、えー、嘘をつく人は泥棒の始まりだよ。\n2. えー、まあね、本当だよ、この、悪い嘘をつくよりは\n3. 少しくらいの泥棒の方が、すぐに取り替えようがあるよ。\n4. 早く走って、敵に戻したらどうだろう。', role='assistant', function_call=None, tool_calls=None)

In [14]:
completion_output = completion.choices[0].message.content
print(completion_output)

    JLPTレベル: N4

     落語の節:

     1. まあ、えー、嘘をつく人は泥棒の始まりだよ。
2. えー、まあね、本当だよ、この、悪い嘘をつくよりは
3. 少しくらいの泥棒の方が、すぐに取り替えようがあるよ。
4. 早く走って、敵に戻したらどうだろう。


In [15]:
# save the easified text using regex
# "number". "sentence" -> "sentence"
import re

easified_sentences = re.findall(r"\d+\. (.+)", completion_output)
print(len(easified_sentences), " sentences found from ", len(segments), " sentences")
print(easified_sentences)

4  sentences found from  4  sentences
['まあ、えー、嘘をつく人は泥棒の始まりだよ。', 'えー、まあね、本当だよ、この、悪い嘘をつくよりは', '少しくらいの泥棒の方が、すぐに取り替えようがあるよ。', '早く走って、敵に戻したらどうだろう。']


## Text to speech
Objectives
1. Synchronise with timestamps ✅
2. Use same voice ✅

In [16]:
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")

In [17]:
audio_file_paths = [str(shortened_audio_file_path)]

In [18]:
from elevenlabs import clone, generate, play

voice = clone(
    api_key=ELEVENLABS_API_KEY,
    name="rakugo_v1",
    files=audio_file_paths,
)

In [19]:
audios = []
for sentence in easified_sentences:
    audio = generate(
        api_key=ELEVENLABS_API_KEY,
        text=sentence,
        voice=voice,
        model="eleven_multilingual_v2",
        output_format="mp3_44100_128"
    )
    audios.append(audio)

In [20]:
# play(audios[0])

In [21]:
temp_res_file_path = os.path.join(data_path, "temp_results")
os.makedirs(temp_res_file_path, exist_ok=True)

In [22]:
# save generated audio to mp3
for i, audio in enumerate(audios):
    audio_file_path = os.path.join(temp_res_file_path, f"rakugo_v1_{i}.mp3")
    with open(audio_file_path, "wb") as f:
        f.write(audio)
    print(f"audio file saved to {audio_file_path}")

audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_0.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_1.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_2.mp3
audio file saved to /Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_3.mp3


In [23]:
# speed up audio according to segments start and end

from pydub import AudioSegment
from pydub.effects import speedup

for i, segment in enumerate(segments):
    audio_file_path = os.path.join(temp_res_file_path, f"rakugo_v1_{i}.mp3")
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    audio_dur = len(audio) / 1000
    speed = audio_dur / (segment['end'] - segment['start'])
    print("original audio duration", audio_dur, "segment duration", segment['end'] - segment['start'], "speed", speed)
    # round speed to nearest from 1, 1.25, 1.5
    speed = round(speed*4) / 4
    if speed <= 1:
        speed = 1
        speeded_audio = audio
    else:
        speeded_audio = speedup(audio, speed)
    print(f"speeded audio with speed {speed}")
    speeded_audio.export(os.path.join(temp_res_file_path, f"rakugo_v1_{i}_speeded.mp3"), format="mp3")
    print(f"audio file saved")

original audio duration 2.508 segment duration 3.0 speed 0.836
speeded audio with speed 1
audio file saved
original audio duration 3.056 segment duration 4.0 speed 0.764
speeded audio with speed 1
audio file saved
original audio duration 3.109 segment duration 3.0 speed 1.0363333333333333
speeded audio with speed 1
audio file saved
original audio duration 2.456 segment duration 3.0 speed 0.8186666666666667
speeded audio with speed 1
audio file saved


In [24]:
# merge speeded audio according to segments start and end
# use the first segment start time as the start time of the merged audio
# fill first silence with original audio
for i, segment in enumerate(segments):
    audio_file_path = os.path.join(temp_res_file_path, f"rakugo_v1_{i}_speeded.mp3")
    original_audio_file_path = os.path.join(data_path, "rakugo_v1_shortened.mp3")
    original_audio = AudioSegment.from_file(original_audio_file_path, format="mp3")
    audio = AudioSegment.from_file(audio_file_path, format="mp3")
    current_time = segment['start'] * 1000
    print("current_time", current_time, "segment start", segment['start'], "segment end", segment['end'])
    if i == 0:
        fill_dur = current_time
        fill = original_audio[:fill_dur]
        merged_audio = fill + audio
    else:
        silence_dur = segment['start'] * 1000 - current_time
        silence = AudioSegment.silent(duration=silence_dur)
        merged_audio = merged_audio + silence + audio
    current_time = len(merged_audio)

merged_audio.export(os.path.join(temp_res_file_path, f"rakugo_v1_merged.mp3"), format="mp3")

current_time 22000.0 segment start 22.0 segment end 25.0
current_time 25000.0 segment start 25.0 segment end 29.0
current_time 29000.0 segment start 29.0 segment end 32.0
current_time 41000.0 segment start 41.0 segment end 44.0


<_io.BufferedRandom name='/Users/howardtangkulung/code/personal_projects/rakugo/local_data/temp_results/rakugo_v1_merged.mp3'>

In [25]:
import moviepy.editor as mp

final_res_file_path = os.path.join(data_path, "final_results")

audio = mp.AudioFileClip(os.path.join(temp_res_file_path, "rakugo_v1_merged.mp3"))
video1 = mp.VideoFileClip(shortened_video_file_path)
final = video1.set_audio(audio)

final.write_videofile(os.path.join(final_res_file_path, "rakugo_v1_final.mp4"))

Moviepy - Building video /Users/howardtangkulung/code/personal_projects/rakugo/local_data/final_results/rakugo_v1_final.mp4.
MoviePy - Writing audio in rakugo_v1_finalTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video /Users/howardtangkulung/code/personal_projects/rakugo/local_data/final_results/rakugo_v1_final.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /Users/howardtangkulung/code/personal_projects/rakugo/local_data/final_results/rakugo_v1_final.mp4


In [26]:
# import glob

# files = glob.glob(os.path.join(temp_res_file_path, "*.mp3"))
# for f in files:
#     os.remove(f)