In [None]:
import asyncio
import collections.abc
import dataclasses
import logging
import math
import os
import tempfile
import typing
import warnings

import IPython.display
import ipyvuetify.extra
import ipywidgets
import nest_asyncio
import openai
import openai.types.audio
import openai.types.chat
import pydantic

try:
    import pydub
    import pydub.silence
except ImportError:
    import pip

    pip.main(["install", "pydub"])
    import pydub
    import pydub.silence

try:
    import pyannote.audio
except ImportError:
    import pip

    pip.main(["install", "pyannote.audio"])
    import pyannote.audio


# One utterance of the generated podcast script; used as the element type of
# ResponseFormat.dialogues.
# NOTE(review): documented with comments instead of a docstring because this
# model is passed to the API as `response_format`, and a docstring would
# presumably end up in the generated schema - confirm before adding one.
class DialogueSegment(pydantic.BaseModel):
    # Name of the host who speaks this line; TabContent._podcast_handler picks
    # the text-to-speech voice from this value.
    speaker: typing.Literal["タカシ", "タケシ"]
    # The text spoken by that host.
    content: str


# Structured output requested from the chat model when a podcast is generated:
# a title plus the ordered list of utterances that make up the script.
# (Comments are used instead of a docstring so the model definition itself is
# left untouched.)
class ResponseFormat(pydantic.BaseModel):
    # Podcast title; later used to name the exported audio file.
    title: str
    # Script lines in speaking order.
    dialogues: list[DialogueSegment]


# --- Notebook-wide runtime setup -------------------------------------------
# Hide all Python warnings so they do not clutter the widget output.
warnings.simplefilter("ignore")
# The button handlers below call run_until_complete() on the loop that is
# already running; nest_asyncio is applied to allow that re-entrant use.
nest_asyncio.apply()
# Only surface errors from the HTTP client's logger.
logging.getLogger("httpx").setLevel(logging.ERROR)

# --- Constants ---------------------------------------------------------------
# Title/key of the tab that shows the transcription of all files together.
COMBINED_TAB_NAME = "Combined"
# Finished transcriptions are stored here, one file per uploaded file name.
CACHE_DIRECTORY = os.path.expanduser("~/work/.cache/transcribe")
# OpenAI-compatible endpoint used for every API call in this notebook.
OPENAI_BASE_URL = "http://cortex-api.cortex-api.svc.cluster.local:8080/v1"
# The Kubernetes service-account token of the pod is used as the API key.
with open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8") as f:
    OPENAI_API_KEY = f.read()

# --- Input widgets -----------------------------------------------------------
# File picker for the audio/video files to transcribe.
file_upload = ipyvuetify.extra.FileInput()

# Transcription model sent with every transcription request.
model = ipywidgets.Dropdown(
    description="Model: ",
    options=["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
    value="gpt-4o-mini-transcribe",
)

# Free-text prompt entered by the user.
prompt = ipywidgets.Textarea(
    description="Prompt: ",
    value="",
)

# Speaker-diarization switch; starts disabled and is enabled/disabled by the
# observer registered on `file_upload` further down.
diarization = ipywidgets.Checkbox(
    description="Diarization",
    value=False,
    disabled=True,
)

# Number of speakers handed to the diarization pipeline; starts disabled and
# follows the state of the `diarization` checkbox.
speakers = ipywidgets.IntSlider(
    description="Speakers:",
    value=2,
    min=1,
    max=10,
    step=1,
    disabled=True,
)

# Starts the transcription of all uploaded files.
process_button = ipywidgets.Button(
    description="Process",
    icon="play",
    button_style="primary",
)

# --- Output widgets ----------------------------------------------------------
# Captures anything the handlers print or display (including tracebacks).
output = ipywidgets.Output()
# One-line status / error message.
status_label = ipywidgets.Label(value="")


async def post_ephemeral_message(
    message: str,
    duration: int = 1,
):
    """Show ``message`` in the status label and clear it after a delay.

    Args:
        message: Text to display in ``status_label``.
        duration: Seconds to keep the message visible.
    """
    status_label.value = message
    await asyncio.sleep(duration)
    # Only clear the label if it still shows *this* message; otherwise a newer
    # status or an error written while we were sleeping would be wiped.
    if status_label.value == message:
        status_label.value = ""


async def extract_audio_from_video(working_directory: str, video_path: str, output_format: str = "mp3") -> str:
    """Extract the audio of a video file into ``working_directory``.

    The file is read and exported with pydub inside the default executor so
    the event loop is not blocked while the conversion runs.

    Args:
        working_directory: Directory in which the audio file is created.
        video_path: Path of the source file, read with
            ``pydub.AudioSegment.from_file``.
        output_format: Export format handed to pydub; also used as the file
            extension of the result.

    Returns:
        Path of the exported audio file.
    """
    # Join only the file name: callers pass an absolute ``video_path`` and
    # os.path.join() throws away ``working_directory`` when a later component
    # is absolute, which wrote the output next to the source file instead.
    output_name = f"{os.path.basename(video_path)}.{output_format}"
    output_path = os.path.join(working_directory, output_name)

    def _extract() -> str:
        audio = pydub.AudioSegment.from_file(video_path)
        audio.export(output_path, format=output_format)
        return output_path

    return await asyncio.get_running_loop().run_in_executor(None, _extract)


async def split_audio_by_size(
    working_directory: str,
    audio_path: str,
    max_size_mb: int,
    output_format: str = "mp3",
) -> collections.abc.Sequence[str]:
    """Split an audio file into equally long chunks based on its file size.

    The number of chunks is the file size divided by ``max_size_mb``, rounded
    up. A file that already fits is returned unchanged.

    Args:
        working_directory: Directory in which the chunk files are created.
        audio_path: Path of the audio file to split.
        max_size_mb: Size limit in MiB used to derive the number of chunks.
        output_format: Export format handed to pydub; also used as the file
            extension of the chunks.

    Returns:
        ``[audio_path]`` when no split is needed, otherwise the chunk paths in
        playback order.
    """
    audio = pydub.AudioSegment.from_file(audio_path)

    max_size_bytes = max_size_mb * 1024 * 1024
    number_of_files = math.ceil(os.path.getsize(audio_path) / max_size_bytes)

    if number_of_files <= 1:
        return [audio_path]

    total_ms = len(audio)
    length = total_ms // number_of_files
    # Use the bare file name: ``audio_path`` may be absolute, in which case
    # os.path.join() would silently ignore ``working_directory``.
    base_name = os.path.basename(audio_path)
    loop = asyncio.get_running_loop()

    futures = []
    for i in range(number_of_files):
        start = i * length
        # The floor division above can leave a few milliseconds over; the last
        # chunk runs to the real end so that no audio is dropped.
        end = total_ms if i == number_of_files - 1 else start + length

        chunk = audio[start:end]
        chunk_path = os.path.join(working_directory, f"{base_name}_{i:03d}.{output_format}")

        # Defaults bind the current chunk/path to this closure.
        def _export(c=chunk, cp=chunk_path) -> str:
            c.export(cp, format=output_format)
            return cp

        futures.append(loop.run_in_executor(None, _export))

    return await asyncio.gather(*futures)


async def split_audio_by_silence(
    working_directory: str,
    audio_path: str,
    max_size_mb: int,
    output_format: str = "mp3",
) -> collections.abc.Sequence[str]:
    """Split an audio file into chunks, cutting in the middle of silences.

    The maximum chunk duration is estimated from the file's size-to-duration
    ratio so that one chunk corresponds to roughly ``max_size_mb``. Chunks are
    built greedily: each one ends at the last silence midpoint that still fits
    into that duration, or is cut hard at the limit when no silence fits.

    Args:
        working_directory: Directory in which the chunk files are created.
        audio_path: Path of the audio file to split.
        max_size_mb: Target size limit in MiB for one chunk.
        output_format: Export format handed to pydub; also used as the file
            extension of the chunks.

    Returns:
        Paths of the exported chunks in playback order (empty when the audio
        has no duration).
    """
    import bisect

    audio = pydub.AudioSegment.from_file(audio_path)

    file_size = os.path.getsize(audio_path)
    max_size_bytes = max_size_mb * 1024 * 1024

    total_ms = len(audio)
    # At least 1 ms, so that every iteration below makes progress.
    target_ms = max(1, int(max_size_bytes * total_ms / file_size))

    silences = pydub.silence.detect_silence(audio)
    # Candidate cut points: the midpoint of every detected silence.
    cuts = sorted({(s + e) // 2 for s, e in silences})
    boundaries = [0] + cuts + [total_ms]

    chunks = []
    start_ms = 0
    while start_ms < total_ms:
        limit_ms = start_ms + target_ms
        # Right-most boundary that does not exceed the limit (index is always
        # valid because boundaries[0] == 0 <= limit_ms).
        nearest = boundaries[bisect.bisect_right(boundaries, limit_ms) - 1]
        if nearest > start_ms:
            end_ms = nearest
        else:
            # No silence within reach: cut hard at the duration limit.
            end_ms = min(limit_ms, total_ms)

        chunks.append((start_ms, end_ms))
        start_ms = end_ms

    # Use the bare file name: ``audio_path`` may be absolute, in which case
    # os.path.join() would silently ignore ``working_directory``.
    base_name = os.path.basename(audio_path)
    loop = asyncio.get_running_loop()

    futures = []
    for i, (chunk_start, chunk_end) in enumerate(chunks):
        chunk = audio[chunk_start:chunk_end]
        chunk_path = os.path.join(working_directory, f"{base_name}_{i:03d}.{output_format}")

        # Defaults bind the current chunk/path to this closure.
        def _export(c=chunk, cp=chunk_path) -> str:
            c.export(cp, format=output_format)
            return cp

        futures.append(loop.run_in_executor(None, _export))

    return await asyncio.gather(*futures)


@dataclasses.dataclass
class DiarizationChunk:
    """One speaker turn produced by perform_diarization()."""

    # Start of the turn, in seconds from the beginning of the audio.
    start: float
    # End of the turn, in seconds from the beginning of the audio.
    end: float
    # Speaker label reported by the diarization pipeline.
    label: str
    # Path of the exported audio file containing only this turn.
    audio_path: str


async def perform_diarization(
    working_directory: str,
    audio_path: str,
    num_speakers: int,
    output_format: str = "mp3",
) -> collections.abc.Sequence[DiarizationChunk]:
    """Run speaker diarization and export one audio file per speaker turn.

    Args:
        working_directory: Directory in which the per-turn files are created.
        audio_path: Path of the audio file to analyse.
        num_speakers: Number of speakers passed to the pipeline.
        output_format: Export format handed to pydub; also used as the file
            extension of the per-turn files.

    Returns:
        One ``DiarizationChunk`` per track yielded by the pipeline, in the
        order the pipeline yields them.
    """
    pipeline = pyannote.audio.Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
    )

    # NOTE(review): this call runs on the event loop thread, so it blocks the
    # loop for as long as the pipeline takes.
    annotation = pipeline(audio_path, num_speakers=num_speakers)

    audio = pydub.AudioSegment.from_file(audio_path)
    # Use the bare file name: ``audio_path`` may be absolute, in which case
    # os.path.join() would silently ignore ``working_directory``.
    base_name = os.path.basename(audio_path)
    loop = asyncio.get_running_loop()

    futures = []
    for segment, _, speaker in annotation.itertracks(yield_label=True):
        # The pipeline reports seconds; pydub slices in milliseconds.
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)

        chunk = audio[start_ms:end_ms]
        chunk_name = f"{base_name}_{start_ms}_{end_ms}_{speaker}.{output_format}"
        chunk_path = os.path.join(working_directory, chunk_name)

        # Defaults bind the current loop values to this closure.
        def _export(c=chunk, cp=chunk_path, s=segment, label=speaker) -> DiarizationChunk:
            c.export(cp, format=output_format)
            return DiarizationChunk(
                start=s.start,
                end=s.end,
                label=label,
                audio_path=cp,
            )

        futures.append(loop.run_in_executor(None, _export))

    return await asyncio.gather(*futures)


async def transcribe_file(
    file_path: str,
) -> str:
    """Transcribe one audio/video file, using the on-disk cache when possible.

    ``.mp4`` input is first converted to mp3. With diarization enabled every
    speaker turn is transcribed separately and prefixed with the speaker label
    and time range; otherwise the audio is split at silences into chunks of
    about 5 MB and the chunk transcriptions are joined with newlines.

    The result is cached under ``CACHE_DIRECTORY`` (sub-directory
    ``diarization`` when diarization is on) using the file's base name.
    NOTE(review): the cache key does not include the selected model or the
    prompt, so changing either does not invalidate an existing entry.

    Args:
        file_path: Path of the file to transcribe.

    Returns:
        The transcription text.
    """
    file_path = os.path.abspath(file_path)
    extension = os.path.splitext(file_path)[1].lower() or ".mp3"

    # Read the widget state once: it can change while we are awaiting, and the
    # cache location and the processing mode must agree with each other.
    use_diarization = diarization.value
    model_name = model.value
    prompt_text = prompt.value

    cache_directory = CACHE_DIRECTORY
    if use_diarization:
        cache_directory = os.path.join(cache_directory, "diarization")
    cache_path = os.path.join(cache_directory, os.path.basename(file_path))

    # Check the cache before any temporary directory or client is created.
    if os.path.exists(cache_path):
        with open(cache_path, encoding="utf-8") as cached:
            return cached.read()
    os.makedirs(cache_directory, exist_ok=True)

    # One client is shared by all chunk requests of this file.
    client = openai.AsyncOpenAI(
        base_url=OPENAI_BASE_URL,
        api_key=OPENAI_API_KEY,
    )
    # Forward the text of the "Prompt" widget only when the user entered one.
    extra_arguments = {"prompt": prompt_text} if prompt_text else {}

    async def atranscribe_by_openai(audio_path: str) -> openai.types.audio.Transcription:
        with open(audio_path, "rb") as audio_file:
            return await client.audio.transcriptions.create(
                model=model_name,
                file=audio_file,
                **extra_arguments,
            )

    with tempfile.TemporaryDirectory() as t:
        if extension == ".mp4":
            output_format = "mp3"
            file_path = await extract_audio_from_video(t, file_path, output_format=output_format)
        else:
            output_format = extension.lstrip(".")

        if use_diarization:
            speaker_chunks = await perform_diarization(t, file_path, speakers.value)
            transcriptions = await asyncio.gather(
                *(atranscribe_by_openai(chunk.audio_path) for chunk in speaker_chunks)
            )
            result = "".join(
                f"{chunk.label} ({chunk.start:.2f}s - {chunk.end:.2f}s):\n{transcription.text}\n\n"
                for chunk, transcription in zip(speaker_chunks, transcriptions)
            )
        else:
            chunks = await split_audio_by_silence(t, file_path, 5, output_format=output_format)
            transcriptions = await asyncio.gather(
                *(atranscribe_by_openai(chunk_path) for chunk_path in chunks)
            )
            result = "\n".join(transcription.text for transcription in transcriptions)

    # Explicit UTF-8 so non-ASCII transcriptions can be written regardless of
    # the locale's default encoding.
    with open(cache_path, "w", encoding="utf-8") as f:
        f.write(result)

    return result


async def transcribe_files() -> None:
    """Transcribe every uploaded file and show the results in the tabs.

    The uploads are written to a temporary directory and transcribed
    concurrently. Afterwards the tab bar is rebuilt: the combined tab receives
    all transcriptions (each under a ``# <file name>`` heading when there is
    more than one file) and one additional tab is appended per file.

    The process button is disabled while this runs. Any exception is shown in
    the status label and re-raised.
    """
    process_button.disabled = True

    try:
        with tempfile.TemporaryDirectory() as t:
            file_names = []
            tasks = []

            # Materialise each upload on disk and queue its transcription.
            for uploaded in file_upload.get_files():
                name = uploaded["name"]
                file_path = os.path.join(t, name)
                file_names.append(name)
                with open(file_path, "wb") as f:
                    f.write(uploaded["file_obj"].read())
                tasks.append(transcribe_file(file_path))

            if not tasks:
                status_label.value = "No such file"
                return

            transcriptions = await asyncio.gather(*tasks)

            initialize_tabs()
            combined_tab = tab_contents[COMBINED_TAB_NAME]

            if len(transcriptions) > 1:
                combined_text = [
                    f"# {name}\n{transcription}\n"
                    for name, transcription in zip(file_names, transcriptions)
                ]
                combined_tab.result.value = "\n".join(combined_text)
            else:
                combined_tab.result.value = "".join(transcriptions)

            # Index 0 is the combined tab, so the per-file tabs start at 1.
            for position, (name, transcription) in enumerate(zip(file_names, transcriptions), start=1):
                tab_contents[name] = TabContent(name)
                tab_contents[name].result.value = transcription
                tabs.children += (tab_contents[name].widgets,)
                tabs.set_title(position, name)
    except Exception as e:
        status_label.value = f"Error: {str(e)}"
        raise e
    finally:
        process_button.disabled = False


@output.capture()
def process(b: ipywidgets.widgets.Button) -> None:
    """Click handler of the process button: run transcribe_files() to the end.

    The status label and the captured output are reset first. The coroutine is
    then driven to completion on the loop that is already running.

    Args:
        b: The button that was clicked (not used).
    """
    output.clear_output()
    status_label.value = ""

    loop = asyncio.get_running_loop()
    task = asyncio.create_task(transcribe_files())
    loop.run_until_complete(task)


# Run `process` whenever the process button is clicked.
process_button.on_click(process)


class TabContent:
    """Widgets and actions of one result tab.

    A tab consists of a text area holding a transcription, a copy-to-clipboard
    button, a "Summary" button that replaces the text with a model-written
    summary, and a "Podcast" button that turns the text into a two-host audio
    podcast. The three buttons are enabled only while the text area is
    non-empty.

    Attributes:
        name: Name given to the tab.
        result: Text area holding the transcription (or its summary).
        widgets: Container with all widgets of the tab.
    """

    def __init__(self, name: str):
        """Create the widgets of the tab and wire up their handlers.

        Args:
            name: Name of the tab.
        """
        self.name = name
        self.result = ipywidgets.widgets.Textarea(
            value="",
            description="Result: ",
            layout=ipywidgets.Layout(width="80%", height="300px"),
        )

        self._clipboard_button = ipywidgets.widgets.Button(
            description="",
            icon="clipboard",
            tooltip="Copy to clipboard",
            disabled=True,
            layout=ipywidgets.Layout(width="40px", height="40px"),
        )
        self._summary_button = ipywidgets.widgets.Button(
            description="Summary",
            icon="pen",
            disabled=True,
        )
        self._podcast_button = ipywidgets.widgets.Button(
            description="Podcast",
            icon="headphones",
            disabled=True,
        )

        self._clipboard_button.on_click(self.clipboard_handler)
        self._summary_button.on_click(self.summary_handler)
        self._podcast_button.on_click(self.podcast_handler)

        self.widgets = ipywidgets.widgets.VBox([
            ipywidgets.widgets.HBox([
                self.result,
                self._clipboard_button,
            ]),
            ipywidgets.widgets.HBox([
                self._summary_button,
                self._podcast_button,
            ]),
        ])

        self.result.observe(self.update_button_availability, names="value")

    def update_button_availability(self, change) -> None:
        """Enable the action buttons only while the text area has content.

        Args:
            change: Change notification for ``result.value``; only ``new`` is
                read.
        """
        is_empty = not change.new

        self._clipboard_button.disabled = is_empty
        self._summary_button.disabled = is_empty
        self._podcast_button.disabled = is_empty

    @output.capture()
    def clipboard_handler(self, b: ipywidgets.widgets.Button) -> None:
        """Copy the text area's content to the browser clipboard.

        Args:
            b: The button that was clicked (not used).
        """
        import json

        # Encode the text as a quoted JSON string instead of pasting it raw
        # into a template literal: backticks, "${...}" or backslashes in the
        # transcription would otherwise break the script or run as code.
        # "</" is additionally escaped in case the script is embedded in HTML.
        text_literal = json.dumps(self.result.value).replace("</", "<\\/")
        IPython.display.display(IPython.display.Javascript(f"navigator.clipboard.writeText({text_literal});"))
        asyncio.create_task(post_ephemeral_message("Copied to clipboard"))

    @output.capture()
    def summary_handler(self, b: ipywidgets.widgets.Button) -> None:
        """Replace the text area's content with a summary written by the model.

        The request is synchronous. The clicked button is disabled while it
        runs and re-enabled afterwards, also when the request fails.

        Args:
            b: The button that was clicked.
        """
        b.disabled = True
        try:
            client = openai.OpenAI(
                base_url=OPENAI_BASE_URL,
                api_key=OPENAI_API_KEY,
            )

            response = client.chat.completions.create(
                model="gpt-5.2",
                messages=[
                    openai.types.chat.ChatCompletionSystemMessageParam(
                        role="system",
                        content="You are an AI assistant that summarizes the transcript of an audio file."
                    ),
                    openai.types.chat.ChatCompletionUserMessageParam(
                        role="user",
                        content=self.result.value,
                    )
                ],
            )

            content = response.choices[0].message.content
            # Keep the transcription when the model returned no text.
            if content:
                self.result.value = content
        finally:
            # Without this, a failed request left the button disabled for good.
            b.disabled = False

    async def _podcast_handler(self):
        """Generate a two-host podcast from the text area and display it.

        Steps: ask the chat model for a structured script (``ResponseFormat``),
        synthesize every line concurrently with a voice that depends on the
        speaker, join the segments with 300 ms of silence, export the result
        as an mp3 file under ``/tmp`` and display an audio player for it.
        """
        import re

        async def generate_speech_segment(client, speaker, text, segment_file):
            """Synthesize one script line into ``segment_file`` and return its path."""
            voice = "onyx" if speaker == "タカシ" else "echo"
            instructions = """
落ち着いた男性の声。分析的で理知的なトーン。明瞭で説得力のある話し方。
専門知識を持つ解説者のような声質。少しゆっくりめのテンポ。
""" if speaker == "タカシ" else """
明るく活発な男性の声。表現力豊かで感情を込めた話し方。
ユーモアのある軽快なトーンと抑揚。親しみやすく、エネルギッシュな声質。
テンポ良く会話するスタイル。
"""

            response = await client.audio.speech.create(
                model="gpt-4o-mini-tts",
                voice=voice,
                input=text,
                instructions=instructions,
                response_format="mp3",
            )

            with open(segment_file, "wb") as f:
                f.write(response.content)

            return segment_file

        client = openai.AsyncOpenAI(
            base_url=OPENAI_BASE_URL,
            api_key=OPENAI_API_KEY,
        )

        response = await client.beta.chat.completions.parse(
            model="gpt-5.2",
            response_format=ResponseFormat,
            messages=[
                openai.types.chat.ChatCompletionSystemMessageParam(
                    role="system",
                    content="""
2人のパーソナリティ(タカシとタケシ)が登場するポッドキャストのスクリプトを日本語で作成してください。
与えられた内容を面白く、魅力的に紹介する必要があります。

タカシ: 冷静で分析的なタイプで、事実とデータを重視します。専門的な解説が得意です。
タケシ: 明るく活発でユーモアのあるタイプです。感情表現が豊かで、聴衆を楽しませるのが得意です。

各ホストの発言は明確に示してください。
イントロ、メインコンテンツの紹介、そして締めくくりという構成にしてください。
最後に、生成されたスクリプトをもとに興味を引くポッドキャストのタイトルも考えてください。
""",
                ),
                openai.types.chat.ChatCompletionUserMessageParam(
                    role="user",
                    content=self.result.value,
                )
            ],
        )

        content = response.choices[0].message.content
        response_format = ResponseFormat.model_validate_json(content)

        with tempfile.TemporaryDirectory() as t:
            # Synthesize all lines concurrently; gather() keeps script order.
            tasks = [
                generate_speech_segment(
                    client,
                    segment.speaker,
                    segment.content,
                    os.path.join(t, f"segment_{i}.mp3"),
                )
                for i, segment in enumerate(response_format.dialogues)
            ]
            segment_files = await asyncio.gather(*tasks)

            audio_segments = [pydub.AudioSegment.from_file(segment_file) for segment_file in segment_files]
            silence = pydub.AudioSegment.silent(duration=300)
            if audio_segments:
                combined_audio = pydub.AudioSegment.empty()
                for segment in audio_segments:
                    # Short pause between lines, but not before the first one.
                    if len(combined_audio) > 0:
                        combined_audio += silence
                    combined_audio += segment

                # The title is model output: replace path separators and
                # control characters and cap the length so it is always a
                # plain, valid file name below /tmp.
                safe_title = re.sub(r"[\\/\x00-\x1f]", "_", response_format.title).strip(" .")[:80] or "podcast"
                combined_file = f"/tmp/{safe_title}.mp3"
                combined_audio.export(combined_file, format="mp3")
                IPython.display.display(IPython.display.Audio(combined_file))

    @output.capture()
    def podcast_handler(self, b: ipywidgets.widgets.Button) -> None:
        """Click handler of the podcast button: run _podcast_handler() to the end.

        The clicked button is disabled while the podcast is generated and
        re-enabled afterwards, also when the generation fails.

        Args:
            b: The button that was clicked.
        """
        b.disabled = True
        try:
            task = asyncio.create_task(self._podcast_handler())
            asyncio.get_running_loop().run_until_complete(task)
        finally:
            # Without this, a failed generation left the button disabled.
            b.disabled = False


# Tab name -> TabContent for every tab currently shown in `tabs`.
tab_contents = {}
tabs = ipywidgets.widgets.Tab()


def initialize_tabs():
    """Reset the tab bar so that it contains only a fresh, empty combined tab."""
    # Forget the tabs of the previous run; otherwise their TabContent objects
    # (widgets and observers) stay referenced for the lifetime of the kernel.
    tab_contents.clear()

    combined = TabContent(COMBINED_TAB_NAME)
    tab_contents[COMBINED_TAB_NAME] = combined
    tabs.children = [combined.widgets]
    tabs.set_title(0, COMBINED_TAB_NAME)


initialize_tabs()

# Diarization checkbox with the speaker-count slider placed underneath it.
diarization_options = ipywidgets.widgets.VBox(
    children=[
        diarization,
        ipywidgets.widgets.HBox(children=[speakers]),
    ],
)


def update_speakers_availability(change) -> None:
    """Enable the speaker slider only while the diarization box is checked.

    Args:
        change: Change notification for ``diarization.value``; only ``new`` is
            read.
    """
    diarization_checked = change.new
    speakers.disabled = not diarization_checked


# Keep the slider in sync with the checkbox.
diarization.observe(update_speakers_availability, names="value")


def update_diarization_availability(change) -> None:
    """Allow diarization only when every selected file is ``.wav`` or ``.mp3``.

    Args:
        change: Change notification for ``file_upload.file_info``; ``new`` is
            iterated and the ``"name"`` entry of each item is inspected.
    """
    supported_extensions = (".wav", ".mp3")

    enabled = all(
        os.path.splitext(uploaded["name"])[1].lower() in supported_extensions
        for uploaded in change.new
    )

    if not enabled:
        # A disabled checkbox keeps its value, and transcribe_file() only looks
        # at the value - so uncheck it, otherwise diarization would still run
        # for files it has just been disabled for.
        diarization.value = False
    diarization.disabled = not enabled


file_upload.observe(update_diarization_availability, names="file_info")

# Render the user interface: inputs first, then the process button, the
# captured output, the status line and finally the result tabs.
IPython.display.display(
    *(
        file_upload,
        prompt,
        model,
        diarization_options,
        process_button,
        output,
        status_label,
        tabs,
    )
)