In [None]:
from pathlib import Path
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from backend.main import generate_video, generate_script, generate_paper, generate_assets, publish_post_command
from dataclasses import dataclass

# Jupyter already runs an asyncio event loop, so nested event-loop calls need
# the nest_asyncio patch to work inside a notebook.
# %pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Base directory for generated files; each paper gets its own sub-directory
# below it. The path is relative to the kernel's working directory.
root_path = Path("./pipeline")
root_path

In [None]:
# Identifier of the paper to process. It is interpolated into the ar5iv/arXiv
# HTML URLs and is also used as the name of the paper's output directory.
paper_id = "2406.04325"

In [None]:
def get_html_paper_url(paper_id: str) -> str | None:
    url = f"https://ar5iv.org/html/{paper_id}/"
    response = requests.get(url)
    if "arxiv.org/abs/" not in response.url:
        return url

    url = f"https://arxiv.org/html/{paper_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return url
    return None


In [None]:
# Resolve an HTML rendering of the paper; the following cells pass this URL on.
url = get_html_paper_url(paper_id)
if url is None:
    # Stop here with a clear message instead of handing None to later steps.
    raise ValueError(f"No HTML version found for paper {paper_id!r}")
url

In [None]:
# Per-paper working directory that receives every generated file.
paper_root = root_path / paper_id
# parents=True also creates root_path itself, so a fresh checkout without the
# base directory no longer fails with FileNotFoundError.
paper_root.mkdir(parents=True, exist_ok=True)

In [None]:
# Target file for the paper content; generate_paper receives it as an
# absolute POSIX-style path string via `store`.
paper_md_target = paper_root.joinpath("paper").with_suffix(".md")
paper_md = generate_paper(
    url,
    store=paper_md_target.absolute().as_posix(),
)

In [None]:
# Generate the script from the paper content produced by generate_paper.
script_txt_target = paper_root.joinpath("script").with_suffix(".txt")
script_txt, intro_txt = generate_script(
    paper_md,
    url=url,
    use_path=False,
    store=script_txt_target.absolute().as_posix(),
)
# NOTE(review): this unpacking requires intro_txt to split into exactly two
# parts on "\n"; any other shape raises ValueError. Confirm against
# generate_script.
title, figure_url = intro_txt.split("\n")

In [None]:
# Produce the audio, subtitle and "rich" JSON assets from the script text.
# Every output location is handed over as an absolute POSIX-style path string.
# NOTE(review): the `mp3_output` argument is given a ".wav" file name - confirm
# which format generate_assets actually writes.
total_duration = generate_assets(
    script=script_txt,
    use_path=False,
    mp3_output=paper_root.joinpath("audio").with_suffix(".wav").absolute().as_posix(),
    srt_output=paper_root.joinpath("subtitles").with_suffix(".srt").absolute().as_posix(),
    rich_output=paper_root.joinpath("rich").with_suffix(".json").absolute().as_posix(),
)

In [None]:
# Build the video, passing the paper directory as input and video.mp4 inside
# that same directory as the output (both as Path objects).
output_video = paper_root.joinpath("video").with_suffix(".mp4")
generate_video(input_dir=paper_root, output_video=output_video)

In [None]:
# Recompute the video path so this cell can be run on its own, then show it.
output_video = paper_root.joinpath("video").with_suffix(".mp4")
output_video