In [29]:
from elevenlabs.client import ElevenLabs
from elevenlabs import Voice, VoiceSettings, play, stream, save
from elevenlabs.client import ElevenLabs
from IPython.display import Audio as IPAudio
from dataclasses import dataclass
from typing import ClassVar
import numpy as np
from typing import Literal, Iterator
import pandas as pd
import whisper
import torch
import torchaudio
import srt
from datetime import timedelta


In [30]:
ELEVENLABS_API_KEY = "c973ad05989d6e4ce28a9acf65238957"
ELEVENLABS_VOICE = Voice(
    voice_id='lxYfHSkYm1EzQzGhdbfc',
    settings=VoiceSettings(stability=0.35, similarity_boost=0.8, style=0.0, use_speaker_boost=True)
)
ELEVENLABS_MODEL = "eleven_turbo_v2"

In [31]:
@dataclass
class Caption:
    word: str
    start: float
    end: float

In [32]:
@dataclass
class RichContent:
    content: str
    start: float | None = None
    end: float | None = None

@dataclass
class Figure(RichContent):
    pass

@dataclass
class Text:
    content: str
    audio: bytes | Iterator[bytes] | None = None
    audio_path: str | None = None
    captions: list[Caption] | None = None
    start: float | None = None
    end: float | None = None
    pass

@dataclass
class Equation(RichContent):
    pass

@dataclass
class Headline(RichContent):
    pass

In [33]:
paper = open('./paper/paper.md', 'r').read()
script = open('./script/script.txt', 'r').read()

In [54]:
script

'\\Headline: Michelangelo: Conditional 3D Shape Generation\n\\Text: Welcome back to Arxflix! Today, we’re diving into an intriguing new paper titled "Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation". This research looks at a novel approach for generating 3D shapes conditioned on 2D images or texts, which is a significant breakthrough in the field of deep learning and 3D modeling.\n\\Figure: https://ar5iv.labs.arxiv.org/html/2306.17115/assets/x1.png\n\\Text: Here’s an overview of the Michelangelo pipeline showing the two key models: the Shape-Image-Text-Aligned Variational Auto-Encoder (SITA-VAE) and the Aligned Shape Latent Diffusion Model (ASLDM). These models work together to bridge the distribution gap between 3D shapes and 2D images or texts.\n\\Headline: The Challenge\n\\Text: The main challenge addressed in this paper is the significant distribution gap between 3D shapes and their 2D or textual descriptions. Directly mapping t

In [34]:
def parse_script(script: str) -> list[RichContent | Text]:
    lines = script.split('\n')
    content = []
    for line in lines:
        if line.startswith(r'\Figure: '):
            figure_content = line.replace(r'\Figure: ', '')
            figure = Figure(content=figure_content)
            content.append(figure)
        elif line.startswith(r'\Text: '):
            text_content = line.replace(r'\Text: ', '')
            text = Text(content=text_content)
            content.append(text)
        elif line.startswith(r'\Equation: '):
            equation_content = line.replace(r'\Equation: ', '')
            equation = Equation(content=equation_content)
            content.append(equation)
        elif line.startswith(r'\Headline: '):
            headline_content = line.replace(r'\Headline: ', '')
            headline = Headline(content=headline_content)
            content.append(headline)
    return content

In [35]:
script_contents

[Headline(content='Michelangelo: Conditional 3D Shape Generation', start=0.0, end=24.372244897959185),
 Text(content='Welcome back to Arxflix! Today, we’re diving into an intriguing new paper titled "Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation". This research looks at a novel approach for generating 3D shapes conditioned on 2D images or texts, which is a significant breakthrough in the field of deep learning and 3D modeling.', audio=<generator object TextToSpeechClient.convert at 0x152503dc0>, audio_path='./audio/text_1.wav', captions=[Caption(word='Welcome', start=0.0, end=0.2), Caption(word='back', start=0.2, end=0.56), Caption(word='to', start=0.56, end=0.76), Caption(word='ARXFlicks.', start=0.76, end=1.3), Caption(word='Today', start=2.0, end=2.18), Caption(word="we're", start=2.18, end=2.44), Caption(word='diving', start=2.44, end=2.6), Caption(word='into', start=2.6, end=3.04), Caption(word='an', start=3.04, end=3.14), Ca

In [36]:
script_contents = parse_script(script)

In [37]:
elevenlabs_client = ElevenLabs(
  api_key=ELEVENLABS_API_KEY
)

In [38]:
def make_caption(result: dict) -> list[Caption]:
    captions: list[Caption] = []
    for segment in result['segments']:
        for word in segment['words']: # type: ignore
            _word = word['word'] # type: ignore
            # Remove leading space if there is one
            if _word.startswith(' '):
                _word = _word[1:]
            caption = Caption(word=_word, start=word['start'], end=word['end']) # type: ignore
            captions.append(caption)
    return captions

In [39]:
def generate_audio_and_caption(script_contents: list[RichContent | Text]) -> list[RichContent | Text]:
  for i, script_content in enumerate(script_contents):
    match script_content:
      case RichContent(content=content):
          pass
      case Text(content=content, audio=None, captions=None):
          # script_content.audio = elevenlabs_client.generate(
          #     text=content,
          #     voice=ELEVENLABS_VOICE,
          #     model=ELEVENLABS_MODEL
          # )
          audio_path = f'./audio/text_{i}.wav'
          # save(script_content.audio, audio_path)
          audio, sr = torchaudio.load(audio_path)
          model = whisper.load_model('base.en')
          option = whisper.DecodingOptions(language='en', fp16=True, without_timestamps=False, task='transcribe')
          result = model.transcribe(f'./audio/text_{i}.wav', word_timestamps=True)
          script_content.captions = make_caption(result)
          script_content.audio_path = audio_path
          total_audio_duration = audio.size(1) / sr
          script_content.end = total_audio_duration
  return script_contents

In [40]:
script_contents = generate_audio_and_caption(script_contents)



In [44]:
def add_caption_offset_and_gap(script_contents: list[RichContent | Text]) -> list[RichContent | Text]:
    offset = 0
    for i, script_content in enumerate(script_contents):
        if not(isinstance(script_content, Text)):
            continue
        if not script_content.captions:
            continue
        for caption in script_content.captions:
            caption.start += offset
            caption.end += offset
        script_content.start = offset
        if script_content.end:
            script_content.end = script_content.end + offset
        else:
            script_content.end = script_content.captions[-1].end
        offset = script_content.end
    return script_contents

In [45]:
script_contents = add_caption_offset_and_gap(script_contents)

In [46]:
def fill_rich_content_time(script_contents: list[RichContent | Text]) -> list[RichContent | Text]:
    k = 0
    while k < len(script_contents):
        current_rich_content_group = []
        while k < len(script_contents) and not isinstance(script_contents[k], Text):
            current_rich_content_group.append(script_contents[k])
            k += 1
        
        if k >= len(script_contents):
            break

        next_text_group = []
        while k < len(script_contents) and isinstance(script_contents[k], Text):
            next_text_group.append(script_contents[k])
            k += 1
        
        if not next_text_group:
            break

        total_duration = next_text_group[-1].end - next_text_group[0].start
        duration_per_rich_content = total_duration / len(current_rich_content_group)
        offset = next_text_group[0].start
        for i, rich_content in enumerate(current_rich_content_group):
            rich_content.start = offset + i * duration_per_rich_content
            rich_content.end = offset + (i + 1) * duration_per_rich_content
            # print(f"Asigning {rich_content.start} - {rich_content.end} to {rich_content}")
    return script_contents

In [47]:
script_contents = fill_rich_content_time(script_contents)

In [48]:
script_contents

[Headline(content='Michelangelo: Conditional 3D Shape Generation', start=0.0, end=24.372244897959185),
 Text(content='Welcome back to Arxflix! Today, we’re diving into an intriguing new paper titled "Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation". This research looks at a novel approach for generating 3D shapes conditioned on 2D images or texts, which is a significant breakthrough in the field of deep learning and 3D modeling.', audio=None, audio_path='./audio/text_1.wav', captions=[Caption(word='Welcome', start=0.0, end=0.2), Caption(word='back', start=0.2, end=0.56), Caption(word='to', start=0.56, end=0.76), Caption(word='ARXFlicks.', start=0.76, end=1.3), Caption(word='Today', start=2.0, end=2.18), Caption(word="we're", start=2.18, end=2.44), Caption(word='diving', start=2.44, end=2.6), Caption(word='into', start=2.6, end=3.04), Caption(word='an', start=3.04, end=3.14), Caption(word='intriguing', start=3.14, end=3.46), Caption(

In [49]:
rich_content = [c for c in script_contents if not isinstance(c, Text)]
text_content = [c for c in script_contents if isinstance(c, Text)]

In [50]:
def export_mp3(text_content: list[Text], out_path: str) -> None:
    # Merge all mp3 and add a 0.5s silence between each
    audio_all = []
    for i, text in enumerate(text_content):
        if not text.audio_path:
            continue

        path = text.audio_path
        audio, sr = torchaudio.load(path)
        audio_all.append(audio)
    audio_all_torch = torch.cat(audio_all, dim=1)
    torchaudio.save(out_path, audio_all_torch, sr)

In [51]:
export_mp3(text_content, './audio.wav')

In [52]:
def export_srt(full_audio_path: str, out_path: str) -> None:    
    model = whisper.load_model('base.en')
    option = whisper.DecodingOptions(language='en', fp16=True, without_timestamps=False, task='transcribe')
    result = model.transcribe(full_audio_path, word_timestamps=True)
    flatten_caption = make_caption(result)
    
    # flatten_caption = []
    # for text in all_text_content:
    #     if text.captions:
    #         for caption in text.captions:
    #             flatten_caption.append(caption)
        
    # flatten_caption = sorted(flatten_caption, key=lambda x: x.start)

    subs = [
        srt.Subtitle(index=i, start=timedelta(seconds=t.start), end=timedelta(seconds=t.end), content=t.word)
        for i, t in enumerate(flatten_caption)
    ]
    srt_text = srt.compose(subs)
    with open(out_path, 'w') as f:
        f.write(srt_text)

In [53]:
export_srt('./audio.wav', './output.srt')



In [24]:
def export_rich_content_json(rich_content: list[RichContent], out_path: str) -> None:
    rich_content_dict = []
    for i, content in enumerate(rich_content):
        content_dict = {
            'type': content.__class__.__name__.lower(),
            'content': content.content,
            'start': content.start,
            'end': content.end
        }
        rich_content_dict.append(content_dict)
    df = pd.DataFrame(rich_content_dict)
    df.to_json(out_path, orient='records')

In [25]:
export_rich_content_json(rich_content, './output.json')

In [33]:
import subprocess
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import Literal
import json

In [34]:

video_fps = 30
video_height = 1080
video_width = 1920

@dataclass
class CompositionProps:
    durationInSeconds: int = 5
    audioOffsetInSeconds: int = 0
    subtitlesFileName: str = "public/output.srt"
    audioFileName: str = "public/audio.wav"
    richContentFileName: str = "public/output.json"
    waveColor: str = "#a3a5ae"
    subtitlesLinePerPage: int = 2
    subtitlesLineHeight: int = 98
    subtitlesZoomMeasurerSize: int = 10
    onlyDisplayCurrentSentence: bool = True
    mirrorWave: bool = False
    waveLinesToDisplay: int = 300
    waveFreqRangeStartIndex: int = 5
    waveNumberOfSamples: Literal['32', '64', '128', '256', '512'] = '512'
    durationInFrames: int = field(init=False)
    def __post_init__(self):
        self.durationInFrames: int = self.durationInSeconds * video_fps

In [35]:
props = CompositionProps()


In [36]:
props.durationInFrames

150

In [31]:
remotion_root_path = Path("../remotion/index.ts")
composition_id = "Arflix"
props = CompositionProps()
# json.dumps(asdict(props))
concurrency = 1
output = Path("./output.mp4")
subprocess.run([
  "npx", 
  "remotion", 
  "render", 
  remotion_root_path.absolute().as_posix(),
  "--props", json.dumps(asdict(props)),
  "--compositionId", composition_id,
  "--concurrency", str(concurrency),
  "--output", output.absolute().as_posix()
])




Bundling 6%
Bundling 17%
Bundling 56%
Bundling 63%
Bundling 69%
Bundling 74%
Bundling 80%
Bundling 85%
Bundling 90%
Bundling 95%
Bundling 100%
Copying public dir 16.9 MB
Copying public dir 36.9 MB
Getting compositions
[90mComposition        Arxflix[39m
[90mCodec              h264[39m
[90mOutput             /Users/julienblanchon/Git/new_repo/arxflix/api/output.mp4[39m
[90mConcurrency        1x[39m
Rendered 0/150
Rendered 1/150, time remaining: 2m 44s
Rendered 2/150, time remaining: 1m 55s
Rendered 3/150, time remaining: 1m 38s
Rendered 4/150, time remaining: 30s
Rendered 5/150, time remaining: 25s
Rendered 6/150, time remaining: 21s
Rendered 7/150, time remaining: 19s
Rendered 8/150, time remaining: 17s
Rendered 9/150, time remaining: 15s
Rendered 10/150, time remaining: 14s
Rendered 11/150, time remaining: 13s
Rendered 12/150, time remaining: 12s
Rendered 13/150, time remaining: 12s
Rendered 14/150, time remaining: 11s
Rendered 15/150, time remaining: 11s
Rendered 16/150, time 

CompletedProcess(args=['npx', 'remotion', 'render', '/Users/julienblanchon/Git/new_repo/arxflix/api/../remotion/index.ts', '--props', '{"durationInSeconds": 5, "audioOffsetInSeconds": 0, "subtitlesFileName": "public/output.srt", "audioFileName": "public/audio.wav", "richContentFileName": "public/output.json", "waveColor": "#a3a5ae", "subtitlesLinePerPage": 2, "subtitlesLineHeight": 98, "subtitlesZoomMeasurerSize": 10, "onlyDisplayCurrentSentence": true, "mirrorWave": false, "waveLinesToDisplay": 300, "waveFreqRangeStartIndex": 5, "waveNumberOfSamples": "512", "durationInFrames": 300}', '--compositionId', 'Arflix', '--concurrency', '1', '--output', '/Users/julienblanchon/Git/new_repo/arxflix/api/output.mp4'], returncode=0)

{'durationInSeconds': 5,
 'audioOffsetInSeconds': 0,
 'subtitlesFileName': 'output.srt',
 'audioFileName': 'audio.wav',
 'richContentFileName': 'output.json',
 'waveColor': '#a3a5ae',
 'subtitlesLinePerPage': 2,
 'subtitlesLineHeight': 98,
 'subtitlesZoomMeasurerSize': 10,
 'onlyDisplayCurrentSentence': True,
 'mirrorWave': False,
 'waveLinesToDisplay': 300,
 'waveFreqRangeStartIndex': 5,
 'waveNumberOfSamples': '512'}

In [24]:
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import torch

device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
if torch.backends.mps.is_available():
    device = "mps"
if torch.xpu.is_available():
    device = "xpu"
torch_dtype = torch.float16 if device != "cpu" else torch.float32

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device, dtype=torch_dtype)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

prompt = "Hey, how are you doing today?"
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids).to(torch.float32)
audio_arr = generation.cpu().numpy().squeeze()
sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)


'{"durationInSeconds": 5, "audioOffsetInSeconds": 0, "subtitlesFileName": "output.srt", "audioFileName": "audio.wav", "richContentFileName": "output.json", "waveColor": "#a3a5ae", "subtitlesLinePerPage": 2, "subtitlesLineHeight": 98, "subtitlesZoomMeasurerSize": 10, "onlyDisplayCurrentSentence": true, "mirrorWave": false, "waveLinesToDisplay": 300, "waveFreqRangeStartIndex": 5, "waveNumberOfSamples": "512"}'