In [None]:
import json
import re
import pathlib
import openai
import os
# Working directory shared by every cell below: audio files are downloaded
# into it, transcripts are written next to the audio, and summaries go into
# its "summaries" subfolder. The literal looks like a placeholder — replace it
# with a real directory before running.
path = pathlib.Path("path_to_good_stuff")

In [None]:
from yt_dlp import YoutubeDL

# Sources to fetch; replace the placeholder with real URLs.
URLS = ['channel, video, or playlist urls']

# Downloader options:
#   format 140 — presumably YouTube's m4a audio-only stream, since the
#                transcription step later globs for "*.m4a" (TODO confirm).
#   paths.home — save everything into the notebook's working directory.
ydl_options = {
    'format': '140',
    "paths": {"home": path.as_posix()},
}

with YoutubeDL(params=ydl_options) as ydl:
    ydl.download(URLS)

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Speech-recognition checkpoint used for all transcriptions in this notebook.
model_id = "distil-whisper/distil-large-v2"

# Run on the first GPU in half precision when CUDA is available; otherwise
# fall back to the CPU in float32 so the cell still runs (slowly) instead of
# failing on the hard-coded "cuda:0" device.
cuda_available = torch.cuda.is_available()
torch_dtype = torch.float16 if cuda_available else torch.float32
device = "cuda:0" if cuda_available else "cpu"

# Load the weights, convert the model with to_bettertransformer(), then move
# it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype,
    low_cpu_mem_usage=True, use_safetensors=True,
    use_flash_attention_2=False
    ).to_bettertransformer()
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor that the
# pipeline needs alongside the model.
processor = AutoProcessor.from_pretrained(model_id)

# `pipe` is the callable the transcription loop uses: audio file path in,
# dict with a "text" entry out.
pipe = pipeline("automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,  # the comma here was missing (SyntaxError)
                device=device)

# Transcribe every downloaded .m4a file that does not have a transcript yet.
# The transcript is stored next to the audio with the same stem and a ".txt"
# suffix, so re-running the cell only processes new files.
for fname in path.glob("*.m4a"):
    transcript_file = fname.with_suffix(".txt")
    if not transcript_file.exists():
        print(fname)

        # Timestamps are requested from the pipeline, but only the plain
        # "text" field of the result is written out below.
        outputs = pipe(
            fname.as_posix(),
            chunk_length_s=15,
            batch_size=32,
            return_timestamps=True,
        )
        text = outputs["text"]

        with open(transcript_file, "w") as f:
            f.write(text)
    

In [None]:
openai.api_key = os.getenv("OPENAI_API_KEY")

# Upper bound on how many new summaries one run of this cell produces, which
# keeps the number of API calls per run small.
MAX_SUMMARIES_PER_RUN = 5

# Summaries are written to <path>/summaries/<transcript name>. Create the
# folder up front: otherwise the first write fails with FileNotFoundError
# *after* the API call for that transcript has already been made.
summary_dir = path / "summaries"
summary_dir.mkdir(parents=True, exist_ok=True)

summaries_written = 0
for fname in path.glob("*.txt"):
    summary_path = summary_dir / fname.name
    # Skip transcripts that were summarised by an earlier run.
    if summary_path.exists():
        continue
    if summaries_written >= MAX_SUMMARIES_PER_RUN:
        break
    print(fname)
    transcript = fname.read_text().strip()
    # Drop any "[...]" segments from the file stem to get a clean title
    # (presumably the bracketed video id that the downloader appends to file
    # names — TODO confirm against the actual file names).
    title = re.sub(r'\[.*?\]', '', fname.stem).strip()

    system_msg = f"""I need you to take notes on the following transcript. To give you context, the title is: {title}. Write any key points, actionable tips, insights, advice, or important information that you think is relevant. You can write as much as you want, but make sure that you cover everything that was discussed."""
    prompt = f"""TRANSCRIPT:\n\n{transcript}"""

    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 openai SDK
    # interface — confirm the installed openai version still provides it.
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        temperature=0.0,
        max_tokens=1000,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": prompt},
        ],
    )

    summary = completion['choices'][0].message['content']

    with open(summary_path, "w") as f:
        f.write(summary)
    summaries_written += 1