# 0. Initialization

In [None]:
# Folder that is scanned for audio files; transcriptions.json is written here too.
DATA_FOLDER = "data"
# Only files whose names end with this suffix are transcribed.
AUDIO_EXTENSION = ".mp3"
# Hugging Face model id of the speech-to-text model loaded in section 1.
STT_MODEL_ID = "openai/whisper-large-v3-turbo" # 7GB VRAM required

## pip

In [None]:
pip install -qU pip

## HuggingFace

In [None]:
pip install -qU pip transformers datasets[audio] accelerate huggingface_hub[hf_xet]

## PyTorch

In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

## Others

In [None]:
pip install jupyter ipywidgets langchain azure-ai-inference python-dotenv tiktoken

# 1. Extract text from audio

In [None]:
import os
import json
from tqdm import tqdm

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Run on the first CUDA device in half precision when a GPU is available,
# otherwise fall back to the CPU in single precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    STT_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(STT_MODEL_ID)

# Speech-recognition pipeline: audio is processed in 30-second chunks,
# 16 chunks per batch.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30,
    batch_size=16,
)

# Select the audio files up front so the progress bar counts only files that
# are actually transcribed (DATA_FOLDER also holds transcriptions.json after a
# previous run), and sort them because os.listdir() returns entries in
# arbitrary order, which would make the output order vary between runs.
audio_files = sorted(
    name for name in os.listdir(DATA_FOLDER) if name.endswith(AUDIO_EXTENSION)
)

# One {"filename", "text"} record per audio file.
transcriptions = []

for filename in tqdm(audio_files, "Extracting text from audio files"):
    file_path = os.path.join(DATA_FOLDER, filename)
    result = pipe(file_path)
    transcriptions.append({
        "filename": filename,
        "text": result["text"],  # type: ignore
    })

# Persist the results so the summarization section can be run on its own later.
with open(os.path.join(DATA_FOLDER, "transcriptions.json"), "w", encoding="utf-8") as f:
    json.dump(transcriptions, f, ensure_ascii=False, indent=2)

# 2. Summarize text

## Load transcriptions

In [None]:
import os
import json

import tiktoken

# Tokenizer used only to report transcript sizes in tokens.
# NOTE(review): this is the "gpt-4" encoding, while Option 1 below calls
# "openai/gpt-4.1"; treat the counts as an estimate and confirm which
# tokenizer that model uses.
encoding = tiktoken.encoding_for_model("gpt-4")

# Same values as in section 0, repeated here — presumably so that section 2
# can be run in a fresh kernel without re-running the earlier cells.
DATA_FOLDER = "data"
AUDIO_EXTENSION = ".mp3"
STT_MODEL_ID = "openai/whisper-large-v3-turbo" # 7GB VRAM required
SUMMARIZATION_MODEL_ID = "facebook/bart-large-cnn"

with open(os.path.join(DATA_FOLDER, "transcriptions.json"), "r", encoding="utf-8") as f:
    transcriptions = json.load(f)

# Report after the file has been closed; nothing below needs the handle.
print(f"Loaded {len(transcriptions)} transcriptions:")
for transcription in transcriptions:
    # disallowed_special=() makes text that happens to look like a special
    # token (e.g. "<|endoftext|>") count as ordinary text instead of raising.
    token_count = len(encoding.encode(transcription["text"], disallowed_special=()))
    print(f"\t{transcription['filename']} with length {token_count} tokens")


## Option 1: Foundation model summarization
> GPT-4.1 via GitHub Models (limit 8k input context)

In [None]:
import os
from dotenv import load_dotenv

from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential
from langchain.text_splitter import CharacterTextSplitter


# Load variables from a local .env file BEFORE reading the token; otherwise a
# GITHUB_TOKEN that is defined only in .env is not yet in os.environ and the
# lookup below raises KeyError.
load_dotenv()

endpoint = "https://models.github.ai/inference"
model = "openai/gpt-4.1"
token = os.environ["GITHUB_TOKEN"]

client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(token),
)

# Split long transcripts on spaces into overlapping pieces of up to 30000
# characters so each request carries a bounded amount of text.
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=30000,
    chunk_overlap=500
)

# The instructions are identical for every chunk, so build them once.
system_prompt = (
    "Your task is to summarize the text provided in the next message. "
    "Focus on practical information, key points, and actionable insights. "
    "Ignore any personal opinions, anecdotes, or irrelevant details. "
    "Look also for information about sunglasses that allow some UV light to pass through. "
)

for transcription in transcriptions:

    chunk_summaries = []

    for chunk in splitter.split_text(transcription["text"]):

        response = client.complete(
            messages=[
                SystemMessage(system_prompt),
                UserMessage(chunk),
            ],
            temperature=1.0,
            top_p=1.0,
            model=model
        )

        # The message content can be None; treat that as an empty summary
        # instead of failing on string concatenation.
        chunk_summaries.append(response.choices[0].message.content or "")

    # Each chunk summary is terminated by a newline, in transcript order.
    transcription["summary"] = "".join(f"{summary}\n" for summary in chunk_summaries)


In [None]:
import textwrap

# textwrap.fill() turns newlines into spaces, which would merge the per-chunk
# summaries (joined with "\n" above) into a single paragraph. Wrap each line
# separately so the line breaks in the summary survive.
for paragraph in transcriptions[0]["summary"].splitlines():
    print(textwrap.fill(paragraph, width=80))

## Option 2: Local summarization model
> BART
- There is no way to specify what the summary should look like
- Very limited context window (2K or so)
- Combined, these two limitations produce very long summaries

In [None]:
from tqdm import tqdm

from transformers import pipeline
from langchain.text_splitter import CharacterTextSplitter


# Reuse the model id declared in the "Load transcriptions" cell instead of
# repeating the literal here.
model = SUMMARIZATION_MODEL_ID # 3 GB VRAM required for batch size 1

summarizer = pipeline("summarization", model=model)

# The splitter configuration is the same for every transcript, so create it
# once instead of on every loop iteration. Chunks are up to 2000 characters,
# split on spaces, with 100 characters of overlap.
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=2000,
    chunk_overlap=100
)

for transcription in tqdm(transcriptions, "Summarizing transcriptions"):

    text = transcription["text"]
    chunks = splitter.split_text(text)

    # One result dict per chunk; the raw results are kept next to the joined text.
    transcription["summaries"] = summarizer(
        chunks,
        batch_size=16,
        do_sample=False,
    )

    transcription["summary"] = " ".join(
        summary["summary_text"] for summary in transcription["summaries"]
    )
    print(f"Summary length: {len(transcription['summary'])} from original {len(text)} characters")

In [None]:
import textwrap

# Show the first transcription's summary, wrapped at 80 columns.
print("\n".join(textwrap.wrap(str(transcriptions[0]["summary"]), width=80)))