# Summary from an Audio file

In [None]:
# Install the libraries used by this notebook (quiet mode); httpx is pinned to 0.27.2.
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Download denver_extract.mp3

https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing


In [None]:
# Constants

# Model name passed to the OpenAI transcription endpoint.
AUDIO_MODEL = "whisper-1"
# Hugging Face Hub id of the chat model that writes the minutes.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# New capability - connect this Colab to my Google Drive (mounted at /content/drive)
# The download link for denver_extract.mp3 is listed earlier in this notebook

drive.mount("/content/drive")
# NOTE(review): this path is under /content, not under the /content/drive mount,
# so the file presumably has to be uploaded or copied there first - confirm.
audio_filename = "/content/denver_extract.mp3"

In [None]:
# Sign in to HuggingFace Hub

# Read the HF_TOKEN secret from Colab's secret store and log in with it.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# Sign in to OpenAI using Secrets in Colab

# Read the OPENAI_API_KEY secret and build the client used for transcription.
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

In [None]:
# Use the Whisper OpenAI model to convert the Audio to Text
# If you'd prefer to use an Open Source model, see the "Alternative implementation" section later in this notebook

# Open the audio inside a context manager so the file handle is closed as soon
# as the request finishes, even if the API call raises.
with open(audio_filename, "rb") as audio_file:
    transcription = openai.audio.transcriptions.create(
        model=AUDIO_MODEL,
        file=audio_file,
        response_format="text",
    )
print(transcription)

In [None]:
# System prompt: sets the assistant's role and asks for Markdown output in PT-BR.
system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown and in results in PT-BR please."
# User prompt: the instructions followed by the full transcription text.
user_prompt = f"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription}"

# Chat-format message list consumed later by tokenizer.apply_chat_template.
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
  ]


In [None]:
# Quantization settings for loading the LLM: 4-bit NF4 weights with double
# quantization, computing in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load the tokenizer for the chat model and reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token
# add_generation_prompt=True appends the assistant header to the prompt so the
# model starts its own reply instead of continuing the user's message.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
# Stream tokens to the cell output as they are generated.
streamer = TextStreamer(tokenizer)
# Load the model with the 4-bit quantization config defined earlier.
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

In [None]:
# generate() returns the prompt tokens followed by the completion. Decode only
# the completion, without special tokens, so the system prompt and transcript
# are not rendered as part of the result.
prompt_length = inputs.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
# Render the decoded model output as Markdown in the cell output.
display(Markdown(response))

## Alternative implementation

**Download audio files at**:
https://www.bbc.co.uk/programmes/p02pc9zn/episodes/downloads

In [None]:
# NOTE(review): this rebinds AUDIO_MODEL (set to "whisper-1" in the Constants
# cell), so re-running the OpenAI transcription cell afterwards would pass this
# Hub id to the OpenAI API - confirm that is acceptable.
AUDIO_MODEL = "openai/whisper-medium"
# Load the Whisper checkpoint in float16 and move it to the GPU.
speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(AUDIO_MODEL, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
speech_model.to('cuda')
# The processor supplies the tokenizer and feature extractor passed to the pipeline.
processor = AutoProcessor.from_pretrained(AUDIO_MODEL)

# Speech-recognition pipeline built from the model and processor loaded above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=speech_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float16,
    device='cuda',
)

In [None]:
# Mount Google Drive and set the path of the audio file to transcribe.
drive.mount("/content/drive")
# NOTE(review): the path is under /content rather than the /content/drive mount - confirm
# the file is uploaded there.
audio_filename2 = "/content/LearningEnglishConversations-20250408-TheEnglishWeSpeakDoItForThePlot.mp3"

In [None]:
# Transcribe the audio with the local open-source Whisper pipeline (`pipe`), not the OpenAI API.
# return_timestamps=True is presumably needed for long audio - TODO confirm.
result2 = pipe(audio_filename2, return_timestamps=True)

In [None]:
# Pull the plain transcript text out of the pipeline result and show it.
transcription2 = result2["text"]
print(transcription2)

In [None]:
# System prompt: sets the assistant's role and asks for Markdown output in PT-BR.
system_message2 = "You are an assistant that produces notes from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown and in results in PT-BR please."
# User prompt: instructions followed by the transcript. The original wording
# ("Please write a in markdown") was missing its noun; it now asks for notes,
# matching the system prompt.
user_prompt2 = f"Below is an extract transcript of an audio file. Please write notes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription2}"

# Chat-format message list consumed later by the tokenizer's chat template.
messages = [
    {"role": "system", "content": system_message2},
    {"role": "user", "content": user_prompt2}
  ]

In [None]:
# Quantization settings for loading the LLM: 4-bit NF4 weights with double
# quantization, computing in bfloat16 (same values as the earlier quant_config cell).
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load the tokenizer for the chat model and reuse the EOS token for padding.
tokenizer2 = AutoTokenizer.from_pretrained(LLAMA)
tokenizer2.pad_token = tokenizer2.eos_token
# add_generation_prompt=True appends the assistant header to the prompt so the
# model starts its own reply instead of continuing the user's message.
inputs = tokenizer2.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
# Stream tokens to the cell output as they are generated.
streamer = TextStreamer(tokenizer2)
# NOTE(review): this loads the LLaMA weights again and rebinds `model`; if the
# earlier generation cell already ran, that copy may still hold GPU memory - confirm.
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

In [None]:
# generate() returns the prompt tokens followed by the completion. Decode only
# the completion, without special tokens, so the system prompt and transcript
# are not rendered as part of the result.
prompt_length2 = inputs.shape[-1]
response2 = tokenizer2.decode(outputs[0][prompt_length2:], skip_special_tokens=True)

In [None]:
# Render the decoded model output as Markdown in the cell output.
display(Markdown(response2))

## Adding Gradio interface

In [None]:
# Install Gradio and the PyTorch packages, adding the CUDA 11.8 wheel index as an extra source.
!pip install -q gradio torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# Install the Hugging Face / quantization libraries used to load the models.
!pip install -q transformers accelerate bitsandbytes

In [None]:
import gradio as gr
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoProcessor, AutoModelForSpeechSeq2Seq

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Half precision is only used on the GPU. On the CPU fallback the model stays
# in float32, because float16 CPU kernels may be missing or very slow.
asr_dtype = torch.float16 if device == "cuda" else torch.float32

# Transcription model (open-source Whisper)
asr_model_name = "openai/whisper-medium"
speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    asr_model_name, torch_dtype=asr_dtype, use_safetensors=True, low_cpu_mem_usage=True
).to(device)
# The processor supplies the tokenizer and feature extractor passed to the pipeline.
processor = AutoProcessor.from_pretrained(asr_model_name)

# Speech-recognition pipeline used by the Gradio callback.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=speech_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=asr_dtype,
    device=device
)

# LLM model (LLaMA 3 Instruct)
llm_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# 4-bit NF4 quantization with double quantization, computing in bfloat16.
# NOTE(review): bitsandbytes 4-bit loading presumably requires a CUDA GPU, so the
# CPU fallback chosen for `device` may not work for this model - confirm.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# Tokenizer for the chat model; the EOS token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
tokenizer.pad_token = tokenizer.eos_token
# Load the quantized model, letting device_map="auto" decide weight placement.
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    device_map="auto",
    quantization_config=quant_config
)

def transcribe_and_summarize(audio_file):
    """Transcribe an audio file and generate meeting minutes from the transcript.

    Args:
        audio_file: Path to the audio file, as supplied by the Gradio audio
            input. A falsy value (e.g. ``None``) means nothing was uploaded.

    Returns:
        A ``(transcript, minutes)`` tuple of strings. On failure the first
        item is a fixed error label and the second describes the problem.
    """
    # Report a missing upload directly instead of letting the ASR pipeline
    # fail with an internal error message.
    if not audio_file:
        return "Erro na transcrição", "Erro durante o processamento: nenhum arquivo de áudio foi enviado."

    try:
        # Step 1: transcription
        result = asr_pipeline(audio_file, return_timestamps=True)
        transcript = result["text"]

        # Step 2: build the prompt
        system_message = "Você é um assistente que produz atas de reuniões a partir de transcrições, com resumo, pontos discutidos e itens de ação. O resultado deve estar em Markdown e em português."
        user_prompt = f"A seguir está uma transcrição de áudio. Gere a ata:\n\n{transcript}"

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ]

        # Step 3: generation
        # add_generation_prompt=True appends the assistant header so the model
        # starts its own reply instead of continuing the user's message.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)
        outputs = model.generate(inputs, max_new_tokens=1500)
        # generate() returns prompt + completion; keep only the completion so
        # the prompts and transcript are not repeated inside the minutes.
        new_tokens = outputs[0][inputs.shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)

        return transcript, response

    except Exception as e:
        # Top-level UI boundary: surface the error text in the interface
        # rather than crashing the Gradio callback.
        return "Erro na transcrição", f"Erro durante o processamento: {str(e)}"

# Gradio interface with two outputs: transcript + summary
interface = gr.Interface(
    fn=transcribe_and_summarize,
    # The audio input hands the callback a file path (type="filepath").
    inputs=gr.Audio(type="filepath", label="Envie seu áudio (.mp3, .wav etc)"),
    # Output order matches the (transcript, response) tuple returned by the callback.
    outputs=[
        gr.Textbox(label="Transcrição gerada"),
        gr.Markdown(label="Resumo em Markdown")
    ],
    title="Resumo de Áudio com Whisper + LLaMA 3 (Colab)",
    description="Faça upload de um áudio e obtenha a transcrição + resumo automático com pontos discutidos e ações. Utiliza modelos open source. Resultados em Markdown."
)

# Start the app; share=True asks Gradio for a public share link.
interface.launch(share=True)
