<a href="https://colab.research.google.com/github/kristheticcc/Audio-Summarizer-AI/blob/main/speech_to_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
# Imports

import os
import requests
import torch
from IPython.display import display, Markdown
from huggingface_hub import login
from google.colab import userdata, drive
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
from openai import OpenAI

In [None]:
# Checking the GPU

gpu_info=!nvidia-smi
gpu_info='\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print("GPU Connected")
  if gpu_info.find("Tesla T4")>=0:
    print("Connected to T4")
  else:
    print("Not connected to T4")

In [None]:
# Logging in to huggingface
# Reads the HF_TOKEN secret from Colab userdata, reports whether a value was
# found, then logs in and stores the token as a git credential.

hf_token = userdata.get("HF_TOKEN")
print("Token Found" if hf_token else "HuggingFace token not found!!!")

# NOTE(review): login() is called even when the token is missing — confirm this is intended.
login(hf_token, add_to_git_credential=True)

In [None]:
# Models

# Frontier transcription: OpenAI client (key read from Colab userdata) and model name
openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(api_key=openai_api_key)
frontier_gpt = "gpt-4o-mini-transcribe"

# Open-source transcription model
open_source = "openai/whisper-medium.en"

# Model used to summarize the transcript
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# Getting the audio file from drive and opening it

# Mount Google Drive so the audio file is reachable under /content/drive
drive.mount("/content/drive")

audio_filename = "/content/drive/MyDrive/llms/denver_extract.mp3"

# Open the file in binary read mode and keep the handle in `audio_file`
audio_file = open(audio_filename, "rb")

In [None]:
# Transcribing the audio using open source model (Huggingface pipelines)
# Builds a speech-recognition pipeline on the GPU in float16, runs it on the
# audio file path and keeps the recognized text in `transcription`.

pipe = pipeline(
    "automatic-speech-recognition",
    model=open_source,
    dtype=torch.float16,
    device="cuda",
    return_timestamps=True,
)
result = pipe(audio_filename)
transcription = result["text"]

# display() returns None, so wrapping it in print() also emitted a stray "None".
display(Markdown(transcription))

In [None]:
# Transcribing audio using frontier model (openai)
# Do not run if openai api key not available

# Open the file inside this cell so every run starts reading at byte 0 and the
# handle is closed afterwards; a shared handle that was already read would be
# positioned at end-of-file when the cell is re-run.
with open(audio_filename, "rb") as audio_stream:
  result = openai.audio.transcriptions.create(model=frontier_gpt, file=audio_stream, response_format="text")

# display() returns None, so wrapping it in print() also emitted a stray "None".
display(Markdown(result))

In [None]:
# Defining system and user prompts for summarizing the audio transcription
# We'll use the transcript generated by the open source model

# System prompt: describes the assistant's role and the required output format
system_prompt = """
You produce minutes of meetings from transcripts, with summary, key discussion points,
takeaways and action items with owners, in markdown format without code blocks.
"""

# User prompt: the concrete request, with the transcript interpolated at the end
user_prompt = f"""
Below is an extract transcript of a Denver council meeting.
Please write minutes in markdown without code blocks, including:
- a summary with attendees, location and date
- discussion points
- takeaways
- action items with owners

Transcription:
{transcription}
"""

In [None]:
# Preparing the message and quantization

# Chat conversation: system instructions followed by the user request
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

# 4-bit NF4 quantization settings with float16 as the compute dtype
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)


In [None]:
# Getting input tokens
# Loads the tokenizer, reuses the EOS token for padding, and renders the chat
# messages into a tensor of token ids on the GPU.

tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# add_generation_prompt=True ends the prompt with the assistant header so the
# model starts its reply instead of continuing the user turn.
# return_dict=False keeps `inputs` a plain tensor of token ids, which is how the
# generation cell passes it to model.generate(inputs=...).
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=False,
    return_tensors="pt",
).to("cuda")

# Show the prompt size rather than dumping every token id.
print(inputs.shape)

In [None]:
# Generating the summary
# Loads the quantized model, streams the generation as it is produced, then
# renders the finished summary as Markdown.

# skip_prompt keeps the streamer from echoing the whole prompt (transcript included).
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model = AutoModelForCausalLM.from_pretrained(LLAMA, quantization_config=quant_config, device_map="auto")
output = model.generate(inputs=inputs, max_new_tokens=2000, streamer=streamer)

# generate() returns prompt + completion; slice off the prompt tokens and drop
# special tokens so only the summary itself is rendered.
summary_ids = output[0][inputs.shape[-1]:]

# display() returns None, so wrapping it in print() also emitted a stray "None".
display(Markdown(tokenizer.decode(summary_ids, skip_special_tokens=True)))