In [2]:
# Install required packages in Google Colab
# `-q` keeps pip's output short. httpx is the only package pinned to an exact
# version (0.28.1); every other package resolves to whatever is current.
# NOTE(review): the reason for the httpx pin is not recorded here — confirm
# before changing or removing it.
%pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.28.1 gradio

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# imports
import torch
import threading
from openai import OpenAI
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
import gradio as gr

In [4]:
# Constants: default model identifiers picked up by MeetingAssistant.__init__.
AUDIO_MODEL = "whisper-1"  # OpenAI Whisper API model used for speech-to-text
LLM_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Hugging Face model id used to write the minutes

In [5]:
# Google Colab User Data
# Credentials are read from Colab's user data store instead of being
# hard-coded in the notebook.
# Ensure you have set the following in your Google Colab environment:
#   HF_TOKEN        - Hugging Face access token (passed to `login` below)
#   OPENAI_API_KEY  - OpenAI API key (passed to the `OpenAI` client below)
hf_token = userdata.get('HF_TOKEN')
openai_api_key = userdata.get('OPENAI_API_KEY')

In [6]:
# Authenticate with the Hugging Face Hub using the token read from Colab.
# NOTE(review): presumably needed because the Llama model repository is
# gated — confirm.
login(hf_token, add_to_git_credential=True)
# Module-level OpenAI client. MeetingAssistant.transcribe_audio reads this
# global by name, so it must exist before any transcription is requested.
openai = OpenAI(api_key=openai_api_key)

In [7]:
class MeetingAssistant:
    """Turns a meeting recording into Japanese markdown minutes.

    Audio is transcribed through the OpenAI transcription API (using the
    module-level ``openai`` client) and the transcript is summarised by a
    4-bit quantised causal language model loaded with ``transformers``.
    """

    def __init__(self, model_name=LLM_MODEL, audio_model=AUDIO_MODEL):
        """Load the tokenizer and the quantised language model.

        Args:
            model_name: Hugging Face model id of the chat model to load.
            audio_model: Name of the OpenAI transcription model.
        """
        # 4-bit NF4 quantisation with double quantisation; compute in bfloat16.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        self.audio_model = audio_model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )

    def _transcribe(self, audio_path, progress=None):
        """Call the transcription API and return its text; raises on failure.

        Private helper shared by ``transcribe_audio`` (which converts failures
        to a message) and ``process_meeting`` (which must be able to tell a
        failure apart from a real transcript).
        """
        # Identity check: do not rely on the truthiness of the progress object.
        if progress is not None:
            progress(0.3, desc="Transcribing audio...")

        with open(audio_path, "rb") as audio_file:
            return openai.audio.transcriptions.create(
                model=self.audio_model,
                file=audio_file,
                response_format="text"
            )

    def transcribe_audio(self, audio_path, progress=None):
        """Transcribes the uploaded audio file using OpenAI Whisper API.

        Args:
            audio_path: Path of the audio file to upload.
            progress: Optional callable ``progress(fraction, desc=...)`` used
                to report status; skipped when ``None``.

        Returns:
            The transcription result, or a string starting with
            ``"Error during transcription: "`` if anything went wrong.
        """
        try:
            return self._transcribe(audio_path, progress)
        except Exception as e:
            return f"Error during transcription: {str(e)}"

    def generate_minutes(self, transcription, attendees=None, date=None, location=None,
                         progress=None, max_new_tokens=2000):
        """Generates meeting minutes from the transcript using the Llama model.

        This is a generator: it yields pieces of the generated text as the
        model produces them. Whitespace-only pieces are yielded too, so line
        breaks in the markdown survive.

        Args:
            transcription: Transcript text to summarise.
            attendees: Optional attendee list, inserted into the prompt.
            date: Optional meeting date, inserted into the prompt.
            location: Optional meeting location, inserted into the prompt.
            progress: Optional callable ``progress(fraction, desc=...)``.
            max_new_tokens: Upper bound on the number of generated tokens.

        Yields:
            str: The next piece of generated text.

        Raises:
            Exception: Whatever ``model.generate`` raised in the worker
                thread, re-raised here once the stream has been drained.
        """
        if progress is not None:
            progress(0.6, desc="Generating meeting minutes...")

        system_message = (
            "You are a professional meeting assistant specialized in generating structured and concise meeting minutes in Japanese. "
            "From a given transcript, produce a markdown-formatted document that includes:\n"
            "- A brief summary with meeting title, date, location, and attendees\n"
            "- Key discussion points with context\n"
            "- Clear takeaways and decisions made\n"
            "- Action items with responsible persons and deadlines\n"
            "Ensure the output is well-organized, easy to read, and suitable for sharing with stakeholders."
        )

        details = []
        if attendees:
            details.append(f"Attendees: {attendees}")
        if date:
            details.append(f"Date: {date}")
        if location:
            details.append(f"Location: {location}")
        details_text = "\n".join(details)

        user_prompt = (
            f"Below is an extract transcript of a meeting.\n"
            f"{details_text}\n"
            f"Please write minutes in markdown in Japanese, including a summary with attendees, location and date; "
            f"discussion points; takeaways; and action items with owners.\n{transcription}"
        )

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ]

        # add_generation_prompt=True ends the prompt with the assistant header,
        # so the model starts answering immediately. return_dict=True also
        # yields the attention mask. Inputs go to wherever device_map="auto"
        # placed the model instead of a hard-coded "cuda".
        inputs = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(self.model.device)

        # skip_prompt drops the echoed prompt; skip_special_tokens removes
        # markers such as <|eot_id|>. No string matching on headers is needed.
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        generation_kwargs = dict(inputs, max_new_tokens=max_new_tokens, streamer=streamer)
        generation_errors = []

        def run_generation():
            # If generate() fails, the streamer would never receive its stop
            # signal and the consumer loop below would block forever; close
            # the stream explicitly and keep the error for the caller.
            try:
                self.model.generate(**generation_kwargs)
            except Exception as exc:
                generation_errors.append(exc)
                streamer.end()

        thread = threading.Thread(target=run_generation)
        thread.start()

        for new_text in streamer:
            if new_text:
                yield new_text

        thread.join()
        if generation_errors:
            raise generation_errors[0]

    def process_meeting(self, audio_file, attendees=None, date=None, location=None, progress=None):
        """Handles the complete process: transcribes audio and generates minutes.

        This is a generator. Every user-visible message, including validation
        and error messages, is *yielded*: a value passed to ``return`` inside
        a generator is never seen by code that iterates over it.

        Args:
            audio_file: Path of the uploaded audio file, or ``None``.
            attendees: Optional attendee list for the prompt.
            date: Optional meeting date for the prompt.
            location: Optional meeting location for the prompt.
            progress: Optional callable ``progress(fraction, desc=...)``.

        Yields:
            str: Either a single status/error message, or the minutes
            accumulated so far (each value contains all previous text).
        """
        if progress is not None:
            progress(0.1, desc="Processing audio file...")

        if audio_file is None:
            yield "Please upload an audio file."
            return

        if not str(audio_file).lower().endswith('.mp3'):
            yield "Please upload an MP3 file."
            return

        # A failed transcription must stop here; otherwise the error text
        # would be handed to the language model as if it were the transcript.
        try:
            transcription = self._transcribe(audio_file, progress)
        except Exception as e:
            yield f"Error during transcription: {str(e)}"
            return

        if not str(transcription).strip():
            yield "No speech could be transcribed from the audio file."
            return

        try:
            accumulated_text = ""
            for chunk in self.generate_minutes(transcription, attendees, date, location, progress):
                accumulated_text += chunk
                yield accumulated_text
        except Exception as e:
            yield f"Error processing file: {str(e)}"

In [8]:
class GradioInterface:
    """Gradio front end that connects the upload form to a MeetingAssistant."""

    def __init__(self):
        """Create the assistant and assemble the Gradio form around it."""
        self.assistant = MeetingAssistant()

        # Input widgets, in the order process_audio receives their values.
        audio_input = gr.Audio(type="filepath", label="Upload MP3 File", format="mp3")
        attendees_input = gr.Textbox(label="Attendees", placeholder="e.g. Taro Yamada, Hanako Sato")
        date_input = gr.Textbox(label="Date", placeholder="e.g. 2025-09-08")
        location_input = gr.Textbox(label="Location", placeholder="e.g. Online, Meeting Room A")

        # The minutes are rendered as markdown.
        minutes_output = gr.Markdown(label="Meeting Minutes", min_height=60)

        self.iface = gr.Interface(
            fn=self.process_audio,
            inputs=[audio_input, attendees_input, date_input, location_input],
            outputs=minutes_output,
            title="AI Meeting Assistant",
            description="Upload an audio file and provide meeting details to generate minutes.",
            flagging_mode="never",
        )

    def process_audio(self, audio_file, attendees, date, location, progress=gr.Progress()):
        """Forward the form values to the assistant and relay everything it yields."""
        yield from self.assistant.process_meeting(audio_file, attendees, date, location, progress)

    def launch(self):
        """Start serving the interface."""
        self.iface.launch()

In [9]:
# Entry point. Constructing GradioInterface also constructs MeetingAssistant,
# which loads the tokenizer and the language model, so this cell triggers the
# model download/load before the interface is launched.
if __name__ == "__main__":
    app = GradioInterface()
    app.launch()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f83c77671ee02f104f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)