In [2]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio diffusers datasets

In [13]:
# imports

import os
import requests
import torch
import soundfile as sf
from dotenv import load_dotenv
from diffusers import DiffusionPipeline
from IPython.display import Markdown, display, update_display
from huggingface_hub import login
import google.generativeai as genai
from google.generativeai import types
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, pipeline
from datasets import load_dataset
from IPython.display import Audio

In [11]:
# Constants
AUDIO_MODEL = "gemini-1.5-flash"
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [14]:
# Load environment variables in a file called .env
load_dotenv()
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')

In [15]:
#create meeting minutes assistance
class MeetingAssistance:
  def __init__(self, audio_file, prompt):
    self.audio_file = audio_file
    self.prompt = prompt
    self.messages = []
    self.sumText = ""
    self.image = None
    self.sumAudio = None

  def transcribe_file_with_auto_punctuation(self):
    # create an instance of generative agent
    client = genai.Client(api_key=google_api_key)
    with open(self.audio_file, 'rb') as f:
      image_bytes = f.read()

    response = client.models.generate_content(
      model=AUDIO_MODEL,
      contents=[
        "transcribr the audio file that provided",# prompt
        types.Part.from_bytes(
          data=image_bytes,
          mime_type='audio/mp3',
        )
      ]
    )
    # system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
    user_prompt = f"Below is an extract transcript of a Denver council meeting. \
    Please write minutes in markdown, including a summary with attendees, location and date; \
    discussion points; takeaways; and action items with owners.\n{response.text}"
    self.messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": user_prompt}
      ]

  def summarize(self):
    quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type="nf4"
    )

    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer.apply_chat_template(self.messages, return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
    outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)

    self.sumText = tokenizer.decode(outputs[0])
    
  def text_to_audio(self):
    synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device='cuda')
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Split text into smaller chunks
    chunk_size = 500  # Adjust as needed
    chunks = [textToSpeech[i:i+chunk_size] for i in range(0, len(textToSpeech), chunk_size)]

    speech_chunks = []
    for chunk in chunks:
        speech = synthesiser(chunk, forward_params={"speaker_embeddings": speaker_embedding})
        speech_chunks.append(speech["audio"])

    # Concatenate audio chunks (if necessary)
    combined_speech = torch.cat(speech_chunks)

    sf.write("speech.wav", combined_speech, samplerate=speech["sampling_rate"])
    self.sumAudio = Audio("speech.wav")

  def text_to_image(self):
    # Generate a prompt for image
    generator = pipeline("text-generation", device="cuda")
    result = generator(f"extract the key word for the following article.{self.sumText}")

    # Image Generation
    image_gen = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2",
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16"
        ).to("cuda")

    text = result[0]['generated_text']
    print(text)
    self.image = image_gen(prompt=text).images[0]

In [16]:
prompt = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
audio_filename = "C:\\Users\\checfeng\\OneDrive - Cisco\\Desktop\\LLM\\llm_engineering\\week3\\denver_extract.mp3"
assistance = MeetingAssistance(audio_filename,prompt)
assistance.summarize()
# assistance.text_to_audio()
# assistance.text_to_image()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

IndexError: list index out of range

In [None]:
def MinutesSummarizer(audio_file, prompt):
    assistance = MeetingAssistance(audio_file, prompt)
    assistance.transcribe_file_with_auto_punctuation()
    assistance.summarize()
    assistance.text_to_audio()
    assistance.text_to_image()
    return assistance.sumText, assistance.image, assistance.sumAudio

In [None]:
import gradio as gr

In [None]:
defaultPrompt = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."

interface = gr.Interface(
    fn=MinutesSummarizer,
    inputs=[gr.File(label="Upload Audio"), gr.Textbox(value=defaultPrompt, label="Prompt(Optional)", lines=3,)],
    outputs=[gr.Textbox(label="Minutes"), gr.Image(label="Image"), gr.Audio(label="Audio")],
    title="Audio Transcription with Auto Punctuation",
    description="Upload an audio file and get a transcription with automatic punctuation"
)

interface.launch()