In [1]:
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import os
from pydub import AudioSegment
from pydub.utils import make_chunks
from llama_cpp import Llama
from langdetect import detect
import pycountry
from deep_translator import GoogleTranslator
import string
import re

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run the Whisper model on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the local model paths with os.path.join instead of hand-written
# backslashes: "\w", "\c", "\l" and "\Y" are invalid string escapes (Python
# warns about them), and literal backslashes only form a usable path on Windows.
model_name_or_path = os.path.join(".", "whisper-small-indonesian", "checkpoint-1000")
llm_model_path = os.path.join(".", "llm", "Yi-1.5-9B-Chat-Q3_K_L.gguf")

# The processor is loaded from the "openai/whisper-small" hub id, while the
# model weights are loaded from the local checkpoint directory.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(device)
model.eval()

# Local GGUF chat model used for summarization (ChatML chat format).
llm = Llama(
    model_path=llm_model_path,
    n_gpu_layers=28,
    n_ctx=4096,
    chat_format="chatml",
    f16_kv=True,
    verbose=False,
)



In [None]:
def clean_transcription(text):
    """Remove every ``string.punctuation`` character and normalize whitespace.

    Runs of whitespace are collapsed to a single space and the result is
    stripped at both ends.
    """
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    without_punctuation = re.sub(punctuation_class, "", text)
    return re.sub(r"\s+", " ", without_punctuation).strip()

def clean_spammy_transcription(text, max_word_length=47):
    """Remove timestamp tokens, repetition spam and punctuation from a transcript.

    Steps, in order:
      1. Drop timestamp tokens of the form ``<|12.34|>``.
      2. Drop runs of one character repeated five or more times
         (``S S S S S`` or ``SSSSS``).
      3. Drop runs of the same word repeated four or more times.
      4. Drop whitespace-separated tokens longer than ``max_word_length``.
      5. Strip ``string.punctuation`` characters and collapse whitespace.

    Every repetition pattern must end on a word boundary, so a run never
    swallows the beginning of the following word (``"S S S S S Sam"`` keeps
    ``"Sam"`` and ``"the the the then"`` is left untouched).

    Args:
        text: Raw transcription text.
        max_word_length: Longest token (in characters) that is kept.

    Returns:
        The cleaned transcription string.
    """
    # Timestamp tokens such as <|0.00|> or <|12|>.
    text = re.sub(r"<\|\d+(\.\d+)?\|>", "", text)

    # One character repeated 5+ times, with or without spaces in between.
    # (?!\w) stops the run from consuming the first letter of the next word.
    text = re.sub(r'\b(\w)\s?(?:\1\s?){4,}(?!\w)', '', text, flags=re.IGNORECASE)

    # The same word repeated 4+ times; \b makes each repeat a whole word.
    text = re.sub(r'\b(\w+)(?:\s+\1\b){3,}', '', text, flags=re.IGNORECASE)

    # A single-character word repeated 5+ times (case-sensitive).
    text = re.sub(r'(\b\w\b)(?:\s+\1\b){4,}', '', text)

    # Explicit filler words repeated 4+ times.
    filler_words = ("uh", "um", "ah", "eh", "er", "hmm")
    for word in filler_words:
        text = re.sub(rf'\b(?:{word})(?:\s+{word}\b){{3,}}', '', text, flags=re.IGNORECASE)

    # Discard implausibly long tokens, then strip punctuation and tidy spacing.
    text = ' '.join(word for word in text.split() if len(word) <= max_word_length)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text
def seconds_to_hhmmss(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string."""
    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(remainder, 60)
    # int() truncates any fractional part so the ``d`` format code accepts it.
    return "{:02d}:{:02d}:{:02d}".format(
        int(whole_hours), int(whole_minutes), int(leftover_seconds)
    )

def get_audio_data(audio_path):
    """Load ``audio_path`` with ``sr=16000``, print its duration, return ``(audio, sr)``.

    Any loading error is printed and then re-raised to the caller.
    """
    try:
        print("Audio :",audio_path)
        samples, sample_rate = librosa.load(audio_path, sr=16000)
        duration_seconds = len(samples) // sample_rate
        print(f"Audio duration : {seconds_to_hhmmss(duration_seconds)}")
    except Exception as err:
        print(f"Error loading audio file: {err}")
        raise
    return samples, sample_rate

def load_audio(audio_path):
    """Read an audio file through librosa with ``sr=16000`` and return only the samples.

    Any loading error is printed and then re-raised to the caller.
    """
    try:
        # librosa.load returns (samples, sample_rate); only the samples are needed.
        return librosa.load(audio_path, sr=16000)[0]
    except Exception as err:
        print(f"Error loading audio file: {err}")
        raise

def split_audio(audio_path, chunk_length_ms=30000):
    """
    Split an audio file into WAV chunks of ``chunk_length_ms`` milliseconds.

    Each chunk is exported to its own uniquely named temporary file, so
    repeated or concurrent runs cannot overwrite each other's chunks or files
    in the working directory.

    Args:
        audio_path: Path of the audio file to split.
        chunk_length_ms: Chunk length in milliseconds (default: 30 seconds).

    Returns:
        List of temporary chunk file paths, in playback order. The caller is
        responsible for deleting them (see ``cleanup_chunks``).

    Raises:
        ValueError: If ``chunk_length_ms`` is not positive.
        Exception: Any error raised while reading or exporting the audio is
            printed and re-raised; chunks already written are removed first.
    """
    import tempfile  # local import: only this function needs it

    if chunk_length_ms <= 0:
        raise ValueError("chunk_length_ms must be a positive number of milliseconds")

    chunk_files = []
    try:
        print(f"Splitting audio file: {audio_path}")
        audio = AudioSegment.from_file(audio_path)
        chunks = make_chunks(audio, chunk_length_ms)

        for i, chunk in enumerate(chunks):
            # mkstemp guarantees a unique name; close the descriptor right away
            # so the export can reopen the path by name on every platform.
            fd, chunk_name = tempfile.mkstemp(prefix=f"temp_chunk_{i}_", suffix=".wav")
            os.close(fd)
            # Track the path before exporting so a failed export is still cleaned up.
            chunk_files.append(chunk_name)
            chunk.export(chunk_name, format="wav")

        print(f"Audio split into {len(chunk_files)} chunks")
        return chunk_files
    except Exception as e:
        print(f"Error splitting audio: {e}")
        # Do not leave partially written chunks behind.
        cleanup_chunks(chunk_files)
        raise

def cleanup_chunks(chunk_files):
    """Delete each temporary chunk file, reporting (not raising) any failure."""
    for chunk_path in chunk_files:
        try:
            os.unlink(chunk_path)
        except Exception as err:
            # Best effort: keep going so the remaining files are still removed.
            print(f"Failed to remove temporary file {chunk_path}: {err}")

def transcribe_chunk(audio_path):
    """Run the Whisper model on one audio chunk and return the decoded text."""
    waveform = load_audio(audio_path)

    # Turn the waveform into model input features and move them to the model's device.
    features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
    features = features.to(device)

    # Generation only; no gradients are needed.
    with torch.no_grad():
        token_ids = model.generate(features)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

def transcribe_long_audio(audio_path, chunk_length_ms=30000):
    """
    Transcribe long audio by splitting it into chunks and joining the results.

    Args:
        audio_path: Path of the audio file to transcribe.
        chunk_length_ms: Length of each chunk in milliseconds (default: 30 seconds).

    Returns:
        The chunk transcriptions joined with single spaces.

    The temporary chunk files are removed even when transcription of a chunk
    fails; the error itself still propagates to the caller.
    """
    chunk_files = split_audio(audio_path, chunk_length_ms)

    transcriptions = []
    try:
        for i, chunk_file in enumerate(chunk_files, start=1):
            print(f"Transcribing chunk {i}/{len(chunk_files)}")
            transcriptions.append(transcribe_chunk(chunk_file))
    finally:
        # Always delete the temporary chunks, including on failure.
        cleanup_chunks(chunk_files)

    return " ".join(transcriptions)

def format_prompt(text, default_lang="id"):
    """Build the chat messages for LLM summarization and detect the text language.

    Args:
        text: Transcription to summarize.
        default_lang: Language code used when detection fails (for example on
            an empty transcription).

    Returns:
        A ``(prompt, lang)`` tuple: the list of chat messages and the language
        code that was detected (or ``default_lang`` as a fallback).
    """
    from langdetect.lang_detect_exception import LangDetectException

    try:
        lang = detect(text)
    except LangDetectException as e:
        # Detection fails on text without usable features (e.g. empty input).
        # Fall back so the prompt's own "no transcription" instruction can apply.
        print(f"Language detection failed ({e}); falling back to '{default_lang}'")
        lang = default_lang

    # langdetect may return region-qualified codes such as "zh-cn"; look up the
    # base code and tolerate a missing pycountry entry instead of crashing on
    # ``None.name``.
    language = pycountry.languages.get(alpha_2=lang.split("-")[0])
    language_name = language.name if language is not None else lang
    print(f'Detected Language : {language_name}')
    prompt = [
        {"role": "system", "content": 
            "You are a multilingual transcription summarizer.\n"
            "Your job is to analyze the transcription of an audio recording and extract the key information.\n"
            "Correct any misheard or misspelled words, grammatical errors, or likely transcription mistakes.\n"
            "Remove any irrelevant or meaningless fragments ('trash transcription').\n"
            "Only output a direct, tidy summary — do not simulate a conversation or dialogue.\n"
            "If there is no transcription, just say \"Failed to summarize audio.\"\n\n"
            "Summarize in this format:\n"
            "1. A concise overview (more than 1 sentences)\n"
            "2. Key points in bullet format\n"
            "3. Detailed explanation of the main concepts\n"
            "4. Any important context, examples, or data mentioned\n"
            "5. Conclusion or main takeaway\n\n"
            "Make your summary comprehensive yet clear and organized.\n\n"
            f"Transcription:\n{text}"
        },
        {"role": "assistant", "content": ""}
    ]
    return prompt,lang

def safe_translate(text, target_lang='id'):
    """Translate ``text`` line by line into ``target_lang``, keeping line breaks.

    Blank (or whitespace-only) lines are emitted as empty strings without
    calling the translator.

    Args:
        text: Text to translate; may contain several lines.
        target_lang: Target language code passed to ``GoogleTranslator``.

    Returns:
        The translated lines joined with newlines.
    """
    # One translator serves every line; it does not depend on the line itself.
    translator = GoogleTranslator(source='auto', target=target_lang)

    translated_lines = []
    for line in text.split('\n'):
        if not line.strip():
            translated_lines.append('')
            continue
        translated = translator.translate(line)
        # Keep the original line if no string comes back, so join() cannot fail.
        translated_lines.append(translated if isinstance(translated, str) else line)
    return '\n'.join(translated_lines)


def generate_response(prompt):
    """Send the chat messages to the LLM and return the first choice's text."""
    print("Summarizing. . .")
    completion_options = {
        "messages": prompt,
        "max_tokens": 4096,
        "stop": ["<|user|>", "</|assistant|>", "<|system|>"],
        "repeat_penalty": 1.5,
    }
    completion = llm.create_chat_completion(**completion_options)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def extract_response(llm_output):
    """Return the text after the first ``</think>`` marker, or the whole output.

    The result is stripped of surrounding whitespace in both cases.
    """
    before, marker, after = llm_output.partition("</think>")
    answer = after if marker else before
    return answer.strip()

def format_for_cli(output: str) -> str:
    """
    Turn HTML-style line-break tags into newlines for terminal display.

    Handles ``<br>``, ``<br/>``, ``<br />`` and ``</br>`` (exact, lowercase
    spellings only) and strips leading/trailing whitespace from the result.
    """
    for break_tag in ('<br>', '<br/>', '<br />', '</br>'):
        output = output.replace(break_tag, '\n')
    return output.strip()
def summarize(audio_path, chunk_length_ms=30000):
    """Transcribe an audio file, clean the transcript, and summarize it.

    Prints the raw transcript, the cleaned transcript and the final summary.
    Returns the translated summary text, or ``None`` if any step raised (the
    error and its traceback are printed instead of propagating).
    """
    divider = "============================================\n"
    try:
        # Called for its printed report (path and duration); the result is unused.
        get_audio_data(audio_path)

        raw_transcript = transcribe_long_audio(audio_path, chunk_length_ms)
        print(f"Raw Audio Transcription:\n{raw_transcript}")
        print(divider)

        cleaned_transcript = clean_spammy_transcription(raw_transcript)
        print(f"Cleaned Audio Transcription:\n{cleaned_transcript}")
        print(divider)

        prompt, lang = format_prompt(cleaned_transcript)
        llm_output = generate_response(prompt)
        # Translate the extracted answer into the language detected for the transcript.
        summary = safe_translate(extract_response(llm_output), lang)

        print("Summary:\n")
        print(format_for_cli(summary))
        return summary
    except Exception as err:
        import traceback
        print(f"Error in summarization pipeline: {err}")
        traceback.print_exc()
        return None

In [4]:
# Path of the audio file to transcribe and summarize.
audio_input = "Audio.mp3"

In [5]:
# Run the full pipeline; summarize() returns the summary text, or None if a step failed.
result = summarize(audio_input)

Audio : Audio.mp3
Audio duration : 00:08:13
Splitting audio file: Audio.mp3
Audio split into 17 chunks
Transcribing chunk 1/17




Transcribing chunk 2/17
Transcribing chunk 3/17
Transcribing chunk 4/17
Transcribing chunk 5/17
Transcribing chunk 6/17
Transcribing chunk 7/17
Transcribing chunk 8/17
Transcribing chunk 9/17
Transcribing chunk 10/17
Transcribing chunk 11/17
Transcribing chunk 12/17
Transcribing chunk 13/17
Transcribing chunk 14/17
Transcribing chunk 15/17
Transcribing chunk 16/17
Transcribing chunk 17/17
Raw Audio Transcription:
 Diaku ya atau tidak, bahwa impresentasi saja sudah cukup susah bagi kita yang tidak terbiasa. Nah, ternyata, bikin orang mau dengarin presentasi kita juga enggak kalah rumitnya, teman-teman! Pernahkah berada pada kondisi di mana audien selebih memilih untuk ngobrol, main handphone, atau bahkan tidur daripada dengarkan persimpasi kita? Kalo pernah, you are on the right video, teman-teman! Karena di video ini, kita akan bahas dua senjata pabungkas yang bisa membantu S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S S