# **Install insanely-fast-whisper via pip**


In [7]:
!pip install insanely-fast-whisper --ignore-requires-python


Collecting insanely-fast-whisper
  Downloading insanely_fast_whisper-0.0.15-py3-none-any.whl.metadata (9.9 kB)
Collecting pyannote-audio>=3.1.0 (from insanely-fast-whisper)
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote-audio>=3.1.0->insanely-fast-whisper)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote-audio>=3.1.0->insanely-fast-whisper)
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting omegaconf<3.0,>=2.1 (from pyannote-audio>=3.1.0->insanely-fast-whisper)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyannote.core>=5.0.0 (from pyannote-audio>=3.1.0->insanely-fast-whisper)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote-audio>=3.1.0->insanely-fast-whisper)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.m

# **Install Required Python Packages**


In [None]:
!pip install noisereduce librosa soundfile numpy moviepy


Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.3


# **Speaker Diarization, Transcription & Noise Reduction**

In [None]:
import subprocess
import json
import librosa
import soundfile as sf
import noisereduce as nr
import numpy as np
import os
from moviepy.editor import AudioFileClip  # Import moviepy

def convert_mp4_to_wav(input_file, output_file="converted_audio.wav"):
    """
    Converts an MP4 file to WAV format using moviepy's AudioFileClip.

    Args:
        input_file: Path of the source file handed to ``AudioFileClip``.
        output_file: Destination path of the WAV file.

    Returns:
        ``output_file`` on success, or ``None`` when opening or writing the
        clip fails (the error is printed rather than raised).
    """
    try:
        audio = AudioFileClip(input_file)
        try:
            # Request the 16-bit PCM codec for the written file.
            audio.write_audiofile(output_file, codec="pcm_s16le")
        finally:
            # Release the clip even if writing fails part-way, so the
            # underlying reader is never leaked.
            audio.close()
        print(f"Converted {input_file} to WAV: {output_file}")
        return output_file
    except Exception as e:
        print("Error converting MP4 to WAV:", e)
        return None

def _bandpass_filter(audio, sr, lowcut=100, highcut=8000):
    """Zero every FFT bin outside [lowcut, highcut] Hz; the result has the same length as ``audio``."""
    spectrum = np.fft.rfft(audio)
    frequencies = np.fft.rfftfreq(len(audio), 1 / sr)
    spectrum[(frequencies < lowcut) | (frequencies > highcut)] = 0  # Zero out unwanted frequencies
    # Pass n explicitly: without it irfft assumes an even-length signal and
    # silently drops the last sample of odd-length input.
    return np.fft.irfft(spectrum, n=len(audio))


def reduce_noise(input_path, output_path="cleaned_audio.wav"):
    """
    Applies noise reduction to improve speech clarity.
    - Detects a silent portion as the noise sample.
    - Reduces noise more aggressively.
    - Applies band-pass filtering for better results.
    - Normalizes the audio for clearer speech.

    Args:
        input_path: Path of the audio file to clean.
        output_path: Where the cleaned audio is written.

    Returns:
        ``output_path`` on success. If any step raises, the error is printed
        and ``input_path`` is returned so the caller can continue with the
        unprocessed audio.
    """
    try:
        # Load audio at its native sample rate (sr=None disables resampling).
        y, sr = librosa.load(input_path, sr=None)

        # Step 1: collect low-amplitude samples to use as the noise sample.
        # Note these are the first `sr` samples below the threshold, which are
        # not necessarily contiguous in time.
        silent_indices = np.where(np.abs(y) < 0.01)[0]
        if len(silent_indices) > sr:  # Enough quiet samples for one second of noise
            noise_sample = y[silent_indices[:sr]]
        else:
            noise_sample = y[:sr]  # Fallback: first second of the file

        # Step 2: reduce noise (more aggressive).
        # NOTE(review): noisereduce appears to use y_noise only when
        # stationary=True; in the default non-stationary mode this sample may
        # be ignored -- confirm against the noisereduce documentation.
        reduced_audio = nr.reduce_noise(y=y, sr=sr, y_noise=noise_sample, prop_decrease=0.95)

        # Step 3: band-pass filter (remove low and high frequencies).
        reduced_audio = _bandpass_filter(reduced_audio, sr)

        # Step 4: normalize the peak to 0.9. A fully silent signal has a peak
        # of 0; skip the division there instead of filling the file with NaNs.
        peak = np.max(np.abs(reduced_audio))
        if peak > 0:
            reduced_audio = reduced_audio / peak * 0.9

        # Save cleaned audio
        sf.write(output_path, reduced_audio, sr)
        print(f"✅Noise reduction applied. Cleaned audio saved to: {output_path}")

        return output_path  # Return cleaned audio path

    except Exception as e:
        print(f"Error in noise reduction: {e}")
        return input_path  # If noise reduction fails, return original file

def _format_segments(data):
    """Validate the parsed CLI output and return the cleaned ``{"speakers": [...]}`` structure."""
    if not isinstance(data, dict) or "speakers" not in data:
        raise ValueError("Invalid JSON format: expected a dictionary with 'speakers' key.")

    cleaned = []
    for segment in data["speakers"]:
        # Skip empty text segments (a missing or null text counts as empty).
        text = (segment.get("text") or "").strip()
        if not text:
            continue

        start_time, end_time = segment.get("timestamp") or (None, None)

        # Fix reversed timestamps. Only compare when both bounds are known:
        # a bound may be None, and comparing None with a number raises TypeError.
        if start_time is not None and end_time is not None and start_time > end_time:
            start_time, end_time = end_time, start_time

        cleaned.append({
            "speaker": segment.get("speaker", "UNKNOWN"),
            # Unknown bounds stay None instead of being invented.
            "timestamp": [
                None if bound is None else round(float(bound), 2)
                for bound in (start_time, end_time)
            ],
            "text": text,
        })

    return {"speakers": cleaned}


def transcribe_audio(input_path, diarization_model="pyannote/speaker-diarization-3.1", hf_token="HUGGINGFACE_TOKEN", transcript_path="output.json"):
    """
    Transcribes an audio file and returns speaker-diarized text with improved accuracy.

    Pipeline: convert ``.mp4`` input to WAV, apply noise reduction, run the
    ``insanely-fast-whisper`` CLI, then clean up the JSON it writes.

    Args:
        input_path: Path of the audio file, or of an ``.mp4`` file to convert first.
        diarization_model: Value passed to the CLI's ``--diarization_model`` option.
        hf_token: Value passed to the CLI's ``--hf-token`` option. When left at
            the placeholder default, the ``HF_TOKEN`` (then ``HUGGINGFACE_TOKEN``)
            environment variable is used if set.
        transcript_path: File the CLI is told to write its JSON result to, and
            which is then read back.

    Returns:
        ``{"speakers": [{"speaker", "timestamp": [start, end], "text"}, ...]}``
        with timestamps rounded to 2 decimals (a bound that is missing in the
        CLI output stays ``None``), or ``None`` if conversion, the CLI call or
        reading the result fails (the error is printed).
    """
    # Step 0: Check if input is an MP4 and convert it to WAV if necessary
    _, file_extension = os.path.splitext(input_path)
    if file_extension.lower() == ".mp4":
        converted_path = convert_mp4_to_wav(input_path)
        if converted_path is None:
            print("Conversion failed. Exiting transcription.")
            return None
        input_path = converted_path

    # Step 1: Apply Noise Reduction
    cleaned_audio_path = reduce_noise(input_path)

    # The default is only a placeholder; prefer a real token from the
    # environment over sending the placeholder string to the CLI.
    if hf_token == "HUGGINGFACE_TOKEN":
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or hf_token

    # Step 2: Run transcription on the cleaned audio using the CLI tool.
    # The transcript path is passed explicitly so the file read below is
    # always the one this run wrote.
    command = [
        "insanely-fast-whisper",
        "--file-name", cleaned_audio_path,
        "--diarization_model", diarization_model,
        "--hf-token", hf_token,
        "--transcript-path", transcript_path,
    ]

    try:
        # Run the transcription command; check=True raises on a non-zero exit.
        subprocess.run(command, text=True, capture_output=True, check=True)

        # Read the output JSON file
        with open(transcript_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        return _format_segments(data)

    except subprocess.CalledProcessError as e:
        print("Error executing command:\n", e.stderr)
        return None
    except (FileNotFoundError, json.JSONDecodeError, ValueError, TypeError, AttributeError) as e:
        # TypeError/AttributeError cover malformed segments (e.g. non-numeric
        # timestamps or non-dict entries) so they fail the same way as bad JSON.
        print("Error reading JSON:", e)
        return None

# Demo run: transcribe a sample recording (both MP4 and WAV inputs are accepted).
result = transcribe_audio("/content/drive/MyDrive/conv.MP4")

# A falsy result means the pipeline produced no transcript; otherwise
# pretty-print it, keeping non-ASCII text readable instead of \u-escaped.
print(json.dumps(result, indent=4, ensure_ascii=False) if result else "Transcription failed.")


✅ Noise reduction applied. Cleaned audio saved to: cleaned_audio.wav
{
    "speakers": [
        {
            "speaker": "SPEAKER_01",
            "timestamp": [
                0.0,
                1.8
            ],
            "text": "صباح الخير"
        },
        {
            "speaker": "SPEAKER_00",
            "timestamp": [
                1.8,
                2.5
            ],
            "text": "لندن"
        },
        {
            "speaker": "SPEAKER_00",
            "timestamp": [
                2.5,
                2.6
            ],
            "text": "أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أهلاً وصحبتك أه