In [1]:

import os
import shutil

import whisper
from better_profanity import profanity
from pydub import AudioSegment
from pydub.generators import Sine

# Initialize the profanity filter once, up front. No custom word list is
# passed here, so the library's own list is what the later
# profanity.contains_profanity() calls in this file match against.
profanity.load_censor_words()
def transcribe_audio_with_word_timestamps(audio_path, model_type="base"):
    """
    Run Whisper on an audio file and return every recognised word with its timing.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_type: Name of the Whisper model handed to ``whisper.load_model``.

    Returns:
        A flat list of ``{"text", "start", "end"}`` dicts, one per word, in the
        order Whisper reports them across all segments. Any failure (missing
        file, model or transcription error) is printed and results in an empty
        list instead of an exception.
    """
    try:
        # Raise inside the try so a missing file is reported like any other failure.
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"File not found: {audio_path}")

        # Load the requested model and ask it for per-word timestamps.
        transcription = whisper.load_model(model_type).transcribe(
            audio_path, word_timestamps=True
        )
        print("Transcription with word-level timestamps successful!")

        # Flatten segment -> word into a single list of plain dicts.
        return [
            {
                "text": entry["word"],
                "start": entry["start"],
                "end": entry["end"],
            }
            for segment in transcription["segments"]
            for entry in segment["words"]
        ]
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return []

def bleep_audio_at_word_level(audio_path, words):
    """Return the audio with each profane word replaced by a 1000 Hz tone.

    Args:
        audio_path: Path of an audio file readable by ``AudioSegment.from_file``.
        words: Iterable of dicts with ``"text"``, ``"start"`` and ``"end"``
            keys, where ``start``/``end`` are offsets in seconds. Entries
            whose text is not flagged by the profanity filter are ignored.

    Returns:
        A new ``AudioSegment``. The tone replaces (it is not mixed over) each
        flagged span and is generated for exactly the number of milliseconds
        that were cut out, so audio after the bleep is not shifted.
    """
    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports segment length in milliseconds

    for word in words:
        if not profanity.contains_profanity(word["text"]):
            continue

        # Convert to whole milliseconds once and reuse the same integers for
        # both the cut points and the tone length. Deriving the tone length
        # from the unrounded floats while slicing with truncated ints lets the
        # two disagree. Clamp to the clip so out-of-range timestamps are safe.
        start_ms = min(max(int(round(word["start"] * 1000)), 0), total_ms)
        end_ms = min(max(int(round(word["end"] * 1000)), 0), total_ms)

        # An empty or inverted span has nothing to bleep; splicing it anyway
        # would duplicate the audio between end and start.
        if end_ms <= start_ms:
            continue

        bleep = Sine(1000).to_audio_segment(duration=end_ms - start_ms)
        audio = audio[:start_ms] + bleep + audio[end_ms:]

    return audio

def process_audio_with_word_level_timestamps(input_audio_path, output_audio_path):
    """Transcribe an audio file, bleep out cuss words, and write the result.

    Args:
        input_audio_path: Path of the audio file to censor. It is never
            modified or moved.
        output_audio_path: Where the result is written. When cuss words are
            found this is a WAV export of the bleeped audio; when none are
            found it is a byte-for-byte copy of the input.

    Returns:
        None. Progress is reported with ``print``. If transcription yields no
        words, nothing is written.
    """
    words = transcribe_audio_with_word_timestamps(input_audio_path)
    if not words:
        print("No transcription available. Exiting.")
        return

    # Detect cuss words in the transcription
    cuss_words = [word for word in words if profanity.contains_profanity(word["text"])]
    print(f"Cuss words detected: {cuss_words}")

    if not cuss_words:
        print("No cuss words detected. Saving the original audio.")
        # Copy rather than rename: renaming would remove the caller's input
        # file (and fails across filesystems), which contradicts "saving" it.
        try:
            shutil.copyfile(input_audio_path, output_audio_path)
        except shutil.SameFileError:
            # Input and output are the same file; it is already in place.
            pass
        return

    bleeped_audio = bleep_audio_at_word_level(input_audio_path, cuss_words)
    bleeped_audio.export(output_audio_path, format="wav")
    print(f"Bleeped audio saved to {output_audio_path}")

# Example usage. Guarded so the (slow) transcription only runs when this code
# is executed directly -- as a script or a notebook cell -- and not when the
# functions above are imported from elsewhere.
if __name__ == "__main__":
    input_audio = "output_audio_GOT.wav"  # Replace with your audio file
    output_audio = "output_audio_censored2.wav"
    process_audio_with_word_level_timestamps(input_audio, output_audio)




Transcription with word-level timestamps successful!
Cuss words detected: [{'text': ' kill', 'start': 27.16, 'end': 27.6}, {'text': ' kill', 'start': 78.84, 'end': 79.48}, {'text': ' bastard', 'start': 84.42, 'end': 84.9}, {'text': ' murder,', 'start': 117.2, 'end': 117.56}]
Bleeped audio saved to output_audio_censored2.wav
