# **Install Required Libraries:**



In [None]:
!pip install moviepy pyannote.audio soundfile pandas numpy speechrecognition pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


# **Logic of the code**




In [None]:
# Import required libraries
from moviepy.editor import AudioFileClip
from pyannote.audio import Pipeline
import os
import pandas as pd
import whisper  # OpenAI Whisper library

# Convert MP4 to WAV
def convert_mp4_to_wav(input_file, output_file):
    """Extract the audio track of a video file and save it as 16-bit PCM WAV.

    Args:
        input_file (str): Path to the source media file (e.g. an MP4).
        output_file (str): Path of the WAV file to write.
    """
    audio = AudioFileClip(input_file)
    try:
        audio.write_audiofile(output_file, codec="pcm_s16le")
    finally:
        # Release the clip even when writing fails, so the reader
        # does not stay open for the rest of the kernel session.
        audio.close()

# Perform Speaker Diarization
def perform_diarization(wav_file, hf_token):
    """Run the pyannote speaker-diarization pipeline on an audio file.

    Args:
        wav_file (str): Path to the audio file to diarize.
        hf_token (str): Hugging Face access token used to download the pipeline.

    Returns:
        list[dict]: One dict per speaker turn with the keys "start", "end"
        (taken from the turn's segment) and "speaker" (the pipeline's label).

    Raises:
        RuntimeError: If the pretrained pipeline could not be loaded.
    """
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
    )
    # A failed download of a gated/private pipeline may be reported by returning
    # None instead of raising; fail here with an actionable message rather than
    # a "'NoneType' object is not callable" error on the next line.
    if pipeline is None:
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'. Check that the "
            "Hugging Face token is valid and that the model's user conditions "
            "have been accepted."
        )

    diarization = pipeline(wav_file)
    # itertracks(yield_label=True) yields (segment, track, label) triples.
    return [
        {"start": turn.start, "end": turn.end, "speaker": label}
        for turn, _track, label in diarization.itertracks(yield_label=True)
    ]

# Transcribe Audio using OpenAI Whisper
def transcribe_audio_with_whisper(wav_file, language="ar", model_name="large"):
    """Transcribe an audio file with OpenAI Whisper.

    Args:
        wav_file (str): Path to the audio file to transcribe.
        language (str): Language code passed to Whisper. Defaults to "ar".
        model_name (str): Name of the Whisper checkpoint to load, e.g. "small",
            "medium" or "large". Defaults to "large", the value that was
            previously hard-coded.

    Returns:
        The "segments" entry of Whisper's transcription result (the
        timestamped segments).
    """
    # The checkpoint is loaded on every call.
    model = whisper.load_model(model_name)
    result = model.transcribe(wav_file, language=language)
    return result["segments"]

# Merge Diarization with Transcription
def merge_diarization_and_transcription(speaker_segments, whisper_segments):
    """Attach transcribed text to every diarized speaker turn.

    A transcription segment is attached to a speaker turn when their time
    ranges overlap (strictly). A transcription segment that overlaps several
    turns is therefore included in each of them.

    Args:
        speaker_segments: Sequence of dicts with "start", "end" and "speaker" keys.
        whisper_segments: Sequence of dicts with "start", "end" and "text" keys.

    Returns:
        list[dict]: One dict per speaker turn, in input order, with the keys
        "speaker_id", "start", "end" and "text".
    """
    merged_results = []
    for speaker_segment in speaker_segments:
        speaker_start = speaker_segment["start"]
        speaker_end = speaker_segment["end"]

        # Collect the text of every transcription segment overlapping this turn.
        overlapping_texts = [
            whisper_segment["text"]
            for whisper_segment in whisper_segments
            if whisper_segment["start"] < speaker_end
            and whisper_segment["end"] > speaker_start
        ]

        merged_results.append({
            "speaker_id": _speaker_display_name(speaker_segment["speaker"]),
            "start": speaker_start,
            "end": speaker_end,
            "text": " ".join(overlapping_texts).strip(),
        })
    return merged_results


def _speaker_display_name(speaker_label):
    """Turn a diarization label such as 'SPEAKER_02' into 'Speaker 3'."""
    label = str(speaker_label)
    prefix, _, suffix = label.rpartition("_")
    if prefix == "SPEAKER" and suffix.isdecimal():
        # Labels are zero-based; display names are one-based. Using the numeric
        # suffix keeps third and later speakers distinct instead of folding
        # every speaker other than SPEAKER_00 into "Speaker 2".
        return f"Speaker {int(suffix) + 1}"
    # Unknown label format: show the label itself rather than guess a number.
    return f"Speaker {label}"


mp4_file = "/content/drive/MyDrive/final_ara.mp4"  # Path to the MP4 file in Google Drive
wav_file = "audio.wav"

# Fail early with a clear message when the input is missing
# (for example because Google Drive has not been mounted yet).
if not os.path.isfile(mp4_file):
    raise FileNotFoundError(
        f"{mp4_file} not found. Mount Google Drive or point mp4_file at an existing file."
    )

# Step 1: Convert MP4 to WAV
convert_mp4_to_wav(mp4_file, wav_file)
print(f"Converted {mp4_file} to {wav_file}")

# Hugging Face token: read it from the HF_TOKEN environment variable, or prompt
# for it, so that the secret is never stored in the notebook itself.
import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Step 2: Perform Speaker Diarization
speaker_segments = perform_diarization(wav_file, hf_token)
print("Diarization Complete")

# Step 3: Transcribe Audio using OpenAI Whisper
whisper_segments = transcribe_audio_with_whisper(wav_file, language="ar")  # Arabic language
print("Transcription Complete")

# Step 4: Merge Diarization and Transcription
final_output = merge_diarization_and_transcription(speaker_segments, whisper_segments)

# Display the structured output (a bare last expression gives the rich table rendering)
output_df = pd.DataFrame(final_output)
output_df

MoviePy - Writing audio in audio.wav




MoviePy - Done.
Converted /content/drive/MyDrive/final_ara.mp4 to audio.wav


  std = sequences.std(dim=-1, correction=1)



Diarization Complete


100%|█████████████████████████████████████| 2.88G/2.88G [00:48<00:00, 63.9MiB/s]
  checkpoint = torch.load(fp, map_location=device)



Transcription Complete
  speaker_id     start       end                        text
0  Speaker 1  0.030969  2.444094     مرحباً، كيف حالك اليوم؟
1  Speaker 1  2.562219  5.549094  أهلاً، أنا بخير، ماذا عنك؟


# **Original Audio**

In [None]:
# Play back the original audio track of the MP4 for reference.

# NOTE: `files` is not used in this cell; the MP4 is read from Google Drive below.
from google.colab import files

from IPython.display import Audio
from moviepy.editor import AudioFileClip

# Path to the MP4 file in Google Drive
mp4_file = "/content/drive/MyDrive/final_ara.mp4"

# Step 1: Extract the audio and save it as a temporary WAV file
audio = AudioFileClip(mp4_file)
try:
    audio.write_audiofile("temp_audio.wav", codec="pcm_s16le")
finally:
    # Close the clip even if writing fails so the reader is not left open.
    audio.close()

# Step 2: Play the audio (must stay the last expression so the player is displayed)
Audio("temp_audio.wav")

MoviePy - Writing audio in temp_audio.wav


                                                        

MoviePy - Done.




## Noise Reduction 

In [None]:
import librosa
import noisereduce as nr
import soundfile as sf

def reduce_noise(input_file, output_file, noise_start=0.0, noise_duration=1.0):
    """
    Reduce noise in an audio file using spectral noise reduction.

    The noise profile is taken from a window of the recording that should
    contain background noise only. The defaults (the first second) match the
    previous hard-coded behaviour; move the window when the recording starts
    with speech.

    Args:
        input_file (str): Path to the input WAV file.
        output_file (str): Path to save the noise-reduced WAV file.
        noise_start (float): Start of the noise-only window, in seconds.
        noise_duration (float | None): Length of the noise-only window, in
            seconds. Pass None to supply no noise clip and let noisereduce
            estimate the noise from the signal itself.

    Raises:
        ValueError: If the window arguments are invalid, the file holds no
            samples, or the window lies outside the recording.
    """
    if noise_start < 0:
        raise ValueError("noise_start must be >= 0")
    if noise_duration is not None and noise_duration <= 0:
        raise ValueError("noise_duration must be > 0 or None")

    # Load audio file with its original sampling rate
    audio, sr = librosa.load(input_file, sr=None)
    if audio.size == 0:
        raise ValueError(f"{input_file} contains no audio samples")

    if noise_duration is None:
        noise_sample = None
    else:
        # Convert the window from seconds to sample indices.
        first = int(noise_start * sr)
        last = first + int(noise_duration * sr)
        noise_sample = audio[first:last]
        if noise_sample.size == 0:
            raise ValueError("The noise window lies outside the recording")

    # Apply noise reduction
    reduced_audio = nr.reduce_noise(y=audio, sr=sr, y_noise=noise_sample)

    # Save the processed audio to a new file
    sf.write(output_file, reduced_audio, sr)
    print(f"Noise-reduced audio saved to {output_file}")

# Example usage: denoise the WAV produced by the conversion step.
# input_file  -> WAV file to read
# output_file -> where the denoised WAV is written
input_file, output_file = "audio.wav", "audio_denoised.wav"
reduce_noise(input_file=input_file, output_file=output_file)
