<a href="https://colab.research.google.com/github/imsrija/MLproject/blob/main/Untitled37.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Python dependencies. %pip (rather than !pip) installs into the environment
# of the running kernel, and -q keeps the cell output short.
%pip install -q opencv-python-headless pydub librosa soundfile

# ffmpeg is a system binary used by pydub and by the audio-extraction step.
# -qq suppresses the long apt progress log.
!apt-get -qq update
!apt-get -qq install -y ffmpeg

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,059 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4,572 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages 

In [2]:
import cv2
import numpy as np
import os
import tempfile
import subprocess
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import librosa
import soundfile as sf
from google.colab import files

def extract_motion_video_and_voice(input_video_path, output_video_path, output_audio_path):
    """
    Run the audio pass and then the video pass over one input video.

    Args:
        input_video_path: Path of the video to analyse.
        output_video_path: Destination handed to ``extract_motion_video``.
        output_audio_path: Destination handed to ``extract_voice_only_audio``.

    Returns:
        dict with the keys ``motion_video`` and ``motion_timelines`` (the two
        values returned by ``extract_motion_video``), ``voice_audio`` (the
        ``output_audio_path`` argument, unchanged) and ``voice_segments`` (the
        value returned by ``extract_voice_only_audio``).
    """
    # Audio is processed before video; the order is kept from the original.
    segments = extract_voice_only_audio(input_video_path, output_audio_path)
    video_path, timelines = extract_motion_video(input_video_path, output_video_path)

    return dict(
        motion_video=video_path,
        motion_timelines=timelines,
        voice_audio=output_audio_path,
        voice_segments=segments,
    )

def extract_voice_only_audio(input_video_path, output_audio_path):
    """
    Extract the non-silent parts of a video's audio track into one WAV file.

    The audio track is dumped to a temporary WAV with ffmpeg, non-silent
    ranges are located with pydub's ``detect_nonsilent`` (silence = at least
    500 ms below -35 dB), and those ranges are concatenated and exported to
    ``output_audio_path``. A summary and the per-segment timeline are printed.

    Args:
        input_video_path: Path of the source video.
        output_audio_path: Path of the WAV file to write.

    Returns:
        list of ``(start_seconds, end_seconds)`` tuples, one per detected
        non-silent range, in the order returned by ``detect_nonsilent``.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status (for example the
            input has no audio stream or cannot be read).
    """
    print("\nExtracting and processing audio from video...")

    # Reserve a temp path and close our handle immediately so that ffmpeg can
    # overwrite the file and no descriptor is leaked.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_handle:
        temp_audio = tmp_handle.name

    try:
        # Extract audio from video using ffmpeg. -y comes first so it is
        # parsed as a global option (the temp file already exists).
        return_code = subprocess.call(
            ['ffmpeg', '-y', '-i', input_video_path, '-q:a', '0', '-map', 'a', temp_audio],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if return_code != 0:
            raise RuntimeError(
                f"ffmpeg failed (exit code {return_code}) while extracting audio from "
                f"{input_video_path!r}; the file may have no audio stream.")

        # Load audio file
        print("Loading audio and detecting voice segments...")
        audio = AudioSegment.from_file(temp_audio, format="wav")

        # Detect non-silent parts
        # min_silence_len: minimum silence length in ms
        # silence_thresh: silence threshold in dB
        non_silent_chunks = detect_nonsilent(audio,
                                             min_silence_len=500,  # 500ms
                                             silence_thresh=-35)   # -35dB

        # Convert to seconds for reporting
        voice_segments = [(start / 1000, end / 1000) for start, end in non_silent_chunks]

        print(f"Found {len(voice_segments)} voice segments")

        # Create a new audio file with only the voice segments
        output_audio = AudioSegment.empty()
        for start_ms, end_ms in non_silent_chunks:
            output_audio += audio[start_ms:end_ms]

        # Export the voice-only audio
        output_audio.export(output_audio_path, format="wav")

        # Calculate statistics (pydub lengths are in milliseconds)
        total_audio_duration = len(audio) / 1000
        voice_audio_duration = len(output_audio) / 1000
        # Guard against a zero-length track so the report never divides by zero.
        if total_audio_duration > 0:
            voice_percentage = (voice_audio_duration / total_audio_duration) * 100
        else:
            voice_percentage = 0.0

        print(f"Original audio duration: {total_audio_duration:.2f} seconds")
        print(f"Voice-only audio duration: {voice_audio_duration:.2f} seconds")
        print(f"Voice percentage: {voice_percentage:.2f}%\n")

        # Print timeline details
        print("Voice Segment Timelines:")
        for i, (start, end) in enumerate(voice_segments, 1):
            duration = end - start
            print(f"Voice {i}: {start:.2f}s - {end:.2f}s (Duration: {duration:.2f}s)")

        return voice_segments
    finally:
        # Always remove the temporary WAV, including when a step above raised.
        if os.path.exists(temp_audio):
            os.unlink(temp_audio)

def _stamp_original_time(frame, frame_index, fps, height):
    """Draw the source-video timestamp of ``frame_index`` on ``frame`` (in place) and return it."""
    cv2.putText(frame, f"Original Time: {frame_index / fps:.2f}s", (10, height - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
    return frame


def _motion_area(prev_gray, gray):
    """Return the summed contour area of the thresholded difference between two blurred grayscale frames."""
    frame_diff = cv2.absdiff(prev_gray, gray)
    thresh = cv2.threshold(frame_diff, 25, 255, cv2.THRESH_BINARY)[1]
    thresh = cv2.dilate(thresh, None, iterations=2)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return sum(cv2.contourArea(c) for c in contours)


def extract_motion_video(input_video_path, output_video_path):
    """
    Extract frames with significant motion using OpenCV only.

    The video is read once, front to back. A frame counts as a motion frame
    when the changed area relative to the previous frame exceeds
    ``MOVEMENT_THRESHOLD``. Each run of consecutive motion frames is written
    to the output together with up to ``BUFFER_FRAMES`` frames of context
    before and after it. Frames are written as soon as they are selected, so
    memory use does not grow with the length of the video, and every frame is
    stamped with its own position in the source video.

    Args:
        input_video_path: Path of the source video.
        output_video_path: Path of the mp4 file to write.

    Returns:
        tuple ``(output_video_path, motion_timelines)`` where
        ``motion_timelines`` is a list of dicts with the keys ``start_time``,
        ``end_time`` and ``duration`` (all in seconds of source video), one
        per run of motion frames.

    Raises:
        IOError: if the input video cannot be opened or the output writer
            cannot be created.
    """
    # Open input video
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open input video: {input_video_path}")

    out = None
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also true for NaN; some containers report no frame rate
            print("Warning: could not read the frame rate; assuming 30 fps")
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Container-reported count; only used for the progress display.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Video writer for motion frames
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise IOError(f"Could not open output video for writing: {output_video_path}")

        # Motion detection parameters
        MOVEMENT_THRESHOLD = 5000  # Adjust based on your video
        BUFFER_FRAMES = int(fps * 0.2)  # 0.2s of context before/after motion

        # Variables for tracking
        pre_roll = []          # (frame_index, frame) of recent frames not yet written
        post_roll_left = 0     # context frames still to write after the last motion
        prev_gray = None
        motion_in_progress = False
        motion_count = 0
        frame_count = 0        # index of the frame currently being processed
        frames_written = 0
        motion_timelines = []
        start_time = None
        end_time = None

        print("Processing video for motion...")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Convert to grayscale for motion detection
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray = cv2.GaussianBlur(gray, (21, 21), 0)

            # The very first frame has nothing to be compared with.
            is_motion_frame = (prev_gray is not None
                               and _motion_area(prev_gray, gray) > MOVEMENT_THRESHOLD)
            prev_gray = gray

            if is_motion_frame:
                if not motion_in_progress:
                    motion_in_progress = True
                    motion_count += 1
                    start_time = frame_count / fps

                    # Flush the context frames that precede this motion. They
                    # were never written before, so no frame is duplicated.
                    for buffered_index, buffered_frame in pre_roll:
                        out.write(_stamp_original_time(buffered_frame, buffered_index, fps, height))
                        frames_written += 1
                    pre_roll.clear()

                # Annotate and write the current frame. `gray` was already
                # derived from it, so drawing in place is safe.
                current_time = frame_count / fps
                cv2.putText(frame, f"Motion {motion_count} at {current_time:.2f}s", (width - 240, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
                _stamp_original_time(frame, frame_count, fps, height)
                out.write(frame)
                frames_written += 1
                end_time = current_time
            else:
                if motion_in_progress:
                    # The motion run just ended: record it and start the post-roll.
                    motion_timelines.append({
                        'start_time': start_time,
                        'end_time': end_time,
                        'duration': end_time - start_time
                    })
                    motion_in_progress = False
                    post_roll_left = BUFFER_FRAMES

                if post_roll_left > 0:
                    # Context frame right after a motion run.
                    out.write(_stamp_original_time(frame, frame_count, fps, height))
                    frames_written += 1
                    post_roll_left -= 1
                else:
                    # Remember it as possible context for the next motion run.
                    pre_roll.append((frame_count, frame))
                    if len(pre_roll) > BUFFER_FRAMES:
                        pre_roll.pop(0)

            # Progress tracking
            frame_count += 1
            if frame_count % 100 == 0:
                print(f"Processed {frame_count}/{total_frames} frames")

        # A motion run that lasts until the final frame still needs a timeline entry.
        if motion_in_progress:
            motion_timelines.append({
                'start_time': start_time,
                'end_time': end_time,
                'duration': end_time - start_time
            })
    finally:
        # Clean up, also when an error interrupted processing.
        cap.release()
        if out is not None:
            out.release()

    print(f"Wrote {frames_written} motion frames to output video.")

    # Print statistics, based on the number of frames actually decoded.
    input_duration = frame_count / fps
    output_duration = frames_written / fps
    motion_percentage = (output_duration / input_duration) * 100 if input_duration > 0 else 0.0
    print(f"\nVideo Processing Complete:")
    print(f"Input Duration: {input_duration:.2f}s")
    print(f"Motion Video Duration: {output_duration:.2f}s")
    print(f"Percentage of Motion Frames: {motion_percentage:.2f}%")

    # Print timelines
    print("\nMotion Timelines:")
    for i, timeline in enumerate(motion_timelines, 1):
        print(f"Motion {i}: {timeline['start_time']:.2f}s - {timeline['end_time']:.2f}s (Duration: {timeline['duration']:.2f}s)")

    return output_video_path, motion_timelines

def main():
    """
    Colab entry point: upload a video, process it, and download the results.

    Prompts for an upload through ``google.colab.files``, runs
    ``extract_motion_video_and_voice`` on the first uploaded file, and then
    triggers browser downloads of ``motion_video_only.mp4`` and
    ``voice_only_audio.wav``. Returns ``None``; if nothing was uploaded it
    prints a notice and returns without processing.
    """
    print("Please upload your interview video:")
    uploaded = files.upload()

    # files.upload() yields an empty mapping when the dialog is cancelled.
    if not uploaded:
        print("No file was uploaded; nothing to process.")
        return

    # Get the filename of the (first) uploaded video
    input_video_path = next(iter(uploaded))
    output_video_path = 'motion_video_only.mp4'
    output_audio_path = 'voice_only_audio.wav'

    # Extract motion frames with timelines and voice audio
    extract_motion_video_and_voice(input_video_path, output_video_path, output_audio_path)

    # Download the results
    files.download(output_video_path)
    files.download(output_audio_path)

    print("\nProcessing complete! Both files are available for download.")

# Run the main function
if __name__ == "__main__":
    main()

Please upload your interview video:


Saving WhatsApp Video 2025-04-02 at 12.59.30 AM.mp4 to WhatsApp Video 2025-04-02 at 12.59.30 AM.mp4

Extracting and processing audio from video...
Loading audio and detecting voice segments...
Found 31 voice segments
Original audio duration: 127.39 seconds
Voice-only audio duration: 90.52 seconds
Voice percentage: 71.06%

Voice Segment Timelines:
Voice 1: 0.00s - 4.62s (Duration: 4.62s)
Voice 2: 5.20s - 5.76s (Duration: 0.56s)
Voice 3: 7.93s - 8.37s (Duration: 0.44s)
Voice 4: 9.33s - 12.64s (Duration: 3.31s)
Voice 5: 14.50s - 14.54s (Duration: 0.04s)
Voice 6: 16.83s - 21.54s (Duration: 4.71s)
Voice 7: 22.54s - 26.29s (Duration: 3.75s)
Voice 8: 27.41s - 30.36s (Duration: 2.96s)
Voice 9: 31.96s - 35.18s (Duration: 3.22s)
Voice 10: 35.92s - 38.27s (Duration: 2.36s)
Voice 11: 39.74s - 41.34s (Duration: 1.60s)
Voice 12: 42.37s - 46.15s (Duration: 3.78s)
Voice 13: 47.52s - 55.57s (Duration: 8.04s)
Voice 14: 56.25s - 56.47s (Duration: 0.22s)
Voice 15: 57.12s - 63.56s (Duration: 6.43s)
Voice 1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing complete! Both files are available for download.
