In [1]:
# %pip install vosk   # uncomment and run once if vosk is not installed in this kernel's environment

In [1]:
import cv2
import numpy as np
import moviepy.editor as mp
from pydub import AudioSegment
from pydub.playback import play
from vosk import Model, KaldiRecognizer

In [4]:
# Input video to analyse, and the WAV file that its audio track is written to.
video_path, audio_path = "sample_video2.mp4", "audio.wav"

#### Extract audio from video

In [5]:
def extract_audio(video_path, audio_path):
    """Extract the audio track of a video file and write it to ``audio_path``.

    Args:
        video_path: Path of the video file to open with MoviePy.
        audio_path: Destination file, passed to ``write_audiofile``.

    Raises:
        ValueError: If the video clip has no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        # MoviePy leaves ``audio`` as None when the file has no audio stream;
        # fail with a clear message instead of an AttributeError further down.
        if audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release the readers the clip keeps open, even on failure.
        video.close()

In [6]:
# Write the video's audio track to the configured WAV path.
extract_audio(video_path=video_path, audio_path=audio_path)

<moviepy.audio.io.AudioFileClip.AudioFileClip object at 0x000002A9A4512310>
MoviePy - Writing audio in audio2.wav


                                                                                                                       

MoviePy - Done.




#### Detect faces in video

In [None]:
def detect_faces(video_path):
    """Play a video in an OpenCV window with detected frontal faces outlined.

    Each frame is converted to grayscale, run through OpenCV's bundled
    frontal-face Haar cascade, and shown with a green rectangle around every
    detection. Playback stops at the end of the video or when ``q`` is pressed.

    Args:
        video_path: Path of the video file to open.

    Raises:
        IOError: If the Haar cascade file or the video cannot be opened.
    """
    cascade_file = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    face_cascade = cv2.CascadeClassifier(cascade_file)
    if face_cascade.empty():
        raise IOError(f"Could not load Haar cascade: {cascade_file}")

    video_capture = cv2.VideoCapture(video_path)
    try:
        # A bad path would otherwise make the loop exit silently on the first read.
        if not video_capture.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        while True:
            ret, frame = video_capture.read()
            if not ret:
                # No more frames (or a read failure): stop playback.
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            for (x, y, w, h) in faces:
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

            cv2.imshow('Video', frame)

            # waitKey also services the GUI event loop; 'q' ends playback early.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the window even if a frame raised.
        video_capture.release()
        cv2.destroyAllWindows()

In [None]:
# Opens an OpenCV window; press 'q' in that window to stop early.
detect_faces(video_path=video_path)

#### Detect people in video (YOLOv3 object detection)

In [9]:
def _boxes_for_class(outs, width, height, target_id, conf_threshold):
    """Return pixel-space ``(x, y, w, h)`` boxes for detections of one class.

    Assumes each array in ``outs`` is 2-D with rows laid out as
    ``[cx, cy, w, h, objectness, class scores...]`` in coordinates relative to
    the frame size, which is how the rows are sliced below.
    """
    boxes = []
    for out in outs:
        scores = out[:, 5:]
        class_ids = scores.argmax(axis=1)
        confidences = scores[np.arange(len(scores)), class_ids]
        keep = (confidences > conf_threshold) & (class_ids == target_id)
        for cx, cy, bw, bh in out[keep, :4]:
            center_x = int(cx * width)
            center_y = int(cy * height)
            w = int(bw * width)
            h = int(bh * height)
            # Convert the box centre to its top-left corner for cv2.rectangle.
            boxes.append((int(center_x - w / 2), int(center_y - h / 2), w, h))
    return boxes


def detect_images(video_path, target_class="person", conf_threshold=0.5):
    """Play a video in an OpenCV window with YOLOv3 detections outlined.

    Loads ``yolov3.weights``, ``yolov3.cfg`` and ``coco.names`` from the
    current working directory, runs the network on every frame and draws a
    green rectangle around each detection of ``target_class`` whose best class
    score exceeds ``conf_threshold``. Playback stops at the end of the video
    or when ``q`` is pressed.

    Args:
        video_path: Path of the video file to open.
        target_class: Class name (as listed in ``coco.names``) to draw.
        conf_threshold: Minimum class score for a detection to be drawn.

    Raises:
        ValueError: If ``target_class`` is not listed in ``coco.names``.
        IOError: If the video cannot be opened.
    """
    net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f]

    # Look the class id up instead of assuming its position in the names file.
    try:
        target_id = classes.index(target_class)
    except ValueError:
        raise ValueError(f"Class {target_class!r} not found in coco.names") from None

    # Works across OpenCV versions, unlike indexing getLayerNames() with the
    # result of getUnconnectedOutLayers(), whose shape changed between releases.
    output_layers = net.getUnconnectedOutLayersNames()

    cap = cv2.VideoCapture(video_path)
    try:
        # A bad path would otherwise make the loop exit silently on the first read.
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            height, width = frame.shape[:2]

            # Scale pixels by ~1/255, resize to 416x416, swap the R and B channels.
            blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
            net.setInput(blob)
            outs = net.forward(output_layers)

            for x, y, w, h in _boxes_for_class(outs, width, height, target_id, conf_threshold):
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            cv2.imshow("Video", frame)

            # waitKey also services the GUI event loop; 'q' ends playback early.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the window even if a frame raised.
        cap.release()
        cv2.destroyAllWindows()

In [None]:
# Opens an OpenCV window; press 'q' in that window to stop early.
detect_images(video_path=video_path)

In [26]:
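# NOTE(review): disabled draft, kept for reference. "vosk-model-spk-0.4" looks like a
# speaker-embedding model; Vosk's speaker example loads such models with SpkModel and
# attaches them to a recognizer built on a regular ASR Model — confirm before re-enabling.
# def diarize_speakers(audio_path):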
#     model = Model("vosk-model-spk-0.4")
#     audio = AudioSegment.from_wav(audio_path)
#     recognizer = KaldiRecognizer(model, audio.frame_rate)

#     # Split the audio into chunks and process each chunk
#     chunk_size = 10  
#     for i in range(0, len(audio.raw_data), chunk_size * audio.frame_width):
#         chunk = audio.raw_data[i:i + chunk_size * audio.frame_width]
#         if recognizer.AcceptWaveform(chunk):
#             result = recognizer.Result()
#             print(result)

#     # Finalize the recognition
#     result = recognizer.FinalResult()
#     print(result)

In [None]:
# diarize_speakers(audio_path)