Na podstawie
https://stackoverflow.com/questions/14140495/how-to-capture-a-video-and-audio-in-python-from-a-camera-or-webcam

In [1]:
#Wczytujemy pakiety
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
import speech_recognition as sr

In [2]:
#Rejestrowanie video
class VideoRecorder():
    """Record webcam frames with OpenCV into ``temp_video.avi`` on a worker thread.

    Usage: create the object, call ``start()`` to begin recording in the
    background, and call ``stop()`` to end recording and release the camera,
    the writer and the preview window.

    Attributes read by callers:
        frame_counts: counter incremented once per written frame.
        start_time:   ``time.time()`` taken at the end of ``__init__``.
        fps:          frame rate passed to the video writer.
    """

    # Video class based on openCV
    def __init__(self):
        """Open capture device 0 and create the AVI writer."""

        self.open = True
        self.device_index = 0
        self.fps = 6               # fps should be the minimum constant rate at which the camera can
        self.fourcc = "MJPG" # "XVID"       # capture images (with no decrease in speed over time; testing is required)
        self.frameSize = (640,480) # video formats and sizes also depend and vary according to the camera used
        self.video_filename = "temp_video.avi"
        self.video_cap = cv2.VideoCapture(self.device_index)
        self.video_writer = cv2.VideoWriter_fourcc(*self.fourcc)
        self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, self.frameSize)
        # NOTE(review): the counter starts at 1, so it is one higher than the
        # number of frames actually written - confirm whether that is intended.
        self.frame_counts = 1
        self.start_time = time.time()
        # Worker thread created by start(); stop() joins it before releasing resources.
        self._thread = None


    # Video starts being recorded
    def record(self):
        """Grab, write and preview frames until ``self.open`` is cleared or a read fails."""

        while self.open:

            ret, video_frame = self.video_cap.read()

            if not ret:
                # The capture device did not deliver a frame: end the loop.
                break

            self.video_out.write(video_frame)
            self.frame_counts += 1
            time.sleep(0.16)
            cv2.imshow('video_frame', video_frame)
            cv2.waitKey(1)

    def stop(self):
        """Stop recording and release the writer, the camera and the preview window.

        Calling it again after the recorder has been stopped does nothing.
        """

        if not self.open:
            return

        self.open = False

        # Wait for the recording loop to finish its current iteration so the
        # writer and the capture device are not released while still in use.
        worker = getattr(self, "_thread", None)
        if worker is not None and worker is not threading.current_thread():
            worker.join()

        self.video_out.release()
        self.video_cap.release()
        cv2.destroyAllWindows()

    def start(self):
        """Run ``record`` on a new thread and remember the thread for ``stop()``."""
        self._thread = threading.Thread(target=self.record)
        self._thread.start()


In [3]:
#Nagrywanie dzwieku
class AudioRecorder():
    """Record microphone input with PyAudio on a worker thread and save it as a WAV file.

    Usage: create the object, call ``start()`` to begin capturing in the
    background, and call ``stop()`` to end capturing and write
    ``temp_audio.wav``.
    """

    def __init__(self):
        """Open a 2-channel, 44100 Hz, 16-bit PyAudio input stream."""

        self.open = True
        self.rate = 44100
        self.frames_per_buffer = 1024
        self.channels = 2
        self.format = pyaudio.paInt16
        self.audio_filename = "temp_audio.wav"
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(format=self.format,
                                      channels=self.channels,
                                      rate=self.rate,
                                      input=True,
                                      frames_per_buffer = self.frames_per_buffer)
        self.audio_frames = []
        # Worker thread created by start(); stop() joins it before closing the stream.
        self._thread = None

    def record(self):
        """Read buffers from the stream into ``audio_frames`` until ``self.open`` is cleared."""

        self.stream.start_stream()
        while self.open:
            data = self.stream.read(self.frames_per_buffer)
            self.audio_frames.append(data)

    def stop(self):
        """Stop capturing, close the stream and write the collected frames to the WAV file.

        Calling it again after the recorder has been stopped does nothing.
        """

        if not self.open:
            return

        self.open = False

        # Let the capture loop finish its pending read before the stream is
        # closed, so the stream is never closed while a read is in progress.
        worker = getattr(self, "_thread", None)
        if worker is not None and worker is not threading.current_thread():
            worker.join()

        # Query the sample width while the PyAudio instance is still alive.
        sample_width = self.audio.get_sample_size(self.format)

        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

        # The context manager closes the file even if writing fails.
        with wave.open(self.audio_filename, 'wb') as wave_file:
            wave_file.setnchannels(self.channels)
            wave_file.setsampwidth(sample_width)
            wave_file.setframerate(self.rate)
            wave_file.writeframes(b''.join(self.audio_frames))

    def start(self):
        """Run ``record`` on a new thread and remember the thread for ``stop()``."""
        self._thread = threading.Thread(target=self.record)
        self._thread.start()


In [4]:
#Funkcje uruchamiajace i zatrzymujace nagrywanie

def start_AVrecording(filename):
    """Create the video and audio recorders and start both, audio first.

    The recorder objects are stored in the module-level globals
    ``video_thread`` and ``audio_thread``. ``filename`` is not used here
    and is returned unchanged.
    """
    global video_thread, audio_thread

    # Construct video before audio, then start audio before video.
    video_thread = VideoRecorder()
    audio_thread = AudioRecorder()

    for recorder in (audio_thread, video_thread):
        recorder.start()

    return filename

def _run_ffmpeg(args):
    """Run ffmpeg with an argument list (no shell) and report failures on stdout.

    Returns the exit status, or None when ffmpeg could not be started.
    """
    try:
        status = subprocess.call(["ffmpeg"] + list(args))
    except OSError as error:
        # E.g. ffmpeg is not installed or not on PATH.
        print("Could not run ffmpeg: " + str(error))
        return None
    if status != 0:
        print("ffmpeg exited with status " + str(status))
    return status


def stop_AVrecording(filename):
    """Stop both recorders and mux ``temp_audio.wav`` with the video into ``<filename>.avi``.

    Uses the module-level ``audio_thread`` and ``video_thread`` recorders.
    If the measured frame rate differs from the recorder's target fps, the
    video is first re-encoded into ``temp_video2.avi`` at the target fps.

    Args:
        filename: output name without extension. It is passed to ffmpeg as a
            single argument, so spaces and shell metacharacters are safe.
    """

    audio_thread.stop()
    frame_counts = video_thread.frame_counts
    elapsed_time = time.time() - video_thread.start_time
    recorded_fps = frame_counts / elapsed_time
    print("total frames " + str(frame_counts))
    print("elapsed time " + str(elapsed_time))
    print("recorded fps " + str(recorded_fps))
    video_thread.stop()

    # Use the rate the recorder was configured with; fall back to 6 for
    # recorder objects that do not expose ``fps``.
    target_fps = getattr(video_thread, "fps", 6)

    if abs(recorded_fps - target_fps) >= 0.01:    # If the fps rate was higher/lower than expected, re-encode it to the expected

        print("Re-encoding")
        _run_ffmpeg(["-r", str(recorded_fps), "-i", "temp_video.avi",
                     "-pix_fmt", "yuv420p", "-r", str(target_fps), "temp_video2.avi"])
        video_source = "temp_video2.avi"
        reencoded = True
        print("Muxing")

    else:

        video_source = "temp_video.avi"
        reencoded = False
        print("Normal recording\nMuxing")

    _run_ffmpeg(["-ac", "2", "-channel_layout", "stereo", "-i", "temp_audio.wav",
                 "-i", video_source, "-pix_fmt", "yuv420p", filename + ".avi"])

    if not reencoded:
        print("..")


In [5]:
#Kasowanie obecnych plikow
def file_manager(filename):
    """Remove leftover recording files from the current working directory.

    Deletes ``temp_audio.wav``, ``temp_video.avi``, ``temp_video2.avi`` and
    ``<filename>.avi`` when they exist; missing files are skipped.
    """
    local_path = os.getcwd()

    def discard(name):
        # Delete one file under the working directory if it is present.
        target = str(local_path) + "/" + name
        if os.path.exists(target):
            os.remove(target)

    for leftover in ("temp_audio.wav", "temp_video.avi", "temp_video2.avi"):
        discard(leftover)

    discard(filename + ".avi")

In [6]:
#Rozpoczecie nagrywania

# Base name (without extension) of the final muxed recording.
filename = "Default_user_3"

# Remove files left over from a previous run before recording again.
file_manager(filename)

start_AVrecording(filename)

# Recording continues in the background until the user presses Enter.
input(
    "Please speak till you're done. "
    "Then, press Enter to accomplish recording..."
)

stop_AVrecording(filename)

print("Done")

Please speak till you're done. Then, press Enter to accomplish recording...
total frames 40
elapsed time 9.585766792297363
recorded fps 4.1728534468564344
Re-encoding
Muxing
Done


In [7]:
#Wydobycie slow z nagranego tekstu

# Transcribe the recorded audio file with the Google Web Speech API (Polish).
local_path = os.getcwd()

r = sr.Recognizer()
mic = sr.AudioFile(os.path.join(local_path, 'temp_audio.wav'))

with mic as source:
    r.adjust_for_ambient_noise(source)
    # NOTE(review): per the SpeechRecognition docs listen() captures a single
    # phrase, while record() reads the whole source - confirm which is
    # intended for a pre-recorded file.
    audio = r.listen(source)

# Defined up front so later cells can rely on it even when recognition fails.
words = []

try:
    # One request only: the transcript is reused for printing and splitting.
    text = r.recognize_google(audio, language="pl-PL")
except sr.UnknownValueError:
    # The service could not understand the audio.
    print("Sorry, I did not get that")
except sr.RequestError as error:
    # The service could not be reached or rejected the request.
    print("Speech recognition request failed: " + str(error))
else:
    words = text.split()
    print("Text: " + text)
    print(words)

Text: test aplikacji kolejny
['test', 'aplikacji', 'kolejny']
