### Setup

In [None]:
%pip install tensorflow tf-keras deepface opencv-python pyaudio ffmpeg SpeechRecognition openai-whisper transformers

In [None]:
import os
import tempfile
import threading

import speech_recognition as sr
import whisper
from transformers import pipeline
from deepface import DeepFace
import cv2

### Speech Recognition

In [None]:
def speech_to_text_whisper(audio, model=None):
    """
    Converts recorded audio into text using OpenAI's Whisper model.

    - Uses the supplied Whisper model, or loads the `medium` model when none is given.
    - Saves the recorded audio to a uniquely named temporary WAV file.
    - Uses Whisper to transcribe the saved audio.
    - Deletes the temporary file afterwards, even if transcription fails.

    Args:
        audio (sr.AudioData): The recorded audio data.
        model: Optional preloaded Whisper model (any object exposing
            `transcribe(path)`). Pass one to avoid reloading the model
            on every call. Defaults to loading `medium`.

    Returns:
        str: The transcribed text.
    """
    if model is None:
        model = whisper.load_model("medium")

    # A unique path avoids clobbering an existing "temp.wav" in the working
    # directory and keeps overlapping calls from sharing one file.
    fd, temp_filename = tempfile.mkstemp(suffix=".wav")
    try:
        # The handle is closed before transcription so the file can be
        # reopened by path.
        with os.fdopen(fd, "wb") as f:
            f.write(audio.get_wav_data())

        result = model.transcribe(temp_filename)
    finally:
        # Clean up whether or not writing/transcription succeeded.
        os.remove(temp_filename)

    return result["text"]

### Speech emotion

In [None]:
def get_emotion_scores(text):
    """
    Scores the emotions expressed in a piece of text.

    Builds a Hugging Face text-classification pipeline for the
    `j-hartmann/emotion-english-distilroberta-base` model, configured to
    return all scores, and runs it on `text`.

    Args:
        text (str): The text to classify.

    Returns:
        The pipeline's output for `text`, unchanged.
    """
    classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
    # Return the result: without this the function always yielded None.
    return classifier(text)

### Visual emotion thread

In [None]:
class EmotionDetectionThread(threading.Thread):
    """
    Background thread that reads webcam frames and records the dominant
    emotion detected in each one.

    Call `start()` to begin capturing, `stop()` to request shutdown, and
    `join()` to wait for the capture loop to finish. The emotion detected
    for every processed frame is appended to `recorded`, in order.
    """

    def __init__(self):
        super().__init__()
        self._stop_event = threading.Event()  # set by stop() to end the loop
        self.cap = None                       # cv2.VideoCapture, opened in run()
        self.recorded = []                    # one emotion label per processed frame

    def recognize_emotion(self, frame):
        """
        Returns the dominant emotion DeepFace reports for `frame`.

        Any failure is printed and reported as "unknown" so that a single
        bad frame does not end the capture loop.
        """
        try:
            result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
            # Accept either a list of per-face results or a single result dict.
            first = result[0] if isinstance(result, (list, tuple)) else result
            return first['dominant_emotion']
        except Exception as e:
            print(f"Emotion detection failed: {e}")
            return "unknown"

    def run(self):
        """
        Captures frames from webcam 0 until `stop()` is called or a frame
        cannot be read, recording and displaying the emotion for each frame.
        """
        self.cap = cv2.VideoCapture(0)
        if not self.cap.isOpened():
            print("Error: Webcam not accessible.")
            self.cap.release()
            return

        try:
            while not self._stop_event.is_set():
                ret, frame = self.cap.read()
                if not ret:
                    print("Failed to grab frame.")
                    break

                # Detect emotion
                emotion = self.recognize_emotion(frame)
                self.recorded.append(emotion)
                # Display emotion label
                cv2.putText(frame, f'Emotion: {emotion}', (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow('Webcam - Emotion Detection', frame)
                # imshow only repaints when the GUI event loop is pumped;
                # waitKey(1) does that without noticeably delaying capture.
                cv2.waitKey(1)
        finally:
            # Release the camera and close the window even if the loop raised.
            self.cap.release()
            cv2.destroyAllWindows()

    def stop(self):
        """Requests the capture loop to exit after the current frame."""
        self._stop_event.set()

In [None]:
def record_audio():
    """
    Records a single utterance from the default microphone.

    Calibrates the recognizer against ambient noise, then listens on the
    microphone and returns what `recognizer.listen` captured.

    Returns:
        The audio object returned by `recognizer.listen`.
    """
    recognizer = sr.Recognizer()
    # No webcam handle is opened here: the audio path never uses one, and an
    # extra capture on device 0 was never released.
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        print("Recording... Speak now!")
        audio = recognizer.listen(source)
        print("Recording complete.")
        return audio


In [None]:
# Run webcam emotion detection in the background while the microphone records.
emotion_thread = EmotionDetectionThread()
emotion_thread.start()
try:
    audio = record_audio()
finally:
    # Always stop and join the thread, even if recording fails; otherwise it
    # would keep running in the kernel after the error.
    emotion_thread.stop()
    emotion_thread.join()

# One emotion label per webcam frame processed during the recording.
print(emotion_thread.recorded)
try:
    whisper_text = speech_to_text_whisper(audio)
    print("Whisper Transcription:", whisper_text)
    print("emotion score:", get_emotion_scores(whisper_text))
except Exception as e:
    print("Whisper failed:", e)