# In [None]:
# Depression Detection Audio-Video Fusion with Gradio GUI (Colab Ready)
import tensorflow as tf
import numpy as np
import librosa
import cv2
import joblib
import tempfile
import os
import matplotlib.pyplot as plt
from moviepy.editor import VideoFileClip
import gradio as gr

# Load models
# The audio network, the frame-level video network, and the feature scaler
# applied to audio features before they reach the audio network.
audio_model = tf.keras.models.load_model(
    "/models/depression_model_finetuned.h5"
)
video_model = tf.keras.models.load_model(
    "/models/densenet201_depression_model.keras"
)
scaler = joblib.load("/models/scaler.pkl")

# Size every sampled video frame is resized to.
IMG_HEIGHT = 224
IMG_WIDTH = 224
# Default number of frames sampled from each video.
FRAMES_PER_VIDEO = 20
# Number of time frames every audio feature matrix is padded/truncated to.
MAX_PAD_LEN = 216
# Multiplier for the centred mean video score when it is added to the
# audio score.
VIDEO_INFLUENCE_WEIGHT = 0.4

def extract_frames(video_path, num_frames=FRAMES_PER_VIDEO):
    """Sample evenly spaced frames from a video as normalised RGB arrays.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: How many frames to sample between the first and the
            last frame of the clip.

    Returns:
        A float32 array with values in [0, 1]. For ``num_frames > 0`` its
        shape is ``(num_frames, IMG_HEIGHT, IMG_WIDTH, 3)``. Any individual
        frame that cannot be read is returned as an all-zero image.

    Raises:
        ValueError: If the video cannot be opened at all.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Without this check every read fails and the caller would
            # silently score a stack of black frames.
            raise ValueError(f"Could not open video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # If the backend reports a non-positive frame count, clamp the last
        # index so the sampled positions never become negative.
        last_index = max(total_frames - 1, 0)
        frame_indices = np.linspace(0, last_index, num_frames, dtype=int)

        frames = []
        for fid in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fid))
            ret, frame = cap.read()
            if not ret:
                # Best effort: keep the batch size fixed with a blank frame.
                frames.append(
                    np.zeros((IMG_HEIGHT, IMG_WIDTH, 3), dtype=np.uint8)
                )
                continue
            frame = cv2.resize(frame, (IMG_WIDTH, IMG_HEIGHT))
            # OpenCV decodes to BGR; convert to RGB channel order.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    finally:
        # Release the capture even if decoding or resizing raised.
        cap.release()

    return np.array(frames).astype('float32') / 255.0

def extract_audio_features(audio_path, max_pad_len=MAX_PAD_LEN):
    """Build a fixed-length MFCC + delta + delta-delta feature matrix.

    The audio is loaded at its native sampling rate (``sr=None``), 26 MFCCs
    are computed, and their first- and second-order deltas are stacked
    beneath them (78 feature rows in total). The time axis is zero-padded
    on the right up to ``max_pad_len`` frames, and anything longer is cut.

    Args:
        audio_path: Path of the audio file to load with librosa.
        max_pad_len: Number of time frames in the returned matrix.

    Returns:
        Array with time frames as rows and the 78 stacked features as
        columns, holding at most ``max_pad_len`` rows.
    """
    signal, sample_rate = librosa.load(audio_path, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=26)

    # Stack the coefficients with their first and second derivatives
    # along the feature axis.
    stacked = np.concatenate(
        (
            coeffs,
            librosa.feature.delta(coeffs),
            librosa.feature.delta(coeffs, order=2),
        ),
        axis=0,
    )

    # Right-pad the time axis with zeros when the clip is too short.
    missing = max_pad_len - stacked.shape[1]
    if missing < 0:
        missing = 0
    stacked = np.pad(stacked, ((0, 0), (0, missing)), mode='constant')

    # Put time first, then truncate clips that are too long.
    time_major = stacked.T
    return time_major[:max_pad_len]

def extract_audio_from_video(video_path):
    """Write a video's audio track to a temporary WAV file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.wav`` file. The caller owns the file
        and is responsible for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError(
                f"Video has no audio track to analyse: {video_path}"
            )

        # Reserve a unique file name, then close our own handle right away
        # so the audio writer can open the path itself (an open handle
        # prevents that on Windows).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio:
            audio_path = temp_audio.name

        try:
            clip.audio.write_audiofile(audio_path, logger=None)
        except BaseException:
            # Do not leave a partial temp file behind when writing fails.
            if os.path.exists(audio_path):
                os.remove(audio_path)
            raise
    finally:
        # Free the reader resources held by the clip.
        clip.close()

    return audio_path

def predict_fusion_gradio(video_file):
    """Predict a depression label for an uploaded video (Gradio callback).

    The audio score comes from ``audio_model`` applied to scaled audio
    features. The video score is the mean of ``video_model`` predictions
    over the sampled frames. The video score is centred at 0.5, scaled by
    ``VIDEO_INFLUENCE_WEIGHT``, added to the audio score, and the sum is
    clipped to [0, 1]. Scores of 0.5 or more are labelled "Depressed".

    Args:
        video_file: The upload as delivered by the Gradio ``File``
            component: a path string, a path-like object, or a
            tempfile-like object exposing the path as ``.name``. ``None``
            means nothing was uploaded.

    Returns:
        A one-line text summary with the audio, video and final scores and
        the resulting label, or a short notice when no file was uploaded.
    """
    if video_file is None:
        return "No video uploaded. Please upload an MP4 or FLV video."

    # Normalise whatever Gradio handed over to a plain filesystem path.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    audio_path = extract_audio_from_video(video_path)
    try:
        audio_features = extract_audio_features(audio_path)
    finally:
        # The temporary WAV is only needed for feature extraction; remove it
        # even when extraction fails so temp files do not pile up.
        os.unlink(audio_path)

    # Add a batch axis, then flatten to 2-D for the scaler and restore the
    # batched shape afterwards.
    audio_batch = audio_features[np.newaxis, ...]
    flat = audio_batch.reshape(-1, audio_batch.shape[2])
    audio_features_scaled = scaler.transform(flat).reshape(audio_batch.shape)
    audio_pred = audio_model.predict(audio_features_scaled)[0][0]

    frames = extract_frames(video_path)
    video_preds = video_model.predict(frames)
    video_emotion_mean = np.mean(video_preds)

    # The video only nudges the audio score up or down around 0.5.
    video_adjustment = VIDEO_INFLUENCE_WEIGHT * (video_emotion_mean - 0.5)
    final_pred = np.clip(audio_pred + video_adjustment, 0, 1)
    label = "Depressed" if final_pred >= 0.5 else "Not Depressed"
    return f"Audio: {audio_pred:.3f}, Video: {video_emotion_mean:.3f}, Final: {final_pred:.3f}, Label: {label}"

def launch_gradio():
    """Build the video-upload interface and start the Gradio app.

    The interface takes one uploaded file (``.mp4`` or ``.flv``), passes it
    to ``predict_fusion_gradio`` and displays the returned text. The app is
    launched with ``share=True`` and ``debug=True``.
    """
    upload_input = gr.File(
        file_types=[".mp4", ".flv"],
        label="Upload MP4 or FLV Video",
    )
    interface = gr.Interface(
        fn=predict_fusion_gradio,
        inputs=upload_input,
        outputs="text",
        title="Depression Detection",
        description="Upload a video (MP4/FLV) containing both audio and video. The model will predict depression status.",
    )
    interface.launch(share=True, debug=True)

# Start the Gradio app as soon as this cell/script is executed.
launch_gradio()