# **Curso IA desde Cero**

* Dr. Irvin Hussein L√≥pez Nava
* M.C. Joan M. Raygoza Romero

# Instalar las librerias necesarias para imagen, keypoints y transcripci√≥n de audio a texto

In [None]:
!pip install mediapipe opencv-python moviepy SpeechRecognition pydub
import warnings
warnings.filterwarnings("ignore")

## Extraer keypoints y features para el modelo de visi√≥n

In [None]:
import cv2
import numpy as np
import mediapipe as mp
import pickle

# =========================
# 1. Definici√≥n de keypoints
# =========================

IMPORTANT_KEYPOINTS = {
    # ---- REFERENCIAS / NORMALIZACI√ìN ----
    "nose_tip": 1,
    "nose_bridge": 6,
    "forehead_center": 10,
    "chin": 152,

    # ---- BOCA (EXTERNA E INTERNA) ----
    "mouth_left": 61,
    "mouth_right": 291,
    "mouth_upper_outer": 13,
    "mouth_lower_outer": 14,
    "mouth_upper_inner": 0,
    "mouth_lower_inner": 17,
    "mouth_left_inner": 40,
    "mouth_right_inner": 270,
    "mouth_corner_left_top": 84,
    "mouth_corner_left_bottom": 181,
    "mouth_corner_right_top": 314,
    "mouth_corner_right_bottom": 405,

    # ---- OJO IZQUIERDO ----
    "left_eye_outer": 33,
    "left_eye_inner": 133,
    "left_eye_upper": 159,
    "left_eye_lower": 145,
    "left_eye_upper_inner": 158,
    "left_eye_lower_inner": 153,
    "left_eye_upper_outer": 160,
    "left_eye_lower_outer": 144,

    # ---- OJO DERECHO ----
    "right_eye_outer": 263,
    "right_eye_inner": 362,
    "right_eye_upper": 386,
    "right_eye_lower": 374,
    "right_eye_upper_inner": 385,
    "right_eye_lower_inner": 380,
    "right_eye_upper_outer": 387,
    "right_eye_lower_outer": 373,

    # ---- CEJA IZQUIERDA ----
    "left_eyebrow_outer": 70,
    "left_eyebrow_middle": 105,
    "left_eyebrow_inner": 107,
    "left_eyebrow_lower_outer": 46,
    "left_eyebrow_lower_middle": 52,
    "left_eyebrow_lower_inner": 55,

    # ---- CEJA DERECHA ----
    "right_eyebrow_outer": 300,
    "right_eyebrow_middle": 334,
    "right_eyebrow_inner": 336,
    "right_eyebrow_lower_outer": 285,
    "right_eyebrow_lower_middle": 282,
    "right_eyebrow_lower_inner": 276
}

IMPORTANT_KEYPOINTS_IDX_LIST = list(IMPORTANT_KEYPOINTS.values())
KEY_NAMES = list(IMPORTANT_KEYPOINTS.keys())

# √≠ndices de los ojos para normalizaci√≥n
LEFT_EYE_INNER_IDX = IMPORTANT_KEYPOINTS["left_eye_inner"]
RIGHT_EYE_INNER_IDX = IMPORTANT_KEYPOINTS["right_eye_inner"]

# preparar face mesh
mp_face_mesh = mp.solutions.face_mesh

# =========================
# 2. Funciones auxiliares
# =========================

def extract_keypoints_from_frame(img_bgr, face_mesh,
                                normalization=True,
                                head_orientation=True):
    """
    img_bgr: frame en BGR (OpenCV)
    face_mesh: instancia de mp.solutions.face_mesh.FaceMesh
    Devuelve: vector 1D con coords [x1,y1,x2,y2,...] normalizadas,
              o None si no hay rostro.
    """
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(img_rgb)

    if not results.multi_face_landmarks:
        return None

    face_landmarks = results.multi_face_landmarks[0]

    # Extraer solo keypoints importantes
    important_landmarks = []
    for idx in IMPORTANT_KEYPOINTS_IDX_LIST:
        lm = face_landmarks.landmark[idx]
        important_landmarks.append([lm.x, lm.y])

    keypoints_array = np.array(important_landmarks)  # (N, 2)

    # Ojos para referencia (usamos inner)
    le_lm = face_landmarks.landmark[LEFT_EYE_INNER_IDX]
    re_lm = face_landmarks.landmark[RIGHT_EYE_INNER_IDX]
    left_eye  = np.array([le_lm.x, le_lm.y])
    right_eye = np.array([re_lm.x, re_lm.y])

    # 1) Centrar en el medio de los ojos
    anchor = (left_eye + right_eye) / 2
    keypoints_array = keypoints_array - anchor

    # 2) Escala (distancia entre ojos)
    if normalization:
        eye_vec  = right_eye - left_eye
        eye_dist = np.linalg.norm(eye_vec)
        if eye_dist < 1e-6:
            eye_dist = 1e-6
        keypoints_array = keypoints_array / eye_dist

    # 3) Rotaci√≥n (alinear ojos al eje X)
    if head_orientation:
        eye_vec = right_eye - left_eye
        angle = np.arctan2(eye_vec[1], eye_vec[0])
        R = np.array([
            [np.cos(-angle), -np.sin(-angle)],
            [np.sin(-angle),  np.cos(-angle)]
        ])
        keypoints_array = keypoints_array @ R.T

    return keypoints_array.flatten()

In [None]:
def distance(a, b):
    """Distancia euclidiana 2D simple."""
    return float(np.linalg.norm(a - b))


def build_feature_vector(keypoints):
    pts = keypoints.reshape(-1, 2)  # (n_points, 2)

    # Mapeo nombre -> punto (x,y)
    P = {name: pts[i] for i, name in enumerate(KEY_NAMES)}

    feats = []

    # ---------- BOCA ----------
    mouth_width = distance(P["mouth_left"], P["mouth_right"])
    mouth_open_outer = abs(P["mouth_upper_outer"][1] - P["mouth_lower_outer"][1])
    mouth_open_inner = abs(P["mouth_upper_inner"][1] - P["mouth_lower_inner"][1])

    # ‚Äúcurvatura‚Äù de la sonrisa (boca hacia arriba/abajo)
    mean_corner_y = 0.5 * (P["mouth_left"][1] + P["mouth_right"][1])
    smile_curvature = mean_corner_y - P["mouth_lower_outer"][1]

    feats += [
        mouth_width,
        mouth_open_outer,
        mouth_open_inner,
        smile_curvature,
    ]

    # ---------- OJOS ----------
    # ojo izquierdo
    left_eye_open = distance(P["left_eye_upper"], P["left_eye_lower"])
    left_eye_open_inner = distance(P["left_eye_upper_inner"], P["left_eye_lower_inner"])
    left_eye_open_outer = distance(P["left_eye_upper_outer"], P["left_eye_lower_outer"])

    # ojo derecho
    right_eye_open = distance(P["right_eye_upper"], P["right_eye_lower"])
    right_eye_open_inner = distance(P["right_eye_upper_inner"], P["right_eye_lower_inner"])
    right_eye_open_outer = distance(P["right_eye_upper_outer"], P["right_eye_lower_outer"])

    feats += [
        left_eye_open,
        left_eye_open_inner,
        left_eye_open_outer,
        right_eye_open,
        right_eye_open_inner,
        right_eye_open_outer,
    ]

    # ---------- CEJAS (altura respecto al ojo y pendiente) ----------
    left_eye_center = 0.5 * (P["left_eye_inner"] + P["left_eye_outer"])
    right_eye_center = 0.5 * (P["right_eye_inner"] + P["right_eye_outer"])

    left_brow_height = left_eye_center[1] - P["left_eyebrow_middle"][1]
    right_brow_height = right_eye_center[1] - P["right_eyebrow_middle"][1]

    # pendiente (inclinaci√≥n) de ceja: outer vs inner
    left_brow_slope = P["left_eyebrow_outer"][1] - P["left_eyebrow_inner"][1]
    right_brow_slope = P["right_eyebrow_outer"][1] - P["right_eyebrow_inner"][1]

    feats += [
        left_brow_height,
        right_brow_height,
        left_brow_slope,
        right_brow_slope,
    ]

    # ---------- PROPORCIONES CARA ----------
    # altura de cara: frente -> ment√≥n
    face_height = distance(P["forehead_center"], P["chin"])
    # longitud nariz: nose_bridge -> nose_tip
    nose_length = distance(P["nose_bridge"], P["nose_tip"])

    # algunos ratios que son invariantes de escala
    mouth_width_ratio = mouth_width / face_height
    nose_face_ratio = nose_length / face_height

    feats += [
        face_height,
        nose_length,
        mouth_width_ratio,
        nose_face_ratio,
    ]

    return np.array(feats)

## Predecir una emocion con los keypoints del video

In [None]:
def predict_emotion_from_video(clip, model, fps_sample=5):
    """
    Toma frames del video a una cierta frecuencia (fps_sample),
    obtiene keypoints y promedia las probabilidades.
    """
    probs_list = []
    with mp_face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    ) as face_mesh:

        times = np.arange(0, clip.duration, 1.0 / fps_sample)
        for t in times:
            frame_rgb = clip.get_frame(t)  # RGB (moviepy da RGB)
            kp_flat = extract_keypoints_from_frame(frame_rgb, face_mesh)
            if kp_flat is None:
                continue
            X = kp_flat.reshape(1, -1)
            X = build_feature_vector(X).reshape(1, -1)
            proba = img_model.predict_proba(X)[0]
            probs_list.append(proba)

    if not probs_list:
        return None  # no se pudo detectar rostro en ning√∫n frame

    probs_mean = np.mean(probs_list, axis=0)
    return probs_mean

## Preparar la transcripci√≥n del audio y extracci√≥n de embeddings del texto

In [None]:
from sentence_transformers import SentenceTransformer
import speech_recognition as sr

emb_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def transcribe_audio(audio_path, language="es-MX"):
    r = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = r.record(source)
    try:
        text = r.recognize_google(audio, language=language)
        return text
    except Exception as e:
        print("Error al transcribir:", e)
        return ""


def predict_emotion_from_text(text, model, emb_model):
    if not text.strip():
        return None
    embs = emb_model.encode(text)
    proba = model.predict_proba([embs])[0]
    return proba

## Cargar modelos

In [6]:
IMG_MODEL_PATH = "model.pkl"
IMG_LABEL_ENCODER_PATH = "le.pkl"
TEXT_MODEL_PATH = "model_text.pkl"
TEXT_LABEL_ENCODER_PATH = "le_text.pkl"

with open(IMG_MODEL_PATH, "rb") as f:
    img_model = pickle.load(f)

with open(IMG_LABEL_ENCODER_PATH, "rb") as f:
    img_le = pickle.load(f)

with open(TEXT_MODEL_PATH, "rb") as f:
    text_model = pickle.load(f)

with open(TEXT_LABEL_ENCODER_PATH, "rb") as f:
    text_le = pickle.load(f)

## Funcion para fusionar las probabilidades del video y texto

In [7]:
def fuse_probs(p_video, p_text, alpha=0.5):
    """
    alpha: peso para el video (0.0‚Äì1.0)
    """
    if p_video is None and p_text is None:
        return None
    if p_video is None:
        return p_text
    if p_text is None:
        return p_video
    return alpha * p_video + (1 - alpha) * p_text

## Capturar un video y obtener el audio

In [9]:
from google.colab import output
import numpy as np
import base64
from moviepy.editor import VideoFileClip

VIDEO_PATH = "captura.webm"

# Funci√≥n robusta para grabar video con audio en Colab
def grabar_video(segundos=10):
  js = f"""
    async function recordVideo() {{
      const stream = await navigator.mediaDevices.getUserMedia({{video: true, audio: true}});
      const options = {{mimeType: "video/webm;codecs=vp9,opus"}};

      const mediaRecorder = new MediaRecorder(stream, options);
      let chunks = [];

      mediaRecorder.ondataavailable = (e) => {{
        if (e.data.size > 0) chunks.push(e.data);
      }};

      mediaRecorder.start();

      // bot√≥n para detener
      const btn = document.createElement("button");
      btn.textContent = "‚èπÔ∏è DETENER";
      btn.style = "font-size:20px; margin:10px;";
      document.body.appendChild(btn);

      let stopped = false;
      btn.onclick = () => {{
        if (!stopped) {{
          mediaRecorder.stop();
          stopped = true;
        }}
      }};

      // tiempo m√°ximo
      await new Promise(resolve => setTimeout(resolve, {segundos * 1000}));

      if (!stopped) mediaRecorder.stop();

      await new Promise(resolve => mediaRecorder.onstop = resolve);

      document.body.removeChild(btn);
      stream.getTracks().forEach(t => t.stop());

      const blob = new Blob(chunks, {{type: "video/webm"}});
      const reader = new FileReader();

      return await new Promise(resolve => {{
        reader.onloadend = () => resolve(reader.result);
        reader.readAsDataURL(blob);
      }});
    }}

    recordVideo();
  """

  print("üé• Grabando... (m√°x", segundos, "seg)")
  data_url = output.eval_js(js)
  print("Finalizado, guardando archivo...")

  # Extraer solo el Base64
  base64_data = data_url.split(",")[1]

  # ARREGLAR PADDING DE BASE64 MANUALMENTE (la causa de tu error)
  missing_padding = len(base64_data) % 4
  if missing_padding != 0:
      base64_data += "=" * (4 - missing_padding)

  video_bytes = base64.b64decode(base64_data)

  with open(VIDEO_PATH, "wb") as f:
      f.write(video_bytes)

  print("üìÅ Archivo guardado como:", VIDEO_PATH)

# Ejecutar
grabar_video(10)

clip = VideoFileClip(VIDEO_PATH)
print("Duraci√≥n del video:", clip.duration, "segundos")

# Guardar audio a WAV
AUDIO_PATH = "audio.wav"
clip.audio.write_audiofile(AUDIO_PATH)
print("Audio guardado en", AUDIO_PATH)

üé• Grabando... (m√°x 10 seg)
Finalizado, guardando archivo...
üìÅ Archivo guardado como: captura.webm
Duraci√≥n del video: 9.93 segundos
MoviePy - Writing audio in audio.wav


                                                        

MoviePy - Done.
Audio guardado en audio.wav




## Probar ambos modelos

In [10]:
emotion_labels = text_le.classes_

# 1) Probabilidades desde el video (keypoints)
p_video = predict_emotion_from_video(clip, img_model, fps_sample=5)
print("Probabilidades desde VIDEO (keypoints):")
if p_video is not None:
    for label, p in zip(emotion_labels, p_video):
        print(f"  {label}: {p*100:.1f}%")
else:
    print("  No se pudo estimar (no se detect√≥ rostro).")

# 2) Probabilidades desde el texto (audio transcrito)
texto = transcribe_audio(AUDIO_PATH, language="es-MX")
print("\nTranscripci√≥n de audio:")
print(texto)

p_text = predict_emotion_from_text(texto, text_model, emb_model)
print("\nProbabilidades desde TEXTO:")
if p_text is not None:
    for label, p in zip(emotion_labels, p_text):
        print(f"  {label}: {p*100:.1f}%")
else:
    print("  No se pudo estimar (texto vac√≠o o error).")

# 3) Fusi√≥n
p_final = fuse_probs(p_video, p_text, alpha=0.5)

print("\n=== PREDICCI√ìN FINAL FUSIONADA ===")
if p_final is not None:
    for label, p in zip(emotion_labels, p_final):
        print(f"  {label}: {p*100:.1f}%")
    best_idx = np.argmax(p_final)
    print("\nEmoci√≥n final predicha:", emotion_labels[best_idx])
else:
    print("No se pudo obtener una predicci√≥n final.")

Probabilidades desde VIDEO (keypoints):
  angry: 76.7%
  happy: 11.2%
  neutral: 1.4%
  sad: 10.7%

Transcripci√≥n de audio:
S√≠ la verdad es que estoy muy triste porque me fue muy mal y pues ni modo es lo que hay as√≠ es la vida a veces se gana a veces se pierde no O sea que

Probabilidades desde TEXTO:
  angry: 0.2%
  happy: 0.2%
  neutral: 0.3%
  sad: 99.3%

=== PREDICCI√ìN FINAL FUSIONADA ===
  angry: 38.5%
  happy: 5.7%
  neutral: 0.9%
  sad: 55.0%

Emoci√≥n final predicha: sad
