# 720 AI Assistant MVP — Video Pose Analysis (MediaPipe)

This notebook processes a **video** (not just photos):
- Runs **MediaPipe PoseLandmarker** per frame
- Saves an **annotated video** with landmarks
- Computes simple technique metrics and prints **insights/cues**

> Set `VIDEO_PATH` (in the "Run on a video" cell below) to your video's file name. Put the video in the same folder as the notebook, or use the full path to wherever it is uploaded/mounted.

In [1]:
import sys
# Record which Python interpreter/build this notebook was run with
# (the value is shown as the cell output).
sys.version

'3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]'

In [2]:
!pip install -q mediapipe opencv-python


[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install -q pandas


[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import urllib.request, os, pathlib

# Pose landmarker model bundle ("heavy" variant) and where it is cached locally.
MODEL_URL = "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task"
MODEL_PATH = "pose_landmarker_heavy.task"

if os.path.exists(MODEL_PATH):
    print("Model already exists:", MODEL_PATH)
else:
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted download would leave a truncated file at MODEL_PATH that the
    # existence check above would accept on the next run.
    _model_tmp_path = MODEL_PATH + ".part"
    try:
        urllib.request.urlretrieve(MODEL_URL, _model_tmp_path)
        os.replace(_model_tmp_path, MODEL_PATH)
    finally:
        # Only still present if the download or rename failed.
        if os.path.exists(_model_tmp_path):
            os.remove(_model_tmp_path)
    print("Downloaded:", MODEL_PATH)


Downloaded: pose_landmarker_heavy.task


In [4]:
# Drawing helper (MediaPipe official style)
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

def draw_landmarks_on_image(rgb_image, detection_result):
    """Return a copy of ``rgb_image`` with every detected pose drawn on it.

    Args:
        rgb_image: Image array to annotate. It is copied, never modified.
        detection_result: Object exposing ``pose_landmarks``, a list holding
            one landmark sequence per detected pose. Each landmark must
            provide ``x``, ``y`` and ``z``; a missing ``visibility`` is
            treated as 0.0.

    Returns:
        The annotated copy of the image.
    """
    canvas = np.copy(rgb_image)

    for single_pose in detection_result.pose_landmarks:
        # The drawing utility expects a NormalizedLandmarkList proto, so the
        # landmarks are converted one by one first.
        converted = [
            landmark_pb2.NormalizedLandmark(
                x=point.x,
                y=point.y,
                z=point.z,
                visibility=getattr(point, "visibility", 0.0),
            )
            for point in single_pose
        ]
        pose_proto = landmark_pb2.NormalizedLandmarkList()
        pose_proto.landmark.extend(converted)

        solutions.drawing_utils.draw_landmarks(
            canvas,
            pose_proto,
            solutions.pose.POSE_CONNECTIONS,
            solutions.drawing_styles.get_default_pose_landmarks_style(),
        )

    return canvas


## Run on a video

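1) Set `VIDEO_PATH`
2) Run the cell below — it writes:
- `annotated_output.mp4` (the input video with pose landmarks drawn on each frame where a pose was found)
- `metrics.csv` (one row of metrics per frame in which a pose was detected)
3) Run the cell after it to print the insights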

In [7]:
import cv2
import numpy as np
import pandas as pd
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from dataclasses import dataclass
from typing import Optional, Dict, List

# ========= 1) INPUT =========
# Input video and the two files this cell writes.
VIDEO_PATH = "swing720_attempt.mp4"   # <-- change this to your file
OUTPUT_VIDEO = "annotated_output.mp4"
OUTPUT_METRICS = "metrics.csv"

# ========= 2) PoseLandmarker (VIDEO mode) =========
# Short aliases for the MediaPipe Tasks classes used below.
BaseOptions = mp.tasks.BaseOptions
PoseLandmarker = mp.tasks.vision.PoseLandmarker
PoseLandmarkerOptions = mp.tasks.vision.PoseLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# MODEL_PATH is defined by the model-download cell earlier in the notebook.
# VIDEO running mode: frames are fed one at a time together with a timestamp
# (see the detect_for_video call in the processing loop). Only one pose is
# requested per frame.
options = PoseLandmarkerOptions(
    base_options=BaseOptions(model_asset_path=MODEL_PATH),
    running_mode=VisionRunningMode.VIDEO,
    num_poses=1
)
landmarker = PoseLandmarker.create_from_options(options)

# ========= 3) Landmark indices =========
# Positions in the per-pose landmark list that the metric code reads,
# named after the body part each one is used as (L_ = left, R_ = right).
NOSE = 0
L_SHOULDER, R_SHOULDER = 11, 12
L_ELBOW, R_ELBOW = 13, 14
L_WRIST, R_WRIST = 15, 16
L_HIP, R_HIP = 23, 24
L_ANKLE, R_ANKLE = 27, 28

def lm_xy(lms, idx):
    """Return the (x, y) coordinates of landmark ``idx`` as a float32 array."""
    point = lms[idx]
    coords = (point.x, point.y)
    return np.array(coords, dtype=np.float32)

def dist(a, b):
    """Return the norm of ``a - b`` (point-to-point distance) as a Python float."""
    offset = a - b
    length = np.linalg.norm(offset)
    return float(length)

@dataclass
class FrameMetrics:
    """Technique proxies computed for one video frame.

    All distance-based values are divided by the shoulder width measured in
    the same frame, so they are relative to the athlete's size in the image.
    """
    t: float              # frame timestamp in seconds (frame index / fps)
    head_drop: float      # proxy for "looking down" / chest collapsing: nose y minus shoulder-midpoint y (positive = nose below shoulders)
    elbow_tuck: float     # proxy for elbows in (lower is better): mean elbow-to-shoulder-midpoint distance
    arm_open: float       # proxy for arms open (higher is more open): mean wrist-to-same-side-shoulder distance
    torso_lean: float     # radians from vertical (higher = more inclined), measured on the hip-midpoint -> shoulder-midpoint line
    kick_lateral: float   # proxy for kick going to the side: summed horizontal offset of both ankles from the hip midpoint
    rot_proxy: float      # proxy for rotation per-frame (higher = faster): absolute change of the shoulder-line angle since the previous measured frame, in radians

def compute_frame_metrics(lms, t: float, prev_state: Optional[Dict]=None):
    """Derive the per-frame technique proxies from one pose's landmarks.

    Only the landmarks' (x, y) coordinates are used. Distances are divided by
    the shoulder width of the same frame so they are relative to body size.

    Args:
        lms: Landmark sequence for a single pose, indexable by the landmark
            index constants; each entry must expose ``x`` and ``y``.
        t: Timestamp stored unchanged on the returned metrics.
        prev_state: State dict returned by the previous call, or ``None`` for
            the first frame. Supplies the previous shoulder-line angle.

    Returns:
        ``(metrics, state)``: the ``FrameMetrics`` for this frame and the
        state dict to pass as ``prev_state`` on the next call.
    """
    nose_pt = lm_xy(lms, NOSE)
    sh_l, sh_r = lm_xy(lms, L_SHOULDER), lm_xy(lms, R_SHOULDER)
    el_l, el_r = lm_xy(lms, L_ELBOW), lm_xy(lms, R_ELBOW)
    wr_l, wr_r = lm_xy(lms, L_WRIST), lm_xy(lms, R_WRIST)
    hip_l, hip_r = lm_xy(lms, L_HIP), lm_xy(lms, R_HIP)
    ank_l, ank_r = lm_xy(lms, L_ANKLE), lm_xy(lms, R_ANKLE)

    sh_center = (sh_l + sh_r) / 2.0
    hip_center = (hip_l + hip_r) / 2.0

    # Normalisation length: the shoulder width, floored at 1e-6 so the
    # divisions below never divide by (almost) zero.
    scale = max(1e-6, dist(sh_l, sh_r))

    # 1) Head drop: how far the nose sits below the shoulder midpoint
    #    (image y grows downward, so positive means "below").
    nose_below_shoulders = nose_pt[1] - sh_center[1]
    head_drop = float(nose_below_shoulders / scale)

    # 2) Elbow tuck: mean distance of the elbows from the shoulder midpoint.
    elbow_spread = (dist(el_l, sh_center) + dist(el_r, sh_center)) / 2.0
    elbow_tuck = float(elbow_spread / scale)

    # 3) Arm open: mean distance of each wrist from its own shoulder.
    wrist_reach = (dist(wr_l, sh_l) + dist(wr_r, sh_r)) / 2.0
    arm_open = float(wrist_reach / scale)

    # 4) Torso lean: angle between the hip->shoulder line and the vertical.
    #    The absolute value makes the result independent of which way is up.
    spine = sh_center - hip_center
    spine_len = np.linalg.norm(spine) + 1e-6
    straight_up = np.array([0.0, -1.0], dtype=np.float32)
    alignment = abs(np.dot(spine / spine_len, straight_up))
    torso_lean = float(np.arccos(np.clip(alignment, 0, 1)))  # radians

    # 5) Kick lateral: summed horizontal offset of both ankles from the hips.
    ankle_offsets = abs(ank_l[0] - hip_center[0]) + abs(ank_r[0] - hip_center[0])
    kick_lateral = float(ankle_offsets / scale)

    # 6) Rotation proxy: change of the shoulder-line angle since the previous
    #    call, wrapped into [-pi, pi) before taking the magnitude.
    across = sh_r - sh_l
    shoulder_angle = float(np.arctan2(across[1], across[0]))
    if prev_state and "shoulder_angle" in prev_state:
        raw_change = shoulder_angle - prev_state["shoulder_angle"]
        wrapped = (raw_change + np.pi) % (2*np.pi) - np.pi
        rot_proxy = abs(float(wrapped))
    else:
        rot_proxy = 0.0

    metrics = FrameMetrics(
        t=t,
        head_drop=head_drop,
        elbow_tuck=elbow_tuck,
        arm_open=arm_open,
        torso_lean=torso_lean,
        kick_lateral=kick_lateral,
        rot_proxy=rot_proxy,
    )
    return metrics, {"shoulder_angle": shoulder_angle}

# ========= 4) Read video and write annotated output =========
# The capture, the writer and the landmarker all hold native resources. They
# are released in the `finally` block so that an error part-way through the
# video does not leave the output file open or the model loaded.
cap = cv2.VideoCapture(VIDEO_PATH)
writer = None

rows: List[dict] = []   # one metrics dict per frame in which a pose was found
prev_state = None       # carries the previous shoulder angle between frames
frame_idx = 0
pose_frames = 0

try:
    if not cap.isOpened():
        raise FileNotFoundError(f"Could not open video: {VIDEO_PATH}")

    # Fall back to 30 fps when the container reports no usable frame rate
    # (0, negative or NaN).
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 30.0
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (w, h))
    if not writer.isOpened():
        # Without this check a missing codec or unwritable path would be
        # ignored and "Saved:" would be printed for a file that was not written.
        raise RuntimeError(f"Could not open video writer: {OUTPUT_VIDEO}")

    while True:
        ok, frame_bgr = cap.read()
        if not ok:
            break

        # OpenCV decodes to BGR; MediaPipe expects RGB.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)

        # VIDEO mode needs a timestamp (ms) per frame, derived from the index.
        t_ms = int((frame_idx / fps) * 1000)
        result = landmarker.detect_for_video(mp_image, t_ms)

        if result.pose_landmarks and len(result.pose_landmarks) > 0:
            pose_frames += 1
            lms = result.pose_landmarks[0]
            t = frame_idx / fps
            m, prev_state = compute_frame_metrics(lms, t, prev_state)
            rows.append(vars(m))

            annotated_rgb = draw_landmarks_on_image(frame_rgb, result)
            annotated_bgr = cv2.cvtColor(annotated_rgb, cv2.COLOR_RGB2BGR)
            writer.write(annotated_bgr)
        else:
            # no pose -> write original frame
            writer.write(frame_bgr)

        frame_idx += 1
finally:
    cap.release()
    if writer is not None:
        writer.release()
    # A VIDEO-mode landmarker requires ever-increasing timestamps, so it cannot
    # be reused for another pass starting at 0; free it here. Re-running
    # this cell creates a fresh one.
    landmarker.close()

df = pd.DataFrame(rows)
df.to_csv(OUTPUT_METRICS, index=False)

print("Done.")
print("Frames:", frame_idx, "| Frames with pose:", pose_frames)
print("Saved:", OUTPUT_VIDEO)
print("Saved:", OUTPUT_METRICS)
df.head()


Done.
Frames: 106 | Frames with pose: 106
Saved: annotated_output.mp4
Saved: metrics.csv


Unnamed: 0,t,head_drop,elbow_tuck,arm_open,torso_lean,kick_lateral,rot_proxy
0,0.0,-0.242433,0.7606,0.464836,0.074764,0.838851,0.0
1,0.033333,-0.24617,0.74819,0.453997,0.075691,0.827802,0.002731
2,0.066667,-0.241542,0.70514,0.407603,0.078757,0.711475,0.007264
3,0.1,-0.217436,0.680718,0.385812,0.087183,0.648729,0.015019
4,0.133333,-0.226747,0.662409,0.368128,0.090305,0.621979,0.002655


In [12]:
import numpy as np
import pandas as pd

def pct_over(arr, thr):
    """Return the fraction of entries in ``arr`` strictly above ``thr`` (0.0 if empty)."""
    if not len(arr):
        return 0.0
    exceeds = arr > thr
    return float(np.mean(exceeds))

def insights_from_metrics(df: pd.DataFrame):
    """Turn per-frame metrics into coaching cues plus summary averages.

    Each cue is raised when a metric exceeds its threshold in more than a
    given share of the frames (or, for rotation, when its mean is too low).
    The thresholds are starting values meant to be tuned.

    Args:
        df: Frame metrics with the columns ``head_drop``, ``elbow_tuck``,
            ``arm_open``, ``torso_lean``, ``kick_lateral`` and ``rot_proxy``.
            May be ``None`` or empty when no pose was detected.

    Returns:
        ``(insights, summary)``: a list of message strings and a dict with the
        mean of every metric. For ``None``/empty input the list holds a single
        "no pose" message and the summary is an empty dict.

    Raises:
        KeyError: if a required column is missing from a non-empty ``df``.
    """
    if df is None or df.empty:
        # Return the same (insights, summary) pair as the normal path so that
        # callers can always unpack two values.
        return (
            ["No pose frames detected. Ensure full body is visible and the video is not too dark/blurred."],
            {},
        )

    head = df["head_drop"].to_numpy(np.float32)
    tuck = df["elbow_tuck"].to_numpy(np.float32)
    open_ = df["arm_open"].to_numpy(np.float32)
    lean = df["torso_lean"].to_numpy(np.float32)
    kick = df["kick_lateral"].to_numpy(np.float32)
    rot = df["rot_proxy"].to_numpy(np.float32)

    out = []

    # Starter thresholds (tune with your own videos)
    if pct_over(head, 0.10) > 0.35:
        out.append("❗ Head drops a lot (proxy): you likely look down / collapse chest. Cue: keep chin neutral, look forward.")
    if pct_over(tuck, 0.55) > 0.40:
        out.append("❗ Elbows not tucked (proxy). Cue: pin elbows to ribs during the spin.")
    if pct_over(open_, 0.85) > 0.45:
        out.append("❗ Arms open too early (proxy). Cue: stay closed longer before you open for landing.")
    if pct_over(lean, 0.60) > 0.30:
        out.append("❗ Torso too inclined (proxy). Cue: brace core, avoid throwing the chest down on takeoff.")
    if pct_over(kick, 1.00) > 0.35:
        out.append("❗ Kick goes too lateral (proxy). Cue: drive kick more backward/controlled, keep hips square.")
    if float(np.mean(rot)) < 0.02:
        out.append("❗ Rotation looks slow (proxy). Cue: earlier shoulder+hip snap at takeoff; close arms fast.")

    if not out:
        out.append("✅ No major flags from these simple proxies. Next step: detect phases (takeoff/airborne/landing) for more precise cues.")

    # Summary stats (helps tuning)
    summary = {
        "avg_head_drop": float(np.mean(head)),
        "avg_elbow_tuck": float(np.mean(tuck)),
        "avg_arm_open": float(np.mean(open_)),
        "avg_torso_lean_rad": float(np.mean(lean)),
        "avg_kick_lateral": float(np.mean(kick)),
        "avg_rot_proxy": float(np.mean(rot)),
    }

    return out, summary

# Compute the cues, then show them one per line followed by the metric averages.
ins, summary = insights_from_metrics(df)
print(*ins, sep="\n")
print("\nSummary:", summary)


❗ Elbows not tucked (proxy). Cue: pin elbows to ribs during the spin.
❗ Kick goes too lateral (proxy). Cue: drive kick more backward/controlled, keep hips square.

Summary: {'avg_head_drop': -0.34332185983657837, 'avg_elbow_tuck': 0.9731638431549072, 'avg_arm_open': 1.1120712757110596, 'avg_torso_lean_rad': 0.19476869702339172, 'avg_kick_lateral': 1.5466331243515015, 'avg_rot_proxy': 0.1301153600215912}
