In [4]:
!pip install mediapipe==0.10.20


Collecting mediapipe==0.10.20
  Downloading mediapipe-0.10.20-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe==0.10.20)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe==0.10.20)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.20)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. This could take a while.
Collecting jax (from mediapipe==0.10.20)
  Downloading jax-0.8.0-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe==0.10.20)
  Downloading jaxlib-0.8.0-cp312-cp312-manylinux_2_27_x86_64.whl.

In [1]:
import os, sys, math, random, shutil, zipfile, glob, time
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
import mediapipe as mp
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter, defaultdict



In [6]:
ZIP_VIDEOS = "/content/drive/MyDrive/Colab Notebooks/words_31.zip"    # path to zip containing all videos (flat)
CSV_PATH    = "/content/adjectives_32.csv"  # path to your adjectives_32.csv (auto-detected)
OUT_ROOT    = "/content/pipeline_output"  # results folder
FPS         = 10        # frames per second to extract
T_SEQ       = 32        # frames per sequence
MIN_CLASS_SAMPLES = 1   # keep >=1 samples per class for now (you can bump to 5 if you want)
SEED = 42
BATCH_SIZE = 32
EPOCHS = 30
LR = 3e-4

In [7]:
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.makedirs(OUT_ROOT, exist_ok=True)

print("Config:")
print(" ZIP_VIDEOS:", ZIP_VIDEOS)
print(" CSV_PATH   :", CSV_PATH)
print(" OUT_ROOT   :", OUT_ROOT)
print(" FPS        :", FPS, "T_SEQ:", T_SEQ)

Config:
 ZIP_VIDEOS: /content/drive/MyDrive/Colab Notebooks/words_31.zip
 CSV_PATH   : /content/adjectives_32.csv
 OUT_ROOT   : /content/pipeline_output
 FPS        : 10 T_SEQ: 32


In [4]:
# ------------------- 1) Unzip videos (if zipped) -------------------
VIDEOS_DIR = os.path.join(OUT_ROOT, "videos")
if os.path.exists(VIDEOS_DIR):
    print("Videos dir exists, using:", VIDEOS_DIR)
else:
    os.makedirs(VIDEOS_DIR, exist_ok=True)
    if os.path.exists(ZIP_VIDEOS):
        print("Unzipping videos...")
        with zipfile.ZipFile(ZIP_VIDEOS, 'r') as z:
            z.extractall(VIDEOS_DIR)
        print("Unzipped to", VIDEOS_DIR)
    else:
        raise FileNotFoundError(f"ZIP_VIDEOS not found: {ZIP_VIDEOS}")

Unzipping videos...
Unzipped to /content/pipeline_output/videos


In [8]:
# ------------------- 2) Load CSV and auto-detect columns -------------------
df_csv = pd.read_csv(CSV_PATH)
print("CSV loaded. Columns:", df_csv.columns.tolist())

# auto-detect likely column names
col_word = None
col_singer = None
col_file = None
for c in df_csv.columns:
    cl = c.lower()
    if "word" in cl or "word_name" in cl or "wordname" in cl:
        col_word = c
    if "singer" in cl or "signer" in cl or "person" in cl or "speaker" in cl:
        col_singer = c
    if "file" in cl or "filename" in cl or "file_name" in cl or "path" in cl:
        col_file = c
# if not found, try common names
if col_word is None:
    for c in df_csv.columns:
        if any(k in c.lower() for k in ["name","label"]):
            col_word = c; break
if col_singer is None:
    for c in df_csv.columns:
        if any(k in c.lower() for k in ["id","singer","signer","person"]):
            col_singer = c; break
if col_file is None:
    for c in df_csv.columns:
        if any(k in c.lower() for k in ["file","video","path","fname"]):
            col_file = c; break

if not (col_word and col_singer and col_file):
    raise ValueError(f"Could not auto-detect columns. Found: word={col_word}, singer={col_singer}, file={col_file}. CSV columns: {list(df_csv.columns)}")

print("Using columns -> word:", col_word, "singer:", col_singer, "file:", col_file)
df_csv = df_csv[[col_word, col_singer, col_file]].rename(columns={col_word:"word", col_singer:"singer_id", col_file:"file_name"})

# Trim whitespace & cast to str
df_csv['word'] = df_csv['word'].astype(str).str.strip()
df_csv['file_name'] = df_csv['file_name'].astype(str).str.strip()
df_csv['singer_id'] = df_csv['singer_id'].astype(str).str.strip()

# Build mapping from file_name -> absolute path by searching VIDEOS_DIR (flat)
print("Indexing video files (this may take a moment)...")
video_index = {}
for root, dirs, files in os.walk(VIDEOS_DIR):
    for f in files:
        if f.lower().endswith(('.mp4','.mov','.avi','.mkv','.webm')):
            video_index[f] = os.path.join(root, f)

missing = []
paths = []
for i, r in df_csv.iterrows():
    fname = os.path.basename(r.file_name)
    if fname in video_index:
        paths.append(video_index[fname])
    else:
        missing.append(fname)
        paths.append(None)

df_csv['video_path'] = paths
n_missing = sum(p is None for p in paths)
print("Video matches:", len(df_csv)-n_missing, "missing:", n_missing)
if n_missing>0:
    print("First 10 missing examples:", list(dict.fromkeys(missing))[:10])
    # don't fail — keep only matched rows
    df_csv = df_csv[df_csv['video_path'].notnull()].reset_index(drop=True)

CSV loaded. Columns: ['word_name', 'singer_id', 'file_name']
Using columns -> word: word_name singer: singer_id file: file_name
Indexing video files (this may take a moment)...
Video matches: 651 missing: 0


In [9]:
# ------------------- 3) Extract frames at FPS -------------------
FRAMES_DIR = os.path.join(OUT_ROOT, "frames")
os.makedirs(FRAMES_DIR, exist_ok=True)

def extract_frames_from_video(video_path, out_dir, fps=FPS):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open", video_path); return 0
    vid_fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    frame_interval = max(1, int(round(vid_fps / float(fps))))
    total = 0
    idx = 0
    saved = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if idx % frame_interval == 0:
            # save frame as jpg
            fn = os.path.join(out_dir, f"{Path(video_path).stem}_f{idx:05d}.jpg")
            cv2.imwrite(fn, frame)
            saved += 1
        idx += 1
        total += 1
    cap.release()
    return saved

print("Extracting frames for each video (skips if already extracted)...")
rows = []
for i, r in tqdm(df_csv.iterrows(), total=len(df_csv)):
    vpath = r.video_path
    out_sub = os.path.join(FRAMES_DIR, Path(vpath).stem)
    if len(glob.glob(os.path.join(out_sub, "*.jpg"))) == 0:
        n = extract_frames_from_video(vpath, out_sub, fps=FPS)
    else:
        n = len(glob.glob(os.path.join(out_sub, "*.jpg")))
    rows.append((r.word, r.singer_id, vpath, out_sub, n))
frames_df = pd.DataFrame(rows, columns=["word","singer_id","video_path","frames_dir","n_frames"])
print("Frames extracted summary:", frames_df.n_frames.describe())
frames_df.to_csv(os.path.join(OUT_ROOT, "frames_summary.csv"), index=False)

Extracting frames for each video (skips if already extracted)...


100%|██████████| 651/651 [13:59<00:00,  1.29s/it]

Frames extracted summary: count    651.000000
mean      30.617512
std        6.444767
min       18.000000
25%       26.000000
50%       29.000000
75%       34.000000
max       55.000000
Name: n_frames, dtype: float64





In [10]:
# ------------------- 4) MediaPipe Holistic extraction per frame -------------------
npz_out = os.path.join(OUT_ROOT, "normalized_npz")
os.makedirs(npz_out, exist_ok=True)
manifest_rows = []

mp_holistic = mp.solutions.holistic
hol = mp_holistic.Holistic(static_image_mode=True, model_complexity=1,
                           refine_face_landmarks=False, min_detection_confidence=0.5)

def process_image_file(img_path):
    img = cv2.imread(img_path)
    if img is None:
        return None
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    res = hol.process(img_rgb)
    return res

# helper to return (left21, right21, pose33) each as arrays (x,y,z,vis where vis=1 for hands)
def extract_landmarks_array(results):
    # left & right hands: 21 points each, with x,y,z; set vis=1 if present
    left = np.zeros((21,4), dtype=np.float32)
    right = np.zeros((21,4), dtype=np.float32)
    pose = np.zeros((33,4), dtype=np.float32)
    if results.left_hand_landmarks:
        for i,lm in enumerate(results.left_hand_landmarks.landmark):
            left[i,:3] = (lm.x, lm.y, lm.z)
            left[i,3] = 1.0
    if results.right_hand_landmarks:
        for i,lm in enumerate(results.right_hand_landmarks.landmark):
            right[i,:3] = (lm.x, lm.y, lm.z)
            right[i,3] = 1.0
    if results.pose_landmarks:
        for i,lm in enumerate(results.pose_landmarks.landmark):
            pose[i,0:3] = (lm.x, lm.y, lm.z)
            pose[i,3] = getattr(lm, "visibility", 0.0)
    return left, right, pose

def normalize_and_build_feature(left, right, pose):
    # left: (21,4), right: (21,4), pose: (33,4)
    # compute shoulder midpoint (pose indices: left=11, right=12 per MP)
    lsh = pose[11,:3].copy()
    rsh = pose[12,:3].copy()
    # if both zeros, fallback to zeros
    if np.allclose(lsh, 0) and np.allclose(rsh,0):
        mid = np.zeros(3, dtype=np.float32)
        dist = 1.0
    else:
        mid = (lsh + rsh) / 2.0
        dist = np.linalg.norm(lsh - rsh)
        if dist < 1e-6:
            dist = 1.0
    # center & scale
    left_xyz = (left[:,:3] - mid) / dist
    right_xyz = (right[:,:3] - mid) / dist
    pose_xyz = (pose[:,:3] - mid) / dist
    # vis columns unchanged (left[:,3], right[:,3], pose[:,3])
    # flatten into (21+21+33)*4 = 300 dims (if we include vis). That's x,y,z,vis for each.
    left_flat = np.concatenate([left_xyz, left[:,3:4]], axis=1).flatten()
    right_flat = np.concatenate([right_xyz, right[:,3:4]], axis=1).flatten()
    pose_flat = np.concatenate([pose_xyz, pose[:,3:4]], axis=1).flatten()
    base = np.concatenate([left_flat, right_flat, pose_flat])  # 300
    # Derived features: inter-hand dist, left open (dist between wrist and index MCP), right open
    # Use wrist idx 0 and index_mcp idx 5 (hand landmarks standard)
    try:
        inter_hand = np.linalg.norm(left_xyz[0] - right_xyz[0])
        left_open = np.linalg.norm(left_xyz[0] - left_xyz[5])
        right_open = np.linalg.norm(right_xyz[0] - right_xyz[5])
    except Exception:
        inter_hand, left_open, right_open = 0.0, 0.0, 0.0
    derived = np.array([inter_hand, left_open, right_open], dtype=np.float32)
    feat = np.concatenate([base, derived]).astype(np.float32)  # 303 dims
    # clip to reasonable range
    feat = np.clip(feat, -5.0, 5.0)
    return feat

print("Running Holistic on frames and saving normalized_npz (this takes time)...")
total_frames_processed = 0
for idx, row in tqdm(frames_df.iterrows(), total=len(frames_df)):
    frdir = row.frames_dir
    imgs = sorted(glob.glob(os.path.join(frdir, "*.jpg")))
    for img_path in imgs:
        res = process_image_file(img_path)
        left, right, pose = extract_landmarks_array(res)
        feat = normalize_and_build_feature(left, right, pose)
        # save npz named by frame file
        out_name = os.path.join(npz_out, Path(img_path).stem + "_norm.npz")
        np.savez_compressed(out_name, features=feat, word=row.word, singer_id=row.singer_id, orig_image=img_path)
        manifest_rows.append((img_path, out_name, row.word, row.singer_id, float(np.mean(feat)), "ok"))
        total_frames_processed += 1

hol.close()
print("Total frames processed:", total_frames_processed)

manifest_df = pd.DataFrame(manifest_rows, columns=["FramePath","normalized_npz","Word","singer_id","feat_mean","status"])
manifest_df.to_csv(os.path.join(OUT_ROOT,"normalized_manifest.csv"), index=False)
print("Saved normalized_manifest.csv with", len(manifest_df), "rows.")

# ------------------- Diagnostics (quick) -------------------
feats = np.stack([np.load(p)["features"] for p in manifest_df["normalized_npz"].values], axis=0)
print("Diagnostics: NaN", np.isnan(feats).sum(), "Inf", np.isinf(feats).sum(), "shape", feats.shape)
print("Feat dim:", feats.shape[1], "expected 303")
if feats.shape[1] != 303:
    print("WARNING: feature dim mismatch. Found:", feats.shape[1], "expected 303. Adjust preprocess or parser.")

# flag duplicates
uniq_cnt = len(np.unique(feats.reshape(feats.shape[0], -1).view([('f', feats.dtype, feats.shape[1])])))
print("Unique feature vectors:", uniq_cnt, "/", feats.shape[0])

Running Holistic on frames and saving normalized_npz (this takes time)...


100%|██████████| 651/651 [39:37<00:00,  3.65s/it]


Total frames processed: 19932
Saved normalized_manifest.csv with 19932 rows.
Diagnostics: NaN 0 Inf 0 shape (19932, 303)
Feat dim: 303 expected 303
Unique feature vectors: 19892 / 19932


In [11]:
# ------------------- 5) Build sequences per video (T_SEQ frames) -------------------
# Strategy: group frames by video stem (we used frame file stems like video_f00001.jpg)
# We'll collect frames per video in temporal order (by frame index in filename).
grouped = defaultdict(list)
for r in manifest_df.itertuples():
    stem = Path(r.FramePath).parent.name  # we saved frames under a folder named by video stem
    grouped[stem].append((r.FramePath, r.normalized_npz))

# build sequences list (one sequence = up to T_SEQ frames from same video, sliding window stride=T_SEQ)
seqs = []
labels = []
signers = []
orig_video_ids = []
for vid_stem, items in grouped.items():
    # sort by filename
    items_sorted = sorted(items, key=lambda x: x[0])
    feats_list = [np.load(npz)["features"] for _, npz in items_sorted]
    n = len(feats_list)
    if n == 0:
        continue
    # if shorter than T_SEQ -> pad by repeating last frame
    if n <= T_SEQ:
        arr = np.stack(feats_list + [feats_list[-1]]*(T_SEQ-n), axis=0)
        seqs.append(arr)
        # assign word, signer from first frame
        meta = np.load(items_sorted[0][1])
        labels.append(meta["word"].tolist() if isinstance(meta["word"], np.bytes_) else meta["word"])
        signers.append(str(meta["singer_id"]))
        orig_video_ids.append(vid_stem)
    else:
        # create non-overlapping chunks (could be sliding; use non-overlapping to avoid leakage)
        stride = T_SEQ
        for start in range(0, n, stride):
            chunk = feats_list[start:start+T_SEQ]
            if len(chunk) < T_SEQ:
                chunk = chunk + [chunk[-1]]*(T_SEQ - len(chunk))
            arr = np.stack(chunk, axis=0)
            seqs.append(arr)
            meta = np.load(items_sorted[start][1])
            labels.append(meta["word"].tolist() if isinstance(meta["word"], np.bytes_) else meta["word"])
            signers.append(str(meta["singer_id"]))
            orig_video_ids.append(vid_stem)

X = np.stack(seqs, axis=0).astype(np.float32)   # shape (N_seq, T_SEQ, F)
print("Built sequences:", X.shape)
y_words = np.array(labels)
signer_arr = np.array(signers)
# map words to numeric labels
unique_words = sorted(list(set(y_words.tolist())))
word_to_idx = {w:i for i,w in enumerate(unique_words)}
y = np.array([word_to_idx[w] for w in y_words], dtype=np.int32)
print("Unique words:", len(unique_words))

# save raw sequences
np.save(os.path.join(OUT_ROOT,"sequences.npy"), X)
np.save(os.path.join(OUT_ROOT,"labels.npy"), y)
np.save(os.path.join(OUT_ROOT,"signers.npy"), signer_arr)
pd.DataFrame({"word":y_words,"label":y,"signer_id":signer_arr,"video_id":orig_video_ids}).to_csv(os.path.join(OUT_ROOT,"sequence_manifest.csv"), index=False)
print("Saved sequences, labels, signers and manifest.")

Built sequences: (874, 32, 303)
Unique words: 31
Saved sequences, labels, signers and manifest.


In [12]:
# ------------------- 6) Filter classes with few samples (optional) -------------------
counts = Counter(y)
print("Class sample counts (top 20):", counts.most_common(20))
valid_classes = [cls for cls,cnt in counts.items() if cnt >= MIN_CLASS_SAMPLES]
mask = np.isin(y, valid_classes)
X = X[mask]
y = y[mask]
signer_arr = signer_arr[mask]
print("After filtering classes:", X.shape, "classes:", len(set(y)))

Class sample counts (top 20): [(26, 33), (8, 33), (12, 32), (22, 32), (27, 32), (10, 32), (0, 31), (1, 31), (9, 31), (21, 30), (29, 30), (15, 30), (13, 30), (7, 30), (20, 30), (28, 30), (25, 28), (23, 28), (5, 27), (4, 27)]
After filtering classes: (874, 32, 303) classes: 31


In [13]:
# ------------------- 7) Signer-exclusive split (75/15/10 by signers) -------------------
unique_signers = sorted(list(set(signer_arr.tolist())))
random.shuffle(unique_signers)
n_signers = len(unique_signers)
n_train = max(1, int(math.floor(0.75 * n_signers)))
n_val = max(1, int(math.floor(0.15 * n_signers)))
n_test = max(1, n_signers - n_train - n_val)
train_signers = unique_signers[:n_train]
val_signers   = unique_signers[n_train:n_train+n_val]
test_signers  = unique_signers[n_train+n_val:n_train+n_val+n_test]
print("Signer counts:", n_signers, "train_signers:", len(train_signers), "val:", len(val_signers), "test:", len(test_signers))

def idxs_for(signers_list):
    return np.where(np.isin(signer_arr, signers_list))[0]

train_idx = idxs_for(train_signers)
val_idx = idxs_for(val_signers)
test_idx = idxs_for(test_signers)

print("Initial split sizes (by sequences):", len(train_idx), len(val_idx), len(test_idx))
# safety: ensure no empty splits
if len(val_idx)==0 or len(test_idx)==0:
    # fallback to random split by sequences
    print("Signer split produced empty val/test. Falling back to random stratified by class if possible.")
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, random_state=SEED, stratify=y if len(set(y))>1 else None)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=SEED, stratify=y_temp if len(set(y_temp))>1 else None)
else:
    X_train, y_train = X[train_idx], y[train_idx]
    X_val,   y_val   = X[val_idx], y[val_idx]
    X_test,  y_test  = X[test_idx], y[test_idx]

print("FINAL SPLITS:")
print(" Train:", X_train.shape, y_train.shape)
print(" Val  :", X_val.shape, y_val.shape)
print(" Test :", X_test.shape, y_test.shape)

Signer counts: 7 train_signers: 5 val: 1 test: 1
Initial split sizes (by sequences): 542 164 168
FINAL SPLITS:
 Train: (542, 32, 303) (542,)
 Val  : (164, 32, 303) (164,)
 Test : (168, 32, 303) (168,)


In [14]:
# ------------------- 8) Data augmentation helpers (sequence-level) -------------------
def spatial_jitter(seq, sigma=0.01):
    # seq: (T,F) where F is multiple of 4 usually. Apply small translation to x,y pairs.
    T, F = seq.shape
    out = seq.copy()
    # Determine number of keypoints K = F//4 (if divisible), else best-effort treat as K = F//3
    if F % 4 == 0:
        K = F//4
        mat = out.reshape(T, K, 4)
        delta = np.random.uniform(-sigma, sigma, size=(T,1,2))
        mat[:,:, :2] = mat[:,:, :2] + delta
        out = mat.reshape(T, F)
    elif F % 3 == 0:
        K = F//3
        mat = out.reshape(T, K, 3)
        delta = np.random.uniform(-sigma, sigma, size=(T,1,2))
        mat[:,:, :2] = mat[:,:, :2] + delta
        out = mat.reshape(T, F)
    return out

def time_warp(seq, low=0.85, high=1.15):
    T = seq.shape[0]
    r = np.random.uniform(low, high)
    new_t = np.linspace(0, T-1, int(round(T*r)))
    from scipy.interpolate import interp1d
    f = interp1d(np.arange(T), seq, axis=0, kind='linear', fill_value="extrapolate")
    warped = f(np.linspace(0, T-1, T))
    return warped

def drop_frames(seq, max_drop=4):
    T = seq.shape[0]
    drop_n = random.randint(1, max_drop)
    keep_idx = sorted(random.sample(range(T), max(1, T-drop_n)))
    kept = seq[keep_idx]
    # resample to T
    from scipy.interpolate import interp1d
    f = interp1d(np.linspace(0,1,len(kept)), kept, axis=0, kind='linear', fill_value="extrapolate")
    return f(np.linspace(0,1,T))

def augment_sequence(seq):
    out = seq.copy()
    if random.random() < 0.6:
        out = spatial_jitter(out, sigma=0.015)
    if random.random() < 0.4:
        out = drop_frames(out, max_drop=3)
    if random.random() < 0.5:
        out = time_warp(out, 0.9, 1.1)
    # small gaussian noise
    if random.random() < 0.8:
        out = out + np.random.normal(0, 0.003, size=out.shape)
    return out.astype(np.float32)

# ------------------- 9) Simple data generator -------------------
class SeqGenerator(keras.utils.Sequence):
    def __init__(self, X, y, batch_size=BATCH_SIZE, augment=False, shuffle=True):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.augment = augment
        self.shuffle = shuffle
        self.indices = np.arange(len(X))
        self.on_epoch_end()
    def __len__(self):
        return max(1, math.ceil(len(self.X)/self.batch_size))
    def __getitem__(self, idx):
        batch_idx = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        Xb = []
        yb = []
        for i in batch_idx:
            seq = self.X[i]
            if self.augment:
                seq = augment_sequence(seq)
            # flatten sequence to (T*F,)
            Xb.append(seq.reshape(-1))
            yb.append(self.y[i])
        Xb = np.stack(Xb).astype(np.float32)
        yb = np.array(yb, dtype=np.int32)
        return Xb, yb
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# Build datasets
train_gen = SeqGenerator(X_train, y_train, batch_size=BATCH_SIZE, augment=True, shuffle=True)
val_gen   = SeqGenerator(X_val,   y_val,   batch_size=BATCH_SIZE, augment=False, shuffle=False)
test_gen  = SeqGenerator(X_test,  y_test,  batch_size=BATCH_SIZE, augment=False, shuffle=False)


In [15]:
# ------------------- 10) MLP model for landmarks-only (fast) -------------------
input_dim = T_SEQ * X_train.shape[2]
num_classes = len(unique_words)
def build_mlp(input_dim, num_classes):
    inp = keras.layers.Input(shape=(input_dim,))
    x = keras.layers.LayerNormalization()(inp)
    x = keras.layers.Dense(1024, activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(512, activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dropout(0.4)(x)
    out = keras.layers.Dense(num_classes, activation="softmax")(x)
    model = keras.Model(inputs=inp, outputs=out)
    return model

model = build_mlp(input_dim, num_classes)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=LR),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

# class weights to counter class imbalance
train_counts = Counter(y_train.tolist())
class_weights = {i: (len(y_train)/(num_classes * train_counts.get(i,1))) for i in range(num_classes)}
print("Class weights computed.")


Class weights computed.


In [16]:
# ------------------- 11) Callbacks: save best every epoch -------------------
ckpt_dir = os.path.join(OUT_ROOT, "checkpoints")
os.makedirs(ckpt_dir, exist_ok=True)
best_path = os.path.join(ckpt_dir, "best_model_epoch_{epoch:02d}_valacc_{val_accuracy:.4f}.keras")
# ModelCheckpoint cannot save every epoch with dynamic name directly; implement custom callback
class SaveEveryEpochCallback(keras.callbacks.Callback):
    def __init__(self, out_pattern):
        super().__init__()
        self.out_pattern = out_pattern
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_acc = logs.get("val_accuracy", 0.0)
        path = self.out_pattern.format(epoch=epoch+1, val_accuracy=val_acc)
        self.model.save(path)
        print("Saved model to", path)

save_cb = SaveEveryEpochCallback(best_path)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, verbose=1)
# no EarlyStopping as requested


In [17]:
# ------------------- 12) Train -------------------
print("Starting training. This may take a while.")
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    callbacks=[save_cb, reduce_lr],
    class_weight=class_weights,
    verbose=2
)


Starting training. This may take a while.
Epoch 1/30


  self._warn_if_super_not_called()


Saved model to /content/pipeline_output/checkpoints/best_model_epoch_01_valacc_0.0610.keras
17/17 - 11s - 647ms/step - accuracy: 0.0406 - loss: 4.4841 - val_accuracy: 0.0610 - val_loss: 7.0149 - learning_rate: 3.0000e-04
Epoch 2/30
Saved model to /content/pipeline_output/checkpoints/best_model_epoch_02_valacc_0.0671.keras
17/17 - 9s - 529ms/step - accuracy: 0.1070 - loss: 3.6779 - val_accuracy: 0.0671 - val_loss: 6.6871 - learning_rate: 3.0000e-04
Epoch 3/30
Saved model to /content/pipeline_output/checkpoints/best_model_epoch_03_valacc_0.0610.keras
17/17 - 7s - 394ms/step - accuracy: 0.1421 - loss: 3.3371 - val_accuracy: 0.0610 - val_loss: 5.9843 - learning_rate: 3.0000e-04
Epoch 4/30
Saved model to /content/pipeline_output/checkpoints/best_model_epoch_04_valacc_0.0732.keras
17/17 - 5s - 282ms/step - accuracy: 0.1697 - loss: 3.1636 - val_accuracy: 0.0732 - val_loss: 5.4533 - learning_rate: 3.0000e-04
Epoch 5/30
Saved model to /content/pipeline_output/checkpoints/best_model_epoch_05_val

In [19]:
# ------------------- 13) Evaluate -------------------
print("Evaluating on test set:")
test_loss, test_acc = model.evaluate(test_gen, verbose=2)
print("Test loss:", test_loss, "Test acc:", test_acc)

# Predictions and classification report
# Get true labels from the test set directly
y_true = y_test

# Get predictions using the model.predict method with the test generator
# This ensures consistency with how evaluate worked and avoids potential re-iteration issues
all_predictions = model.predict(test_gen)
y_pred = np.argmax(all_predictions, axis=1).tolist()

print("Classification report (test):")
print(classification_report(y_true, y_pred, target_names=[w for w in unique_words], zero_division=0))
cm = confusion_matrix(y_true, y_pred, labels=list(range(num_classes)))
np.save(os.path.join(OUT_ROOT,"confusion_matrix.npy"), cm)
print("Saved confusion matrix and artifacts to:", OUT_ROOT)

Evaluating on test set:
6/6 - 0s - 50ms/step - accuracy: 0.1845 - loss: 3.1492
Test loss: 3.149167060852051 Test acc: 0.184523805975914
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step
Classification report (test):
                precision    recall  f1-score   support

             I       0.00      0.00      0.00         7
       alright       0.25      0.12      0.17         8
           bad       0.00      0.00      0.00         3
          cold       0.00      0.00      0.00         3
           dry       0.33      1.00      0.50         3
          fast       0.00      0.00      0.00         3
          good       1.00      0.50      0.67         4
good afternoon       0.00      0.00      0.00         8
  good evening       0.00      0.00      0.00         8
  good morning       0.09      0.25      0.13         8
    good night       0.20      0.12      0.15         8
         happy       0.00      0.00      0.00         3
            he       0.00      0.