In [1]:
import os
import json
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Training on device:", device)

Training on device: cuda


In [3]:
class ASLDataset(Dataset):
    # Expected number of landmarks per group in MediaPipe Holistic
    expected_counts = {
        "pose": 33,
        "face": 468,
        "left_hand": 21,
        "right_hand": 21
    }

    def __init__(self, landmark_dir, vocab):
        """
        landmark_dir: path to folder containing .json landmark files
        vocab: dict mapping word -> index
        """
        self.vocab = vocab
        self.samples = []  # List of (word, sequence_list)

        vec_len = sum(self.expected_counts.values()) * 3  # x,y,z per landmark

        for fname in os.listdir(landmark_dir):
            if not fname.endswith(".json"):
                continue
            word = fname.replace(".json", "")
            if word not in vocab:
                continue

            path = os.path.join(landmark_dir, fname)
            with open(path, "r") as f:
                frames = json.load(f)

            sequence = []
            for frame in frames:
                vec = []
                # For each group, pad or truncate to expected count
                for group, count in self.expected_counts.items():
                    landmarks = frame.get(group, [])
                    for i in range(count):
                        if i < len(landmarks):
                            p = landmarks[i]
                            vec.extend([p["x"], p["y"], p["z"]])
                        else:
                            vec.extend([0.0, 0.0, 0.0])
                # sanity check
                assert len(vec) == vec_len, f"Expected {vec_len}, got {len(vec)}"
                sequence.append(vec)

            self.samples.append((word, sequence))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        word, sequence = self.samples[idx]
        word_idx = self.vocab[word]
        sequence_tensor = torch.tensor(sequence, dtype=torch.float32)
        return word_idx, sequence_tensor

In [4]:
class TextToLandmarkLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(TextToLandmarkLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, seq_len):
        emb = self.embedding(x)  # x: [batch_size]
        emb = emb.unsqueeze(1).repeat(1, seq_len, 1)  # repeat for sequence
        out, _ = self.lstm(emb)
        out = self.fc(out)  # [batch, seq_len, output_dim]
        return out

In [5]:
# Build vocab
vocab = {word: idx for idx, word in enumerate(os.listdir("landmark_data"))}
vocab = {k.replace(".json", ""): v for k, v in vocab.items()}
dataset = ASLDataset("landmark_data", vocab)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

model = TextToLandmarkLSTM(
    vocab_size=len(vocab),
    embedding_dim=32,
    hidden_dim=128,
    output_dim=len(dataset[0][1][0])  # vector length per frame
).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(130):
    model.train()
    total_loss = 0

    for batch in dataloader:
        # Unpack batch
        words, sequences = zip(*batch)

        # Prepare targets: pad in time as before
        max_len = max(seq.size(0) for seq in sequences)
        padded = torch.zeros(len(batch), max_len, sequences[0].size(1), device=device)
        for i, seq in enumerate(sequences):
            padded[i, : seq.size(0), :] = seq.to(device)

        inputs = torch.tensor(words, device=device)       # word indices
        targets = padded                                  # landmark sequences

        optimizer.zero_grad()
        outputs = model(inputs, max_len)                  # outputs on device
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1:02d} — Loss: {total_loss:.4f}")

Epoch 01 — Loss: 5.4046
Epoch 02 — Loss: 2.9803
Epoch 03 — Loss: 2.7990
Epoch 04 — Loss: 2.7517
Epoch 05 — Loss: 2.6629
Epoch 06 — Loss: 2.5724
Epoch 07 — Loss: 2.5772
Epoch 08 — Loss: 2.4160
Epoch 09 — Loss: 2.4532
Epoch 10 — Loss: 2.2755
Epoch 11 — Loss: 2.3270
Epoch 12 — Loss: 2.2565
Epoch 13 — Loss: 2.3027
Epoch 14 — Loss: 2.1828
Epoch 15 — Loss: 2.2066
Epoch 16 — Loss: 2.0874
Epoch 17 — Loss: 2.0184
Epoch 18 — Loss: 1.9629
Epoch 19 — Loss: 1.7855
Epoch 20 — Loss: 1.8249
Epoch 21 — Loss: 1.7895
Epoch 22 — Loss: 1.6309
Epoch 23 — Loss: 1.5035
Epoch 24 — Loss: 1.5592
Epoch 25 — Loss: 1.4681
Epoch 26 — Loss: 1.1837
Epoch 27 — Loss: 1.1731
Epoch 28 — Loss: 1.1535
Epoch 29 — Loss: 1.0254
Epoch 30 — Loss: 1.0376
Epoch 31 — Loss: 1.0093
Epoch 32 — Loss: 0.9316
Epoch 33 — Loss: 0.9003
Epoch 34 — Loss: 0.8547
Epoch 35 — Loss: 0.9170
Epoch 36 — Loss: 0.8614
Epoch 37 — Loss: 0.8503
Epoch 38 — Loss: 0.8034
Epoch 39 — Loss: 0.8638
Epoch 40 — Loss: 0.7710
Epoch 41 — Loss: 0.7500
Epoch 42 — Loss:

In [15]:
len(dataset[0][1][0])

1629

In [16]:
(33 + 468 + 21 + 21) * 3

1629

In [6]:
save_dir = os.path.join("models", "model_3")
os.makedirs(save_dir, exist_ok=True)

In [7]:
model_path = os.path.join(save_dir, "asl_lstm_state_dict.pth")
torch.save(model.state_dict(), model_path)
print(f"✅ Model weights saved to {model_path}")

✅ Model weights saved to models\model_3\asl_lstm_state_dict.pth


In [8]:
vocab_path = os.path.join(save_dir, "vocab.json")
with open(vocab_path, "w") as f:
    json.dump(vocab, f, indent=2)
print(f"✅ Vocab mapping saved to {vocab_path}")

✅ Vocab mapping saved to models\model_3\vocab.json


Prediction Pipeline

In [14]:
# vocab_size=len(vocab)
# embedding_dim=32
# hidden_dim=128
# output_dim=len(dataset[0][1][0])

In [3]:
# 2. Load vocabulary
vocab_path = os.path.join("models", "model_2", "vocab.json")
with open(vocab_path, "r") as f:
    vocab = json.load(f)
vocab_size = len(vocab)

In [4]:
EMBED_DIM = 32
HID_DIM   = 128
# MediaPipe Holistic landmarks: pose 33, face 468, left_hand 21, right_hand 21
OUTPUT_DIM = (33 + 468 + 21 + 21) * 3  

In [11]:
model_loaded = TextToLandmarkLSTM(vocab_size, EMBED_DIM, HID_DIM, OUTPUT_DIM)
model_path = os.path.join(save_dir, "asl_lstm_state_dict.pth")
model_loaded.load_state_dict(torch.load(model_path))
model_loaded.to(device)
model_loaded.eval()

TextToLandmarkLSTM(
  (embedding): Embedding(308, 32)
  (lstm): LSTM(32, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1629, bias=True)
)

In [12]:
def predict_landmarks(word, seq_len=30):
    if word not in vocab:
        raise ValueError(f"Word '{word}' not in vocab")
    idx = torch.tensor([vocab[word]], device=device)
    with torch.no_grad():
        preds = model_loaded(idx, seq_len)           # → [1, seq_len, OUTPUT_DIM]
    return preds.squeeze(0).cpu().tolist()    # → list of seq_len vectors

In [13]:
seq_length = 30
preds = predict_landmarks("about", seq_length)

In [14]:
preds

[[0.5449190735816956,
  0.2488538771867752,
  -1.0143712759017944,
  0.5677078366279602,
  0.18874302506446838,
  -0.909540057182312,
  0.5993417501449585,
  0.1972484588623047,
  -0.9057087302207947,
  0.6143903732299805,
  0.18700867891311646,
  -0.9444970488548279,
  0.5073528289794922,
  0.20955990254878998,
  -0.9138498902320862,
  0.515204668045044,
  0.2286495417356491,
  -0.9411036372184753,
  0.4698357582092285,
  0.2110099494457245,
  -0.9291871786117554,
  0.6398708820343018,
  0.24205729365348816,
  -0.5305259823799133,
  0.47970107197761536,
  0.2505401372909546,
  -0.5485091209411621,
  0.5969812870025635,
  0.31779351830482483,
  -0.8321307897567749,
  0.5316140651702881,
  0.31779158115386963,
  -0.8068298101425171,
  0.7619025111198425,
  0.5999124050140381,
  -0.262173593044281,
  0.374621719121933,
  0.6228153109550476,
  -0.30011898279190063,
  0.8207821846008301,
  1.0127050876617432,
  -0.27350473403930664,
  0.25516271591186523,
  1.035829782485962,
  -0.61585003

In [47]:
import numpy as np

def unflatten_landmarks_frame(flat_vec):
    """
    Turn a flat (1629,) prediction into a dict of landmark lists like MediaPipe.
    """
    arr = np.asarray(flat_vec, dtype=float)
    groups = [("pose", 33), ("face", 468), ("left_hand", 21), ("right_hand", 21)]
    coords = 3

    result = {}
    idx = 0
    for name, count in groups:
        pts = []
        length = count * coords
        chunk = arr[idx : idx + length]
        for i in range(count):
            x, y, z = chunk[i*coords:(i+1)*coords]
            pts.append({"x": float(x), "y": float(y), "z": float(z)})
        result[name] = pts
        idx += length

    assert idx == arr.size, f"Consumed {idx} of {arr.size}"
    return result

def unflatten_landmarks(preds):
    """
    If preds is a single flat vector → returns one dict.
    If preds is a list/array of flat vectors → returns list of dicts.
    """
    arr = np.asarray(preds)
    if arr.ndim == 1:
        # single frame
        return unflatten_landmarks_frame(arr)
    elif arr.ndim == 2:
        # sequence of frames
        return [unflatten_landmarks_frame(frame) for frame in arr]
    else:
        raise ValueError(f"Expected 1D or 2D array, got shape {arr.shape}")

# --- Example Usage ---

# preds = predict_landmarks("about", seq_len=30)  # list of 30 flat vectors
# structured = unflatten_landmarks(preds)
# Now structured is a list of 30 dicts, each with keys "pose","face","left_hand","right_hand".
# You can then pass structured[i] into your stickman overlay logic.


In [45]:
len(preds[0])

1629

In [48]:
arranged_preds = unflatten_landmarks(preds)
arranged_preds

[{'pose': [{'x': 0.5449190735816956,
    'y': 0.2488538771867752,
    'z': -1.0143712759017944},
   {'x': 0.5677078366279602,
    'y': 0.18874302506446838,
    'z': -0.909540057182312},
   {'x': 0.5993417501449585,
    'y': 0.1972484588623047,
    'z': -0.9057087302207947},
   {'x': 0.6143903732299805,
    'y': 0.18700867891311646,
    'z': -0.9444970488548279},
   {'x': 0.5073528289794922,
    'y': 0.20955990254878998,
    'z': -0.9138498902320862},
   {'x': 0.515204668045044, 'y': 0.2286495417356491, 'z': -0.9411036372184753},
   {'x': 0.4698357582092285,
    'y': 0.2110099494457245,
    'z': -0.9291871786117554},
   {'x': 0.6398708820343018,
    'y': 0.24205729365348816,
    'z': -0.5305259823799133},
   {'x': 0.47970107197761536,
    'y': 0.2505401372909546,
    'z': -0.5485091209411621},
   {'x': 0.5969812870025635,
    'y': 0.31779351830482483,
    'z': -0.8321307897567749},
   {'x': 0.5316140651702881,
    'y': 0.31779158115386963,
    'z': -0.8068298101425171},
   {'x': 0.76190