In [1]:
import sys
# Put the parent directory on the import path so that the project-local
# packages imported by later cells (`models`, `utils`) can be resolved.
# NOTE(review): presumably those packages live one level above this notebook —
# confirm against the repository layout.
sys.path.append("..")

In [2]:
from mmcv import Config, DictAction
from mmcv.runner import load_checkpoint
from models import build_posenet
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from mmpose.core.evaluation.top_down_eval import _get_max_preds



In [3]:
class CustomPoseModel(nn.Module):
    """Pose network that chains a feature backbone with a keypoint head.

    Args:
        backbone: Module called on the input batch. Its result must be
            indexable; only the last element (``[-1]``) is used.
        keypoint_head: Module applied to that last backbone output.
    """

    def __init__(self, backbone, keypoint_head):
        super().__init__()
        self.backbone = backbone
        self.keypoint_head = keypoint_head

    def forward(self, x):
        """Return ``keypoint_head(backbone(x)[-1])``."""
        # Only the final entry of the backbone output is forwarded to the head.
        last_feature = self.backbone(x)[-1]
        return self.keypoint_head(last_feature)

In [4]:
# Build the pose network described by the Lite-HRNet config file.
# Only the architecture is created here; weights are loaded in the next cell.
cfg_file = "../configs/top_down/lite_hrnet/mpii/litehrnet_30_mpii_256x256.py"
cfg = Config.fromfile(cfg_file)
pretrained_model = build_posenet(cfg.model)



In [5]:
# Load the pretrained weights into `pretrained_model` on CPU.
# The log printed by this call reports a size mismatch for
# `keypoint_head.final_layer` (16 output channels in the checkpoint vs 30 in
# the current model). Only `pretrained_model.backbone` is reused further down
# in this notebook.
checkpoint = load_checkpoint(pretrained_model, '../ckpts/litehrnet_30_mpii_256x256.pth', map_location='cpu')

load checkpoint from local path: ../ckpts/litehrnet_30_mpii_256x256.pth
The model and loaded state dict do not match exactly

size mismatch for keypoint_head.final_layer.weight: copying a param with shape torch.Size([16, 40, 1, 1]) from checkpoint, the shape in current model is torch.Size([30, 40, 1, 1]).
size mismatch for keypoint_head.final_layer.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([30]).


In [14]:
class KeypointCoordHead(nn.Module):
    """Regression head that maps a feature map to per-keypoint (x, y) pairs.

    The input is global-average-pooled, flattened, projected by one linear
    layer to ``num_keypoints * 2`` values, reshaped to
    ``(B, num_keypoints, 2)`` and squashed by a sigmoid into the unit interval.

    Args:
        in_channels: Channel count of the incoming feature map.
        num_keypoints: Number of keypoints to regress.
    """

    def __init__(self, in_channels, num_keypoints):
        super().__init__()
        self.num_keypoints = num_keypoints
        pool = nn.AdaptiveAvgPool2d(1)                          # (B, C, 1, 1)
        flatten = nn.Flatten()                                  # (B, C)
        regressor = nn.Linear(in_channels, num_keypoints * 2)   # (B, K*2)
        self.fc = nn.Sequential(pool, flatten, regressor)

    def forward(self, x):
        """Return sigmoid-activated coordinates of shape (B, K, 2)."""
        coords = self.fc(x).view(-1, self.num_keypoints, 2)
        return torch.sigmoid(coords)

In [7]:
# Keep only the pretrained feature extractor; the heatmap head of
# `pretrained_model` is not reused and is replaced by a coordinate-regression
# head instead.
backbone = pretrained_model.backbone
# 40 input channels and 30 keypoints mirror the `final_layer` weight shape
# [30, 40, 1, 1] reported for the current model in the checkpoint-loading log.
keypoint_head = KeypointCoordHead(in_channels=40, num_keypoints=30)

In [8]:
# Combine the pretrained backbone with the new coordinate head.
model = CustomPoseModel(backbone=backbone, keypoint_head=keypoint_head)

In [9]:
# Shape smoke test on random input.
# Run it in eval mode and without autograd: in training mode a forward pass
# builds an unnecessary graph, and any normalization layers in the backbone
# would fold this random noise into their running statistics.
# NOTE(review): whether the backbone actually contains such layers is not
# visible here; the guard is harmless either way.
x = torch.randn(2, 3, 256, 256)
_was_training = model.training
model.eval()
with torch.no_grad():
    out = model(x)
# Restore whatever mode the model was in before the check.
model.train(_was_training)
out.shape

torch.Size([2, 30, 2])

In [10]:
from utils.dataset import FaceKeypointDataset

Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/hoang/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/hoang/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/hoang/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/hoang/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/hoang/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det

In [11]:
# Size passed to the datasets as `output_size`, given as (width, height).
# The training loop also divides ground-truth keypoints by these values.
target_size = (256, 256)

# Datasets for the train / test splits.
train_dataset = FaceKeypointDataset(data_dir="../data_split/train/", output_size=target_size)
test_dataset = FaceKeypointDataset(data_dir="../data_split/test/", output_size=target_size)

# Shuffle only the training batches; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Mean-squared error between predicted and ground-truth coordinates,
# optimized with Adam over every parameter of `model`.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [12]:
# Heatmap → keypoint coordinates
def _get_max_preds_torch(heatmaps: torch.Tensor):
    """
    Args:
        heatmaps: Tensor of shape (N, K, H, W)

    Returns:
        preds: Tensor of shape (N, K, 2) - predicted keypoint coordinates
        maxvals: Tensor of shape (N, K, 1) - confidence scores
    """
    assert heatmaps.ndim == 4, "Heatmaps should be 4D tensor (N, K, H, W)"

    N, K, H, W = heatmaps.shape
    heatmaps_reshaped = heatmaps.view(N, K, -1)
    maxvals, idx = torch.max(heatmaps_reshaped, dim=2, keepdim=True)

    preds = idx.repeat(1, 1, 2).float()
    preds[..., 0] = preds[..., 0] % W  # x coord
    preds[..., 1] = preds[..., 1] // W  # y coord

    # Mask out invalid preds (where confidence == 0)
    pred_mask = maxvals > 0
    preds *= pred_mask.float()

    return preds, maxvals

In [15]:
# Parity check: the torch re-implementation should return the same peak
# coordinates as mmpose's numpy `_get_max_preds` on random heatmaps.
heatmaps = np.random.rand(16, 30, 64, 64)
heatmaps_t = torch.from_numpy(heatmaps)

preds, maxvals = _get_max_preds(heatmaps)
preds_t, maxvals_t = _get_max_preds_torch(heatmaps_t)

print(preds.shape)
print(preds_t.numpy().shape)
np.array_equal(preds, preds_t)

(16, 30, 2)
(16, 30, 2)


True

In [18]:
def _normalize_keypoints(keypoints, size):
    """Return a copy of ``keypoints`` with x and y scaled by ``size``.

    Args:
        keypoints: Tensor whose last dimension starts with (x, y).
        size: ``(width, height)`` used as divisors for x and y respectively.

    Returns:
        A new floating-point tensor; the input tensor is left unmodified.
        Integer inputs are converted to float32 first, because in-place true
        division is not allowed on integer tensors.
    """
    if keypoints.is_floating_point():
        normalized = keypoints.clone()
    else:
        normalized = keypoints.to(torch.float32)
    normalized[..., 0] /= size[0]  # x / w
    normalized[..., 1] /= size[1]  # y / h
    return normalized


# Everything below runs on whatever device `model` and the batches already
# live on; no device transfer is performed here.
num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for images, gt_keypoints, _ in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        # The model's head is applied directly; its output is compared with
        # the normalized ground-truth keypoints (no heatmap decoding step).
        pred_keypoints = model(images)
        gt_keypoints = _normalize_keypoints(gt_keypoints, target_size)

        # Loss between predicted and ground-truth keypoints.
        loss = criterion(pred_keypoints, gt_keypoints)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, gt_keypoints, _ in tqdm(test_loader, desc=f"Epoch {epoch+1} [Val]"):
            pred_keypoints = model(images)
            gt_keypoints = _normalize_keypoints(gt_keypoints, target_size)

            loss = criterion(pred_keypoints, gt_keypoints)

            val_loss += loss.item()

    # Report the per-batch average of both losses for this epoch.
    print(f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_loader)} | Val Loss: {val_loss/len(test_loader)}")

Epoch 1 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [03:50<00:00, 10.46s/it]
Epoch 1 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:41<00:00,  3.79s/it]


Epoch 1 | Train Loss: 0.026985036124559967 | Val Loss: 0.022187543016943066


Epoch 2 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [03:09<00:00,  8.60s/it]
Epoch 2 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:24<00:00,  2.25s/it]


Epoch 2 | Train Loss: 0.019928581123663622 | Val Loss: 0.01324667383662679


Epoch 3 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [02:58<00:00,  8.10s/it]
Epoch 3 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:28<00:00,  2.57s/it]


Epoch 3 | Train Loss: 0.011884519491683353 | Val Loss: 0.0069945774633776055


Epoch 4 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [03:29<00:00,  9.54s/it]
Epoch 4 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:42<00:00,  3.88s/it]


Epoch 4 | Train Loss: 0.005767777646807107 | Val Loss: 0.0032019981352443046


Epoch 5 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [02:55<00:00,  8.00s/it]
Epoch 5 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:27<00:00,  2.49s/it]


Epoch 5 | Train Loss: 0.0025231385000304067 | Val Loss: 0.0016719363735650074


Epoch 6 [Train]: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [02:22<00:00,  6.47s/it]
Epoch 6 [Val]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:36<00:00,  3.36s/it]


Epoch 6 | Train Loss: 0.000990283641096374 | Val Loss: 0.0007545578621581874


Epoch 7 [Train]:  23%|█████████████████████████████████████▌                                                                                                                               | 5/22 [00:48<02:44,  9.66s/it]


KeyboardInterrupt: 

In [2]:
# UDA
from utils.dataset import UDAFaceKeypointDataset

In [3]:
# Instantiate the UDA dataset over the `../data` directory.
# NOTE(review): what "UDA" stands for and which files are read is defined in
# `utils.dataset`, which is not visible here — confirm there.
uda_dataset = UDAFaceKeypointDataset(data_dir='../data')

In [6]:
# Peek at the first sample only: the loop exits after a single iteration.
for item in uda_dataset:
    # Each item is unpacked through `.values()`, so this relies on the order
    # in which the dataset inserts its five entries; the fifth is discarded.
    image_base, image_aug, M_base, M_aug, _ = item.values()
    # NOTE(review): this prints the full contents of every value, which floods
    # the cell output — consider printing shapes instead.
    print(image_base, image_aug, M_base, M_aug)
    break

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


tensor([[[ 1.8379,  1.8379,  1.8379,  ...,  1.5297,  1.5125,  1.5125],
         [ 1.8379,  1.8379,  1.8550,  ...,  1.5125,  1.5125,  1.5297],
         [ 1.8550,  1.8379,  1.8379,  ...,  1.4954,  1.4954,  1.5125],
         ...,
         [-1.2445, -1.2788, -0.8678,  ..., -0.8507, -1.0904, -0.7479],
         [-1.1075, -0.8849, -0.8849,  ..., -1.1589, -0.1314, -1.2445],
         [-0.7137, -0.6109, -1.1418,  ..., -1.1932, -1.3644, -0.1143]],

        [[ 1.6933,  1.6933,  1.6933,  ...,  1.3782,  1.3606,  1.3606],
         [ 1.6933,  1.6933,  1.7108,  ...,  1.3606,  1.3606,  1.3782],
         [ 1.7108,  1.6933,  1.6933,  ...,  1.3431,  1.3431,  1.3606],
         ...,
         [-1.1954, -1.2304, -0.8277,  ..., -0.7752, -1.0203, -0.6702],
         [-1.0378, -0.8102, -0.8277,  ..., -1.0903, -0.0399, -1.1954],
         [-0.6352, -0.5301, -1.0728,  ..., -1.1253, -1.3004, -0.0399]],

        [[ 1.2457,  1.2457,  1.2457,  ...,  0.9668,  0.9494,  0.9494],
         [ 1.2457,  1.2457,  1.2631,  ...,  0