In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data as data_utils
from torchvision import datasets, models, transforms

from sklearn.model_selection import KFold, train_test_split

import albumentations as A
# import albumentations_experimental as AE
from albumentations.pytorch import ToTensorV2
import cv2
from tqdm import tqdm

# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random, sys
import pandas as pd
import time
import copy

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.engine import DefaultTrainer

from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

# Connect your script to Neptune
# import neptune
# import neptune_config

from detectron2.structures import BoxMode

In [3]:
class Trainer(DefaultTrainer):
    """
    We use the "DefaultTrainer" which contains a number pre-defined logic for
    standard training workflow. They may not work for you, especially if you
    are working on a new research project. In that case you can use the cleaner
    "SimpleTrainer", or write your own training loop.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.
        This uses the special metadata "evaluator_type" associated with each builtin dataset.
        For your own dataset, you can simply create an evaluator manually in your
        script and do not have to worry about the hacky if-else logic here.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type in ["coco", "coco_panoptic_seg"]:
            evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

In [4]:
def train_val_split(imgs, keypoints, random_state=42):
    d = dict()
    for file in imgs:
        key = ''.join(file.split('-')[:-1])
        if key not in d.keys():
            d[key] = [file]
        else:
            d[key].append(file)
            
    np.random.seed(random_state)
    trains = []
    validations = []
    for key, value in d.items():
        r = np.random.randint(len(value), size=2)
        for i in range(len(value)):
            if i in r:
                validations.append(np.where(imgs == value[i])[0][0])
            else:
                trains.append(np.where(imgs == value[i])[0][0])
    return (
        imgs[trains], imgs[validations],
        keypoints[trains], keypoints[validations]
    )


In [5]:
def train_val_split2(augmented, train):
    train_imgs = train.iloc[:, 0].to_numpy()
    train_keypoints = train.iloc[:, 1:].to_numpy()
    aug_imgs = augmented.iloc[:, 0].to_numpy()
    aug_keypoints = augmented.iloc[:, 1:].to_numpy()
    return aug_imgs, train_imgs, aug_keypoints, train_keypoints

In [6]:
def get_data_dicts(data_dir, imgs, keypoints, phase):
#     train_dir = os.path.join(data_dir, "augmented" if phase=="train" else "train_imgs")
    train_dir = os.path.join(data_dir, "train_imgs")
    dataset_dicts = []

    for idx, item in tqdm(enumerate(zip(imgs, keypoints))):
        img, keypoint = item[0], item[1]

        record = {}
        filepath = os.path.join(train_dir, img)
        record["height"], record["width"] = cv2.imread(filepath).shape[:2]
        record["file_name"] = filepath
        record["image_id"] = idx

        keypoints_v = []
        for i, keypoint_ in enumerate(keypoint):
            keypoints_v.append(keypoint_) # if coco set, should be added 0.5
            if i % 2 == 1:
                keypoints_v.append(2)

        x = keypoint[0::2]
        y = keypoint[1::2]
        x_min, x_max = min(x), max(x)
        y_min, y_max = min(y), max(y)

        obj = {
            "bbox": [x_min, y_min, x_max, y_max],
            "bbox_mode": BoxMode.XYXY_ABS,
            "category_id": 0,
            "keypoints": keypoints_v
        }

        record["annotations"] = [obj]
        dataset_dicts.append(record)
    return dataset_dicts

In [7]:
data_dir = "../data/"
# aug_df = pd.read_csv(os.path.join(data_dir, "augmented.csv"))
train_df = pd.read_csv(os.path.join(data_dir, "train_df_modified.csv"))

keypoint_names = train_df.columns.to_list()[1:]
keypoint_flip_map = []
for i in range(0, len(keypoint_names) // 2, 2):
    keypoint_flip_map.append((keypoint_names[i], keypoint_names[i+1]))

columns = train_df.columns[1:].to_list()[::2]
keypoint_names = [
    label.replace("_x", '').replace("_y", '') for label in columns
]

In [11]:
imgs = train_df.iloc[:, 0].to_numpy()
keypoints = train_df.iloc[:, 1:].to_numpy()
imgs_train, imgs_val, keypoints_train, keypoints_val = \
    train_val_split(imgs, keypoints, random_state=42)

imgs_d = {
    "train": imgs_train,
    "val": imgs_val
}
keypoints_d = {
    "train": keypoints_train,
    "val": keypoints_val
}

for d in ["train", "val"]:
    DatasetCatalog.register(
        "keypoints_" + d,
        lambda d=d: get_data_dicts(
            data_dir, imgs_d[d], keypoints_d[d], phase=d
        )
    )
    MetadataCatalog.get("keypoints_" + d).set(
        thing_classes=["human"]
    )
    MetadataCatalog.get("keypoints_" + d).set(
        keypoint_names=keypoint_names
    )
    MetadataCatalog.get("keypoints_" + d).set(
        keypoint_flip_map=keypoint_flip_map
    )
    MetadataCatalog.get("keypoints_" + d).set(
        evaluator_type="coco"
    )

motions_metadata = MetadataCatalog.get("keypoints_train")
print(motions_metadata)

Metadata(evaluator_type='coco', keypoint_flip_map=[('nose_x', 'nose_y'), ('left_eye_x', 'left_eye_y'), ('right_eye_x', 'right_eye_y'), ('left_ear_x', 'left_ear_y'), ('right_ear_x', 'right_ear_y'), ('left_shoulder_x', 'left_shoulder_y'), ('right_shoulder_x', 'right_shoulder_y'), ('left_elbow_x', 'left_elbow_y'), ('right_elbow_x', 'right_elbow_y'), ('left_wrist_x', 'left_wrist_y'), ('right_wrist_x', 'right_wrist_y'), ('left_hip_x', 'left_hip_y')], keypoint_names=['nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 'right_knee', 'left_ankle', 'right_ankle', 'neck', 'left_palm', 'right_palm', 'spine2(back)', 'spine1(waist)', 'left_instep', 'right_instep'], name='keypoints_train', thing_classes=['human'])


In [12]:
# keypoint_rcnn_R_50_FPN_3x.yaml
# keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("keypoints_train",)
cfg.DATASETS.TEST = ("keypoints_val",)
cfg.DATALOADER.NUM_WORKERS = 0
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml")  # Let training initialize from model zoo
cfg.SOLVER.IMS_PER_BATCH = 2
# cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
cfg.SOLVER.BASE_LR = 0.001  # pick a good LR
cfg.SOLVER.MAX_ITER = 5000    # 300 iterations seems good enough for this toy dataset; you will need to train longer for a practical dataset
cfg.SOLVER.STEPS = []         # do not decay learning rate
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256   # faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (ballon). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 24
# NOTE: this config means the number of classes, but a few popular unofficial tutorials incorrect uses num_classes+1 here.

cfg.TEST.KEYPOINT_OKS_SIGMAS = kpt_oks_sigmas=np.ones((24, 1), dtype=float).tolist()
cfg.TEST.EVAL_PERIOD = 500

In [13]:
# evaluator = COCOEvaluator("keypoints_val", ("bbox", "keypoints"), False, output_dir="./output/", kpt_oks_sigmas=np.ones((24, 1), dtype=float).tolist())

In [14]:
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = Trainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
# trainer.test(model=trainer.model, cfg=cfg, evaluators=evaluator)

# trainer.test(cfg, trainer.model)

[32m[03/28 16:18:50 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

3679it [01:01, 60.18it/s]

[32m[03/28 16:19:51 d2.data.build]: [0mRemoved 0 images with no usable annotations. 3679 images left.
[32m[03/28 16:19:51 d2.data.build]: [0mRemoved 0 images with fewer than 1 keypoints.





[32m[03/28 16:19:51 d2.data.build]: [0mDistribution of instances among all 1 categories:
[36m|  category  | #instances   |
|:----------:|:-------------|
|   human    | 3679         |
|            |              |[0m
[32m[03/28 16:19:51 d2.data.common]: [0mSerializing 3679 elements to byte tensors and concatenating them all ...
[32m[03/28 16:19:52 d2.data.common]: [0mSerialized dataset takes 4.77 MiB
[32m[03/28 16:19:52 d2.data.dataset_mapper]: [0mAugmentations used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice'), RandomFlip()]
[32m[03/28 16:19:52 d2.data.build]: [0mUsing training sampler TrainingSampler


model_final_5ad38f.pkl: 491MB [00:59, 8.18MB/s]                                                                                                                                               
Skip loading parameter 'roi_heads.keypoint_head.score_lowres.weight' to the model due to incompatible shapes: (512, 17, 4, 4) in the checkpoint but (512, 24, 4, 4) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.keypoint_head.score_lowres.bias' to the model due to incompatible shapes: (17,) in the checkpoint but (24,) in the model! You might want to double check if this is expected.


[32m[03/28 16:20:52 d2.engine.train_loop]: [0mStarting training from iteration 0


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:766.)
  return x.nonzero().unbind(1)


[32m[03/28 16:21:24 d2.utils.events]: [0m eta: 2:09:17  iter: 19  total_loss: 8.397  loss_cls: 0.102  loss_box_reg: 0.106  loss_keypoint: 8.191  loss_rpn_cls: 0.001  loss_rpn_loc: 0.004  time: 1.5395  data_time: 0.1172  lr: 0.000020  max_mem: 6231M
[32m[03/28 16:21:55 d2.utils.events]: [0m eta: 2:08:36  iter: 39  total_loss: 8.323  loss_cls: 0.079  loss_box_reg: 0.106  loss_keypoint: 8.145  loss_rpn_cls: 0.001  loss_rpn_loc: 0.004  time: 1.5376  data_time: 0.1176  lr: 0.000040  max_mem: 6231M
[32m[03/28 16:22:27 d2.utils.events]: [0m eta: 2:08:17  iter: 59  total_loss: 8.218  loss_cls: 0.073  loss_box_reg: 0.080  loss_keypoint: 8.051  loss_rpn_cls: 0.002  loss_rpn_loc: 0.004  time: 1.5517  data_time: 0.1156  lr: 0.000060  max_mem: 6231M
[32m[03/28 16:22:57 d2.utils.events]: [0m eta: 2:07:34  iter: 79  total_loss: 8.131  loss_cls: 0.057  loss_box_reg: 0.082  loss_keypoint: 7.980  loss_rpn_cls: 0.001  loss_rpn_loc: 0.004  time: 1.5385  data_time: 0.1104  lr: 0.000080  max_mem: 62

516it [00:08, 60.05it/s]

[32m[03/28 16:33:39 d2.data.build]: [0mDistribution of instances among all 1 categories:
[36m|  category  | #instances   |
|:----------:|:-------------|
|   human    | 516          |
|            |              |[0m
[32m[03/28 16:33:39 d2.data.common]: [0mSerializing 516 elements to byte tensors and concatenating them all ...
[32m[03/28 16:33:39 d2.data.common]: [0mSerialized dataset takes 0.67 MiB
[32m[03/28 16:33:39 d2.data.dataset_mapper]: [0mAugmentations used in training: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[32m[03/28 16:33:39 d2.evaluation.coco_evaluation]: [0m'keypoints_val' is not registered by `register_coco_instances`. Therefore trying to convert it to COCO format ...
[32m[03/28 16:33:39 d2.data.datasets.coco]: [0mConverting annotations of dataset 'keypoints_val' to COCO format ...)



516it [00:06, 78.30it/s]

[32m[03/28 16:33:46 d2.data.datasets.coco]: [0mConverting dataset dicts into COCO format





[32m[03/28 16:33:46 d2.data.datasets.coco]: [0mConversion finished, #images: 516, #annotations: 516
[32m[03/28 16:33:46 d2.data.datasets.coco]: [0mCaching COCO format annotations at './output\inference\keypoints_val_coco_format.json' ...
[32m[03/28 16:33:46 d2.evaluation.evaluator]: [0mStart inference on 516 images
[32m[03/28 16:33:49 d2.evaluation.evaluator]: [0mInference done 11/516. 0.2333 s / img. ETA=0:02:23
[32m[03/28 16:33:54 d2.evaluation.evaluator]: [0mInference done 29/516. 0.2319 s / img. ETA=0:02:20
[32m[03/28 16:33:59 d2.evaluation.evaluator]: [0mInference done 47/516. 0.2321 s / img. ETA=0:02:16
[32m[03/28 16:34:05 d2.evaluation.evaluator]: [0mInference done 65/516. 0.2322 s / img. ETA=0:02:11
[32m[03/28 16:34:10 d2.evaluation.evaluator]: [0mInference done 83/516. 0.2325 s / img. ETA=0:02:06
[32m[03/28 16:34:15 d2.evaluation.evaluator]: [0mInference done 101/516. 0.2324 s / img. ETA=0:02:01
[32m[03/28 16:34:20 d2.evaluation.evaluator]: [0mInference don

TypeError: 'numpy.float64' object cannot be interpreted as an integer

In [12]:
# Inference should use the config with parameters that are used in training
# cfg now already contains everything we've set previously. We changed it a little bit for inference:
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7   # set a custom testing threshold
predictor = DefaultPredictor(cfg)

preds = []
files = []
test_dir = os.path.join(data_dir, "test_imgs")
test_list = os.listdir(test_dir)
test_list.sort()
except_list = []
for file in tqdm(test_list):
    filepath = os.path.join(test_dir, file)
    # print(filepath)
    im = cv2.imread(filepath)
    outputs = predictor(im)
    outputs = outputs["instances"].to("cpu").get("pred_keypoints").numpy()
    files.append(file)
    pred = []
    try:
        for out in outputs[0]:
            pred.extend([float(e) for e in out[:2]])
    except:
        except_list.append(filepath)
        print(filepath)
    preds.append(pred)

100%|██████████████████████████████████████████████████████████████████████████████| 1600/1600 [08:11<00:00,  3.25it/s]


In [13]:
df_sub = pd.read_csv(f"../data/sample_submission.csv")
df = pd.DataFrame(columns=df_sub.columns)
df["image"] = files
df.iloc[:, 1:] = preds

df.to_csv(f"submissions.csv", index=False)