In [1]:
import os
import json
from itertools import combinations

import cv2
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine, pdist

from PIL import Image
import clip
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torchvision.datasets.folder import pil_loader

import lightgbm as lgb

In [2]:
def make_blob_detector():
    """Build the OpenCV simple blob detector used to count drawn keypoints.

    The detector keeps only bright blobs (``blobColor = 255``), scans the
    thresholds 253..255 in steps of 1, allows blobs to touch each other
    (``minDistBetweenBlobs = 0``), caps the blob area at 1000 and disables
    the convexity and inertia filters.

    Returns:
        The detector object returned by ``cv2.SimpleBlobDetector_create``.
    """
    # Attribute name -> value, applied in this order to the params object.
    settings = {
        "filterByColor": True,
        "blobColor": 255,
        "minThreshold": 253,
        "maxThreshold": 255,
        "thresholdStep": 1,
        "minDistBetweenBlobs": 0,
        "filterByArea": True,
        "maxArea": 1000,
        "filterByConvexity": False,
        "filterByInertia": False,
    }

    detector_params = cv2.SimpleBlobDetector_Params()
    for attribute, value in settings.items():
        setattr(detector_params, attribute, value)

    return cv2.SimpleBlobDetector_create(detector_params)

def get_num_keypoint(image_path, detector):
    """Count the blobs formed by pure-red pixels in an image.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.
        detector: Object exposing ``detect(mask)`` (see ``make_blob_detector``).

    Returns:
        int: Number of keypoints the detector reports on the red-pixel mask.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``image_path``.
    """
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than on the comparison below.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Pixels equal to [0, 0, 255], i.e. pure red in OpenCV's BGR channel order.
    red_mask = (im == [0, 0, 255]).all(-1, keepdims=True)
    # The detector looks for bright blobs, so scale the boolean mask to 0/255.
    keypoints = detector.detect(red_mask.astype(np.uint8) * 255)
    return len(keypoints)

def included_angle_cos_dist(v1, v2 ,v3):
    """Cosine distance between the rays ``v2 -> v1`` and ``v2 -> v3``.

    Args:
        v1, v2, v3: Points as arrays; ``v2`` is the shared vertex.

    Returns:
        The value of ``scipy.spatial.distance.cosine`` for the two rays.
    """
    ray_to_first = v1 - v2
    ray_to_third = v3 - v2
    return cosine(ray_to_first, ray_to_third)

def cos(vec1, vec2):
    """Cosine of the angle between two vectors.

    Args:
        vec1, vec2: 1-D arrays of equal length.

    Returns:
        The dot product divided by the norm of ``vec1`` and then by the norm
        of ``vec2``. No guard is applied for zero-length vectors.
    """
    inner = np.dot(vec1, vec2)
    length1 = np.linalg.norm(vec1)
    length2 = np.linalg.norm(vec2)
    # Keep the division order (by |vec1| first, then |vec2|).
    return (inner / length1) / length2

def cos_vec3(v1, v2, v3):
    """Cosine of the angle at vertex ``v2`` formed by points ``v1`` and ``v3``.

    Delegates to ``cos`` on the rays ``v1 - v2`` and ``v3 - v2``.
    """
    return cos(v1 - v2, v3 - v2)

def clockwise_sin2d(vec1, vec2):
    """Signed sine of the angle between two 2-D vectors.

    Args:
        vec1, vec2: Indexable 2-D vectors (only components 0 and 1 are used
            for the cross term).

    Returns:
        ``vec1[0] * vec2[1] - vec1[1] * vec2[0]`` divided by the norm of
        ``vec1`` and then by the norm of ``vec2``.
    """
    cross_z = vec1[0] * vec2[1] - vec1[1] * vec2[0]
    length1 = np.linalg.norm(vec1)
    length2 = np.linalg.norm(vec2)
    # Keep the division order (by |vec1| first, then |vec2|).
    return (cross_z / length1) / length2

def clockwise_sin2d_vec3(v1, v2, v3):
    """Signed sine of the angle at vertex ``v2`` formed by ``v1`` and ``v3``.

    Delegates to ``clockwise_sin2d`` on the rays ``v1 - v2`` and ``v3 - v2``.
    """
    ray_to_first = v1 - v2
    ray_to_third = v3 - v2
    return clockwise_sin2d(ray_to_first, ray_to_third)

In [3]:
# Index sets of three keypoints (first, vertex, third) used to compute included angles.
angle_triples = (
    # Per-finger chains, in the order thumb, index, middle, ring, little:
    # (0, base, base + 1), (base, base + 1, base + 2), (base + 1, base + 2, base + 3)
    tuple(
        triple
        for base in (1, 5, 9, 13, 17)
        for triple in (
            (0, base, base + 1),
            (base, base + 1, base + 2),
            (base + 1, base + 2, base + 3),
        )
    )
    # Angles between keypoint 0 and the finger-base keypoints 1, 5, 9, 13, 17.
    + (
        (1, 0, 5),
        (17, 0, 5),
        (0, 5, 9),
        (5, 9, 13),
        (9, 13, 17),
        (13, 17, 0),
    )
    # Angles between a finger's first segment and the neighbouring finger base.
    + (
        (6, 5, 9),
        (5, 9, 10),
        (10, 9, 13),
        (9, 13, 14),
        (14, 13, 17),
        (13, 17, 18),
    )
)

def gen_triples(array):
    """
    Generator yielding, for every index set in ``angle_triples``, the three
    corresponding points of ``array`` as ``(v1, v2, v3)``.
    """
    for first, vertex, third in angle_triples:
        yield array[first], array[vertex], array[third]

def cos_of_triples(array):
    """
    Return the list of cosines, one per entry of ``angle_triples``, for a
    keypoint array with 21 rows.
    """
    return [cos_vec3(v1, v2, v3) for v1, v2, v3 in gen_triples(array)]

def sin_of_triples(array):
    """
    Return the list of signed sines, one per entry of ``angle_triples``, for a
    keypoint array with 21 rows.
    """
    return [clockwise_sin2d_vec3(v1, v2, v3) for v1, v2, v3 in gen_triples(array)]


def pairwise_diff(array):
    """
    Component-wise difference ``a - b`` for every unordered pair of rows of
    ``array``, concatenated into one flat array.

    Pairs follow ``itertools.combinations`` order. For 21 keypoints this is
    21C2 = 210 pairs.
    """
    differences = [first - second for first, second in combinations(array, 2)]
    return np.concatenate(differences, 0)

In [4]:
def preprocess_file_info(file_info, image_dir, split, label_info_mapping, blob_detector, flip=False, image_width=1920):
    """Turn one annotation file into per-hand, per-frame feature rows.

    Args:
        file_info: Parsed annotation dict. Reads ``"id"``, ``"annotations"``
            (a list of per-frame dicts with a ``"data"`` keypoint list) and,
            for the train split, ``"action"``.
        image_dir: Directory containing ``<id>/<frame>.png`` images.
        split: ``"train"`` adds label columns; any other value skips them.
        label_info_mapping: ``pose_id -> dict`` of label columns merged into
            every row of a train file.
        blob_detector: Detector passed on to ``get_num_keypoint``.
        flip: If true, mirror the x coordinates and swap a left/right
            ``hand_type`` label.
        image_width: Width used for mirroring (``x -> image_width - x``).

    Returns:
        list[dict]: One row per (hand, frame) with bbox, angle,
        pairwise-difference, pairwise-distance and finger-fold features.
    """
    rows = []
    row_file = {
        "id": file_info["id"],
        "n_img": len(file_info["annotations"]),
        "flip": flip,
    }

    # Keypoints of every frame of the file: (n_frames, n_keypoints, 3).
    all_frame_ann_array = np.array([kps["data"] for kps in file_info["annotations"]])

    if flip:
        all_frame_ann_array[:, :, 0] = image_width - all_frame_ann_array[:, :, 0]

    # Files 475 and 543 keep only one block of 21 keypoints. The selection
    # must be applied to the array that is reshaped below; previously it was
    # assigned to a variable that was immediately overwritten and had no effect.
    # NOTE(review): presumably the dropped block is a wrongly annotated hand - confirm.
    if all_frame_ann_array.shape[1] > 21:
        if row_file["id"] == 475:
            all_frame_ann_array = all_frame_ann_array[:, 21:]
        elif row_file["id"] == 543:
            all_frame_ann_array = all_frame_ann_array[:, :21]

    # (n_frames, n_hands, 21, 3)
    all_frame_keypoint_array = all_frame_ann_array.reshape(all_frame_ann_array.shape[0], -1, 21, 3)
    # Per-frame, per-hand box: (min_x, min_y, min_z, max_x, max_y, max_z).
    file_all_frame_one_box = np.c_[all_frame_keypoint_array.min(2), all_frame_keypoint_array.max(2)]

    # 2-D distances between neighbouring keypoints 5-9, 9-13 and 13-17.
    all_frame_palm_width = np.linalg.norm(
        all_frame_keypoint_array[:, :, [5, 9, 13], :2] - all_frame_keypoint_array[:, :, [9, 13, 17], :2],
        axis=-1,
    )
    # Range of those distances over the frames, averaged over the three pairs.
    pw_max_diff = (all_frame_palm_width.max(0) - all_frame_palm_width.min(0)).mean(-1)
    file_palm_height = np.linalg.norm(
        all_frame_keypoint_array[:, :, 0, :2] - all_frame_keypoint_array[:, :, 5, :2],
        axis=-1,
        keepdims=True,
    )
    # Keypoint 8 relative to keypoint 0, scaled by the 0-5 distance.
    file_index_move_mean = (
        all_frame_keypoint_array[:, :, 8, :2] - all_frame_keypoint_array[:, :, 0, :2]
    ) / file_palm_height

    if split == "train":
        # Add the label information for the train split.
        row_file["pose_id"] = file_info["action"][0]
        if row_file["id"] == 282:
            # File 282 is mis-labelled: 약속1 -> 약속2.
            row_file["pose_id"] = 74
        row_file.update(label_info_mapping[row_file["pose_id"]])
        if flip:
            # Mirroring the image swaps the handedness label.
            if row_file["hand_type"] == "left":
                row_file["hand_type"] = "right"
            elif row_file["hand_type"] == "right":
                row_file["hand_type"] = "left"

    n_boxes = all_frame_keypoint_array.shape[1]

    # The blob count is a per-file value taken from the first frame image;
    # compute it once instead of lazily inside the nested loops.
    n_blob_keypoints = None
    n_hands = None
    if n_boxes > 0 and row_file["n_img"] > 0:
        first_image_path = os.path.join(image_dir, f"{row_file['id']}/0.png")
        n_blob_keypoints = get_num_keypoint(first_image_path, blob_detector)
        # A single hand never has more than 21 keypoints; 23 leaves some slack.
        n_hands = "single" if n_blob_keypoints < 23 else "both"

    for bbox_idx in range(n_boxes):
        row_bbox = {"bbox_num": bbox_idx}
        row_bbox.update(row_file)
        row_bbox["pw_max_diff"] = pw_max_diff[bbox_idx]
        row_bbox["index_move_mean"] = pdist(file_index_move_mean[:, bbox_idx]).mean()
        row_bbox["index_move_x_mean"] = pdist(file_index_move_mean[:, bbox_idx, :1]).mean()
        row_bbox["n_blob_keypoints"] = n_blob_keypoints
        row_bbox["n_hands"] = n_hands
        all_frame_one_box = file_all_frame_one_box[:, bbox_idx]

        for frame_idx in range(row_file["n_img"]):
            row = {"img_num": frame_idx}
            row.update(row_bbox)

            # All keypoints of the frame (every hand) and those of this hand only.
            ann_array = all_frame_keypoint_array[frame_idx].reshape(-1, 3)
            keypoint_array = all_frame_keypoint_array[frame_idx, bbox_idx]

            # Box of this hand, with its width and height.
            one_bbox = all_frame_one_box[frame_idx]
            for i, b in enumerate(one_bbox):
                row[f"one_bbox_{i}"] = b
            row["one_bbox_w"] = one_bbox[3] - one_bbox[0]
            row["one_bbox_h"] = one_bbox[4] - one_bbox[1]

            # Box over all hands of the frame, with its width and height.
            bbox = np.r_[ann_array.min(0), ann_array.max(0)]
            for i, b in enumerate(bbox):
                row[f"bbox_{i}"] = b
            row["bbox_w"] = bbox[3] - bbox[0]
            row["bbox_h"] = bbox[4] - bbox[1]

            # Cosine / sine of the angle of every keypoint triple (2-D).
            coses = cos_of_triples(keypoint_array[:, :2])
            for i, cos_angle in enumerate(coses):
                row[f"cos_{i}"] = cos_angle
            sins = sin_of_triples(keypoint_array[:, :2])
            for i, sin_angle in enumerate(sins):
                row[f"sin_{i}"] = sin_angle
            for i, sin_angle in enumerate(sins):
                row[f"sin_abs_{i}"] = abs(sin_angle)

            # Palm height used to normalise pairwise differences and distances.
            # Original author's note: scaling by the 2-D value seemed to perform better.
            palm_height = np.linalg.norm(keypoint_array[0, :2] - keypoint_array[5, :2])

            # Difference of every keypoint pair: 210 pairs * 3 dims = 630 values.
            diffs = pairwise_diff(keypoint_array) / palm_height
            for i, diff in enumerate(diffs):
                row[f"diff_{i}"] = diff

            # Euclidean distance of every keypoint pair.
            # Original author's note: 3-D distances performed better than 2-D ones.
            dists = pdist(keypoint_array) / palm_height
            for i, dist in enumerate(dists):
                row[f"dist_{i}"] = dist

            thumb_knuckle = keypoint_array[4, :2] - keypoint_array[3, :2]
            index_knuckle = keypoint_array[8, :2] - keypoint_array[7, :2]
            palm_knuckle = keypoint_array[13, :2] - keypoint_array[17, :2]

            row["thumb_index_cos"] = cos(thumb_knuckle, index_knuckle)
            row["thumb_palm_cos"] = cos(thumb_knuckle, palm_knuckle)
            row["dist_index"] = np.linalg.norm(keypoint_array[8, :2] - keypoint_array[5, :2]) / palm_height
            row["dist_middle"] = np.linalg.norm(keypoint_array[12, :2] - keypoint_array[9, :2]) / palm_height
            row["dist_ring"] = np.linalg.norm(keypoint_array[16, :2] - keypoint_array[13, :2]) / palm_height
            row["dist_little"] = np.linalg.norm(keypoint_array[20, :2] - keypoint_array[17, :2]) / palm_height

            # Boolean "folded" flags from fixed thresholds.
            row["fold_thumb"] = row["thumb_palm_cos"] < 0.2
            row["fold_index"] = row["dist_index"] < 0.5
            row["fold_middle"] = row["dist_middle"] < 0.5
            row["fold_ring"] = row["dist_ring"] < 0.5
            row["fold_little"] = row["dist_little"] < 0.5

            rows.append(row)
    return rows


def data2df(data_path, image_dir, split, label_info_mapping, csv_path=None):
    """
    Process the keypoint and label data of one split into a data frame,
    optionally saving it as csv.

    Args:
        data_path: Directory holding one sub-directory per sample; each is
            expected to contain ``<name>/<name>.json``.
        image_dir: Root image directory; ``split`` is appended to it.
        split: Split name, e.g. ``"train"`` or ``"test"``.
        label_info_mapping: ``pose_id -> label dict`` forwarded to
            ``preprocess_file_info``.
        csv_path: If given, the frame is written there (cp949 encoding).

    Returns:
        pandas.DataFrame with the original and mirrored rows of every
        non-excluded file, sorted by ``id``.
    """
    # Ids dropped from the output.
    # NOTE(review): the reason for excluding these ids is not visible here - confirm.
    excluded_ids = (490, 586, 596, 613)

    # Example values: ego-vision\new_jsons\train / ego-vision / train
    print(data_path)
    print(image_dir)
    print(split)

    data_list = os.listdir(data_path)
    print(len(data_list))
    image_dir = os.path.join(image_dir, split)

    blob_detector = make_blob_detector()

    print("Preprocessing")
    rows = []
    for file_name in tqdm(data_list, split):
        json_path = os.path.join(data_path, file_name) + '/' + str(file_name) + '.json'
        json_path = json_path.replace('\\', '/')
        with open(json_path) as f:
            file_info = json.load(f)

        # Decide before preprocessing: skips two full passes (and an image
        # read) per excluded file and no longer indexes into an empty row list.
        if file_info["id"] in excluded_ids:
            continue

        for flip in (False, True):
            rows.extend(
                preprocess_file_info(file_info, image_dir, split, label_info_mapping, blob_detector, flip)
            )

    # A stable sort keeps the row order produced above within each id.
    df = pd.DataFrame(rows).sort_values("id", kind="stable").reset_index(drop=True)
    if csv_path is not None:
        df.to_csv(csv_path, index=False, encoding="cp949")
    return df

In [5]:
data_path = "ego-vision"
new_json_path = os.path.join(data_path, "new_jsons")

submission_test_path = "."
os.makedirs(submission_test_path, exist_ok=True)
os.chdir(submission_test_path)

# The pose_name information is made more specific:
# 1. The 약속 class is split into 약속1 and 약속2 according to the thumb position.
# 2. Number classes are spelled both with and without a space ('숫자 1', '숫자1'); merge them.
# 3. 동그라미 looks very different with one hand and with both hands, so it is split.
# 4. 손하트 differs a lot between my_hand and your_hand, so it is split.
# => the number of distinct pose_name values becomes 41.
label_info = pd.read_csv(os.path.join(data_path, "hand_gesture_pose.csv"))
label_info.pose_name = label_info.pose_name.str.replace("숫자 ", "숫자")

# New pose_name -> pose_ids that receive it.
for new_pose_name, renamed_pose_ids in (
    ("약속1", (29, 54, 79, 129, 154)),
    ("약속2", (49, 74, 124, 149, 174)),
    ("동그라미-양손", (90, 190)),
    ("손하트-yourhand", (145,)),
):
    label_info.loc[label_info.pose_id.isin(renamed_pose_ids), "pose_name"] = new_pose_name

assert label_info.pose_name.nunique() == 41  # check the edited pose_name values

# pose_id -> remaining label columns of that pose.
label_info_mappnig = {
    record["pose_id"]: {key: value for key, value in record.items() if key != "pose_id"}
    for record in label_info.to_dict("records")
}

train_data_path = os.path.join(data_path, "train")
train_new_json_path = os.path.join(new_json_path, "train")
train_df = data2df(
    train_new_json_path,
    data_path,
    "train",
    label_info_mappnig,
    "train_annotation_frame.csv",
)

test_data_path = os.path.join(data_path, "test")
test_new_json_path = os.path.join(new_json_path, "test")
test_df = data2df(
    test_new_json_path,
    data_path,
    "test",
    label_info_mappnig,
    "test_annotation_frame.csv",
)

ego-vision\new_jsons\train
ego-vision
train
649
Preprocessing


train: 100%|█████████████████████████████████████████████████████████████████████████| 649/649 [01:40<00:00,  6.45it/s]


ego-vision\new_jsons\test
ego-vision
test
217
Preprocessing


test: 100%|██████████████████████████████████████████████████████████████████████████| 217/217 [00:33<00:00,  6.42it/s]


In [6]:
def process_bbox(image, bbox_xyxy, margin=100):
    """
    Crop ``image`` to a bounding box enlarged by a fixed margin.

    1. Box expansion: every side is moved outwards by ``margin`` pixels
       (100 by default).
    2. Clipping: a box reaching beyond the image is passed to ``image.crop``
       unchanged; the original note describes the result as a zero-padded bbox.

    Args:
        image: Object with a ``crop(box)`` method (a PIL image in this file).
        bbox_xyxy: ``(x_min, y_min, x_max, y_max)``. It is NOT modified; the
            expansion is computed on a new array.
        margin: Number of pixels added on every side.

    Returns:
        Whatever ``image.crop`` returns for the expanded box.
    """
    outward = np.array([-margin, -margin, margin, margin])
    # Adding creates a fresh array, so the caller's box is left untouched.
    expanded_xyxy = np.asarray(bbox_xyxy) + outward
    return image.crop(expanded_xyxy)


class CsvDataset(Dataset):
    """
    Dataset of per-hand image crops for CLIP, built from the csv file
    produced by the preprocessing step.

    Each item is the image ``<image_root>/<id>/<img_num>.png``, mirrored when
    the row's ``flip`` value is truthy, cropped around the row's
    ``one_bbox_*`` box by ``process_bbox`` and passed through ``transform``
    when one is given.
    """
    def __init__(self, image_root, csv_path, transform=None) -> None:
        """
        Args:
            image_root: Directory containing ``<id>/<img_num>.png`` files.
            csv_path: Path of the preprocessed csv (read with cp949 encoding).
            transform: Optional callable applied to the cropped image.
        """
        super().__init__()
        self.image_root = image_root
        self.transform = transform
        self.data_info = pd.read_csv(csv_path, encoding="cp949")

    def __getitem__(self, index):
        """Return the (optionally transformed) crop for row ``index``."""
        item = self.data_info.iloc[index]
        # A row taken with iloc may be upcast to float; cast both path parts
        # so the file name is "0.png" and never "0.0.png".
        file_id = int(item.id)
        img_num = int(item.img_num)
        img_path = os.path.join(self.image_root, f"{file_id}/{img_num}.png")
        image = pil_loader(img_path)
        if item.flip:
            # Mirror the image to match the row's flip flag before cropping.
            image = image.transpose(method=Image.FLIP_LEFT_RIGHT)
        # x/y corners only: one_bbox_2 and one_bbox_5 are skipped.
        bbox_xyxy = np.array([item.one_bbox_0, item.one_bbox_1, item.one_bbox_3, item.one_bbox_4])
        image = process_bbox(image, bbox_xyxy)
        if self.transform is not None:
            # transform defaults to None; calling it unconditionally raised TypeError.
            image = self.transform(image)
        return image

    def __len__(self):
        """Number of rows in the csv."""
        return self.data_info.shape[0]


def get_features(dataset):
    """
    Compute the CLIP image features of every item of ``dataset``.

    Relies on the notebook-level ``model`` (must provide ``encode_image``)
    and ``device``, which have to be defined before this function is called.

    Args:
        dataset: Dataset yielding image tensors; iterated in batches of 100.

    Returns:
        numpy.ndarray with one row of features per dataset item.
    """
    all_features = []
    with torch.no_grad():
        for images in tqdm(DataLoader(dataset, batch_size=100, num_workers=0)):
            features = model.encode_image(images.to(device))
            # Move each batch off the device right away so that only one
            # batch of features is resident on it at a time.
            all_features.append(features.cpu())
    return torch.cat(all_features).numpy()

In [7]:
# Load the CLIP model (the clip package is imported in the first cell).
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

train = CsvDataset(os.path.join(data_path, "train"), "train_annotation_frame.csv", preprocess)
test = CsvDataset(os.path.join(data_path, "test"), "test_annotation_frame.csv", preprocess)

# Columns that identify a row; stored next to the features so the two csv
# files can be joined later.
clip_key_columns = ["id", "img_num", "bbox_num", "flip"]

# Image features of the train split.
train_features = get_features(train)
train_clip_features = pd.DataFrame(
    train_features,
    columns=[f"clip_bbox_feature_{i}" for i in range(train_features.shape[1])],
)
pd.concat(
    (train.data_info[clip_key_columns], train_clip_features),
    axis=1,
).to_csv("train_annotation_clip_bbox100_frame.csv", index=False)

# Image features of the test split.
test_features = get_features(test)
test_clip_features = pd.DataFrame(
    test_features,
    columns=[f"clip_bbox_feature_{i}" for i in range(test_features.shape[1])],
)
pd.concat(
    (test.data_info[clip_key_columns], test_clip_features),
    axis=1,
).to_csv("test_annotation_clip_bbox100_frame.csv", index=False)

100%|████████████████████████| 353976522/353976522 [00:52<00:00, 6681305.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 139/139 [21:54<00:00,  9.46s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [07:37<00:00,  9.35s/it]


In [8]:
train_annotation_path = "train_annotation_frame.csv"
train_annotation_clip_bbox_path = "train_annotation_clip_bbox100_frame.csv"

data_test_path = os.path.join(data_path, "test")
test_annotation_path = "test_annotation_frame.csv"
test_annotation_clip_bbox_path = "test_annotation_clip_bbox100_frame.csv"

# CLIP feature tables written by the previous step.
train_df_clip_bbox = pd.read_csv(train_annotation_clip_bbox_path)
test_df_clip_bbox = pd.read_csv(test_annotation_clip_bbox_path)

# Keypoint feature tables, joined with the CLIP features on the row keys.
train_df = pd.read_csv(train_annotation_path, encoding="cp949").merge(
    train_df_clip_bbox,
    on=["id", "img_num", "bbox_num", "flip"],
)
test_df = pd.read_csv(test_annotation_path, encoding="cp949").merge(
    test_df_clip_bbox,
    on=["id", "img_num", "bbox_num", "flip"],
)

In [9]:
# n_hands is a string column; store it as a categorical in both frames.
for annotated_frame in (train_df, test_df):
    annotated_frame["n_hands"] = pd.Categorical(annotated_frame["n_hands"])

# Names of the model input columns, grouped by origin.
features = [
    # keypoint geometry
    *(f"diff_{i}" for i in range(630)),
    *(f"dist_{i}" for i in range(210)),
    *(f"cos_{i}" for i in range(27)),
    *(f"sin_{i}" for i in range(27)),
    *(f"sin_abs_{i}" for i in range(27)),
    # CLIP image features of the hand crop
    *(f"clip_bbox_feature_{i}" for i in range(512)),
    # blob count and box sizes
    "n_blob_keypoints", "n_hands", "bbox_w", "bbox_h", "one_bbox_w", "one_bbox_h",
    # thumb / finger-fold features
    "thumb_index_cos", "thumb_palm_cos", "fold_thumb", "fold_index", "fold_middle", "fold_ring", "fold_little",
    # per-file movement features
    "pw_max_diff", "index_move_mean", "index_move_x_mean",
]

In [10]:
def train_model(train_df, test_df, features, target_col, path="log", add_params=None):
    """Train a LightGBM multiclass model for ``target_col`` and predict the test set.

    Side effects: ``train_df[target_col]`` is converted to a categorical, a
    ``pred_<target_col>`` column is added to both frames (true labels for
    train, per-frame predicted labels for test) and the per-id result is
    written to ``<path>/result_test_<target_col>.csv``.

    Args:
        train_df: Training frame containing ``features`` and ``target_col``.
        test_df: Test frame containing ``features``, ``id`` and, for the
            ``hand_type`` target, a boolean ``flip`` column.
        features: Names of the input columns.
        target_col: Name of the label column to learn.
        path: Output directory, created if missing.
        add_params: Optional extra LightGBM parameters merged over the defaults.

    Returns:
        pandas.DataFrame with one row per test ``id``: the id, the predicted
        class index and the mean class probabilities (one column per category).
    """
    print("Start training")
    print(f"Target: {target_col}")

    os.makedirs(path, exist_ok=True)

    train_df[target_col] = pd.Categorical(train_df[target_col])
    categories = train_df[target_col].cat.categories

    x_train = train_df[features]
    y_train = train_df[target_col].cat.codes
    num_class = len(set(y_train))

    dtrain = lgb.Dataset(x_train, label=y_train)

    x_test = test_df[features]

    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": num_class,
        "verbosity": -1,
    }
    # add_params defaults to None rather than a shared mutable dict.
    params.update(add_params or {})

    # The true label becomes an input feature for the models trained after this one.
    train_df[f"pred_{target_col}"] = pd.Categorical(train_df[target_col])

    model = lgb.train(params, dtrain)
    result = model.predict(x_test)

    # Use the SAME categories as the training column. A categorical built
    # from the raw class indices has categories 0..n-1, which do not match
    # the training labels; LightGBM then re-maps every test value to missing.
    test_df[f"pred_{target_col}"] = pd.Categorical.from_codes(result.argmax(-1), categories=categories)

    frame_probs = result
    if target_col == "hand_type":
        # Predictions on mirrored frames refer to the mirrored hand: swap the
        # left/right probabilities back before averaging per id. The columns
        # are looked up by name instead of assuming positions 1 and 2.
        flipped = test_df["flip"].to_numpy(dtype=bool)
        left_idx = categories.get_loc("left")
        right_idx = categories.get_loc("right")
        frame_probs = result.copy()
        frame_probs[flipped, left_idx] = result[flipped, right_idx]
        frame_probs[flipped, right_idx] = result[flipped, left_idx]

    # Positional concat: reset the index so a non-default test_df index
    # cannot misalign ids and probabilities.
    new_df = pd.concat((test_df[["id"]].reset_index(drop=True), pd.DataFrame(frame_probs)), axis=1)

    # Average the frame-level probabilities of every id.
    merged_df = new_df.groupby("id", as_index=False).mean()
    result = merged_df.iloc[:, 1:(1 + num_class)].to_numpy()
    y_pred = result.argmax(-1)

    result_df = pd.concat(
        (merged_df.id, pd.DataFrame(np.concatenate((y_pred[:, None], result), 1))),
        ignore_index=True,
        axis=1,
    )
    result_df.columns = ["id", f"pred_{target_col}"] + categories.to_list()

    result_df.to_csv(f"{path}/result_test_{target_col}.csv", index=False)
    return result_df

In [11]:
save_path = "submission"
# Targets are trained in this order; each model receives the labels /
# predictions of the earlier targets as additional input features.
targets = ("gesture_type", "hand_type", "pose_name")
# Extra LightGBM parameters, one dict per target (same order as `targets`).
train_add_params = [
    {
        "feature_pre_filter": False,
        "lambda_l1": 1.7657137779105168e-06,
        "lambda_l2": 2.3530332427385596e-06,
        "num_leaves": 6,
        "feature_fraction": 0.4,
        "bagging_fraction": 0.6541928796037666,
        "bagging_freq": 5,
        "min_child_samples": 20
    },
    {
        "feature_pre_filter": False,
        "lambda_l1": 1.5204270129130175e-08,
        "lambda_l2": 0.31158648353398066,
        "num_leaves": 140,
        "feature_fraction": 0.41600000000000004,
        "bagging_fraction": 0.7616580256435892,
        "bagging_freq": 4,
        "min_child_samples": 20
    },
    {
        "num_boost_round": 10000,
        "feature_pre_filter": False,
        "min_data_in_leaf": 100,
        "lambda_l1": 8.685219254418955e-07,
        "lambda_l2": 8.519494831720772,
        "num_leaves": 141,
        "feature_fraction": 0.41600000000000004,
        "bagging_fraction": 0.44262163491880324,
        "bagging_freq": 1,
    }
]

result_dfs = {}
# Grow a copy of the feature list: appending to the shared `features` list
# made this cell non-idempotent (a re-run trained on duplicated pred_* columns).
stacked_features = list(features)
for target_col, add_params in tqdm(zip(targets, train_add_params), total=len(targets)):
    result_dfs[target_col] = train_model(train_df, test_df, stacked_features, target_col, save_path, add_params)
    stacked_features.append(f"pred_{target_col}")

0it [00:00, ?it/s]

Start training
Target: gesture_type


1it [00:02,  2.97s/it]

Start training
Target: hand_type




Start training
Target: pose_name


3it [09:44, 194.85s/it]


In [12]:
# Replace the class probabilities of these two targets by a one-hot vector
# of the most probable class (columns 0 and 1 hold the id and the prediction).
for target_col in ("hand_type", "gesture_type"):
    probs = result_dfs[target_col].iloc[:, 2:].to_numpy()
    winners = probs.argmax(-1)
    one_hot = np.zeros_like(probs)
    one_hot[np.arange(probs.shape[0]), winners] = 1.
    result_dfs[target_col].iloc[:, 2:] = one_hot

In [13]:
sample_submision_path = os.path.join(data_path, "sample_submission.csv")
submission = pd.read_csv(sample_submision_path)

# Take the row count from the first target explicitly; previously this read
# `target_col` before the inner loop defined it, i.e. whatever value an
# earlier cell had left behind.
n_samples = result_dfs[targets[0]].shape[0]
# Values are assigned by position.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the id-sorted results - confirm.
assert submission.shape[0] == n_samples, (
    f"sample submission has {submission.shape[0]} rows but {n_samples} ids were predicted"
)

for row in label_info.to_dict("records"):
    # Probability of a pose_id = product of the probabilities of its
    # gesture_type, hand_type and pose_name.
    col_prob = np.ones(n_samples, dtype=float)
    for target_col in targets:
        col_prob *= result_dfs[target_col][row[target_col]]
    submission[f"Label_{row['pose_id']}"] = col_prob

# Normalise every row so that the label probabilities sum to 1.
label_probs = submission.iloc[:, 1:].to_numpy()
submission.iloc[:, 1:] = label_probs / np.sum(label_probs, axis=1, keepdims=True)

os.makedirs(save_path, exist_ok=True)
submission.to_csv(f"{save_path}/submission.csv", index=False)