In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms
import torch.nn.functional as F
from torch.utils.data import Dataset

import json
import os
import glob
from tqdm.notebook import tqdm
import numpy as np

from PIL import Image
import cv2
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Version 1 (best)
class FeatureAggregator(nn.Module):
    """Fuse an image embedding with a bounding-box embedding into one feature vector.

    The image goes through ``img_encoder`` (whose output is fed to a layer
    expecting 1000 inputs) and a hidden layer; the 4-value bbox goes through
    its own hidden layer. The two 512-d results are summed, passed through a
    1024-d hidden layer and projected to ``feat_dim``.
    """

    def __init__(self, img_encoder, feat_dim):
        super().__init__()
        # The encoder is switched to eval mode at construction time.
        # NOTE(review): nn.Module.train() recurses into child modules, so a later
        # `.train()` call on this aggregator would put the encoder back in train mode.
        img_encoder.eval()
        self.img_encoder = img_encoder
        self.img_head = self.create_hidden_layer(1000, 512)
        self.bbox_head = self.create_hidden_layer(4, 512)
        self.agg_head = self.create_hidden_layer(512, 1024)
        self.final_layer = nn.Linear(1024, feat_dim)

    def forward(self, img, bbox):
        """Return the aggregated feature for a batch.

        img: (bs, 3, h, w)
        bbox: (bs, 4)
        """
        # Element-wise sum of the two branches (not concatenation).
        fused = self.img_head(self.img_encoder(img)) + self.bbox_head(bbox)
        return self.final_layer(self.agg_head(fused))

    def create_hidden_layer(self, in_dim, out_dim):
        """Build a Linear -> BatchNorm1d -> LeakyReLU(0.01) block."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.LeakyReLU(0.01),
        )

# # Version 2
# class FeatureAggregator(nn.Module):
#     def __init__(self, img_encoder, feat_dim):
#         super().__init__()
#         self.img_encoder = img_encoder
#         self.img_encoder.eval()
#         self.img_head = self.create_hidden_layer(1000, feat_dim)
#         self.bbox_head = self.create_hidden_layer(4, feat_dim)
#         self.agg_head = self.create_hidden_layer(feat_dim, feat_dim)
    
#     def forward(self, img, bbox):
#         '''
#         img: (bs, 3, h, w)
#         bbox: (bs, 4)
#         '''
#         img_feat = self.img_encoder(img)
#         img_feat = self.img_head(img_feat)
#         bbox_feat = self.bbox_head(bbox)
# #         feat = torch.cat([img_feat, bbox_feat], dim=-1)
#         feat = img_feat + bbox_feat
#         feat = self.agg_head(feat)
#         return feat
    
#     def create_hidden_layer(self, in_dim, out_dim):
#         layer = nn.Sequential(
#             nn.Linear(in_dim, 256),
#             nn.BatchNorm1d(256),
#             nn.LeakyReLU(0.01),
#             nn.Linear(256, out_dim)
#         )
#         return layer

# # Version 3
# class FeatureAggregator(nn.Module):
#     def __init__(self, img_encoder, feat_dim):
#         super().__init__()
#         self.img_encoder = img_encoder
#         self.img_encoder.eval()
        
#         self.jk_img_head = self.create_hidden_layer(1000, feat_dim)
#         self.jk_bbox_head = self.create_hidden_layer(4, feat_dim)
#         self.jk_agg_head = self.create_hidden_layer(feat_dim*2, feat_dim)
        
#         self.sdcl_img_head = self.create_hidden_layer(1000, feat_dim)
#         self.sdcl_bbox_head = self.create_hidden_layer(4, feat_dim)
#         self.sdcl_agg_head = self.create_hidden_layer(feat_dim*2, feat_dim)
        
#         self.cap_img_head = self.create_hidden_layer(1000, feat_dim)
#         self.cap_bbox_head = self.create_hidden_layer(4, feat_dim)
#         self.cap_agg_head = self.create_hidden_layer(feat_dim*2, feat_dim)
    
#     def forward(self, img, bbox, obj_id):
#         '''
#         img: (bs, 3, h, w)
#         bbox: (bs, 4)
#         '''
#         img_feat = self.img_encoder(img)
#         if obj_id == 0:
#             img_feat = self.jk_img_head(img_feat)
#             bbox_feat = self.jk_bbox_head(bbox)
#             feat = torch.cat([img_feat, bbox_feat], dim=-1)
#             feat = self.jk_agg_head(feat)
#         if obj_id == 1:
#             img_feat = self.sdcl_img_head(img_feat)
#             bbox_feat = self.sdcl_bbox_head(bbox)
#             feat = torch.cat([img_feat, bbox_feat], dim=-1)
#             feat = self.sdcl_agg_head(feat)
#         if obj_id == 2:
#             img_feat = self.cap_img_head(img_feat)
#             bbox_feat = self.cap_bbox_head(bbox)
#             feat = torch.cat([img_feat, bbox_feat], dim=-1)
#             feat = self.cap_agg_head(feat)
            
#         return feat
    
    
#     def create_hidden_layer(self, in_dim, out_dim):
#         layer = nn.Sequential(
#             nn.Linear(in_dim, 256),
#             nn.BatchNorm1d(256),
#             nn.LeakyReLU(0.01),
#             nn.Linear(256, out_dim)
#         )
#         return layer

In [4]:
class CustomDataset(Dataset):
    """Frame-level dataset of detection crops and their scaled bounding boxes.

    Each item corresponds to one frame id. For every group listed for that
    frame in the annotation JSON, the item holds one slot per object type
    ('jockey', 'sdcl', 'cap'); a slot is ``None`` when the group has no
    detection of that type.

    Args:
        img_dir: Root directory holding crops at ``<frame_id>/<obj>/<grp_id>.jpg``.
        frame_ids: Sequence of frame ids. ``'_'`` is replaced by ``'/'`` and
            ``'.jpg'`` is appended to form the key into the annotation JSON.
        json_path: Path to the annotation JSON, read as
            ``{frame_key: {grp_id: {obj: bbox}}}``.
        inp_size: Side length (pixels) of the square image fed to the network.
        orig_im_size: ``(height, width)`` of the original frames, used to scale
            the bbox values.
    """

    # Object types that get a slot in every returned item, in return order.
    _OBJ_TYPES = ('jockey', 'sdcl', 'cap')

    def __init__(self, img_dir, frame_ids, json_path, inp_size:int, orig_im_size=(1080, 1920)):
        self.img_dir = img_dir
        self.frame_ids = frame_ids
        with open(json_path, 'r') as f:
            self.data = json.load(f)
        self.inp_size = inp_size
        self.orig_h = orig_im_size[0]
        self.orig_w = orig_im_size[1]

    def __getitem__(self, idx):
        """Load every crop of frame ``idx``.

        Returns:
            ``(jk_imgs, sdcl_imgs, cap_imgs, jk_bboxes, sdcl_bboxes, cap_bboxes)``:
            six lists with one entry per group, each entry a tensor or ``None``.

        Raises:
            FileNotFoundError: if a crop listed in the annotations cannot be read.
        """
        frame_id = self.frame_ids[idx]
        frame_info = self.data[frame_id.replace('_', '/')+'.jpg']

        imgs = {obj: [] for obj in self._OBJ_TYPES}
        bboxes = {obj: [] for obj in self._OBJ_TYPES}
        # bbox entries 0 and 2 are divided by the frame width, 1 and 3 by its height.
        scales = (self.orig_w, self.orig_h, self.orig_w, self.orig_h)

        for grp_id, detect_results in frame_info.items():
            # Create placeholders for non-existing data
            for obj in self._OBJ_TYPES:
                imgs[obj].append(None)
                bboxes[obj].append(None)

            for obj, bbox in detect_results.items():
                if obj not in imgs:
                    # Unknown object types never reach the output, so skip the image I/O.
                    continue
                bbox = list(map(float, bbox))
                for k, scale in enumerate(scales):
                    bbox[k] = bbox[k]/scale

                img_path = os.path.join(self.img_dir, frame_id, obj, f'{grp_id}.jpg')
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals a missing/unreadable file by returning None.
                    raise FileNotFoundError(f'Could not read crop image: {img_path}')
                img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

                imgs[obj][-1] = self.preprocess(img)
                bboxes[obj][-1] = torch.tensor(bbox)

        return (imgs['jockey'], imgs['sdcl'], imgs['cap'],
                bboxes['jockey'], bboxes['sdcl'], bboxes['cap'])

    def _letterbox_geometry(self, w, h):
        """Return ``((new_h, new_w), (l_pad, t_pad, r_pad, b_pad))`` for a ``w`` x ``h`` image.

        The long edge is scaled to ``inp_size`` and the short edge is padded
        evenly on both sides (the extra pixel, if any, goes right/bottom).
        """
        resize_ratio = self.inp_size / max(w, h)
        # Clamp so extremely thin crops never resize to a zero-pixel edge and
        # rounding can never yield a negative padding.
        new_h = min(self.inp_size, max(1, round(h*resize_ratio)))
        new_w = min(self.inp_size, max(1, round(w*resize_ratio)))
        w_diff = self.inp_size - new_w
        h_diff = self.inp_size - new_h
        l_pad = w_diff//2
        t_pad = h_diff//2
        return (new_h, new_w), (l_pad, t_pad, w_diff - l_pad, h_diff - t_pad)

    def preprocess(self, img: 'Image.Image') -> torch.Tensor:
        """Resize ``img`` keeping its aspect ratio, zero-pad to a square and normalise.

        Returns a tensor produced by ``ToTensor`` + ``Normalize`` with the
        mean/std constants given below.
        """
        w, h = img.size
        resize_shape, padding = self._letterbox_geometry(w, h)

        transform = transforms.Compose([
            transforms.Resize(resize_shape),  # interpolation `BILINEAR` is applied by default
            transforms.Pad(padding=padding, fill=0, padding_mode='constant'),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        return transform(img)

    def __len__(self):
        """Number of frames in the dataset."""
        return len(self.frame_ids)

In [5]:
json_path = '/home/jayson/gcn_clustering/data/extracted_data.json'
model_dst = '/home/jayson/gcn_clustering/triplet'

inp_size = 64
feat_dim = 512
dist_metric = 'l2'
k1 = 8  # Number of 1-hop nearest neighbors

# Pick the checkpoint whose filename ends in the largest number
# (the text between the last '_' and '.pth').
weights = glob.glob(f'/home/jayson/gcn_clustering/triplet_models/triplet_{dist_metric}_{feat_dim}/*.pth')
if not weights:
    raise FileNotFoundError(f'No .pth checkpoints found for triplet_{dist_metric}_{feat_dim}')
weight_path = max(weights, key=lambda x: float(x.split('_')[-1].replace('.pth', '')))
print(weight_path)
# NOTE(review): the checkpoint is a pickled model object, so torch.load executes
# pickle code; only load trusted files. Newer PyTorch releases may need
# `weights_only=False` to load a full model this way -- confirm for the
# installed version.
feat_agg = torch.load(weight_path, map_location=device)
feat_agg.eval()

if dist_metric == 'cosine':
    dist = nn.CosineSimilarity(dim=1, eps=1e-6)
elif dist_metric == 'l2':
    dist = nn.PairwiseDistance(p=2.0, eps=1e-06, keepdim=True)
else:
    raise ValueError(f"Unsupported dist_metric {dist_metric!r}; expected 'cosine' or 'l2'")

# Crops live at <img_dir>/<frame_id>/<obj>/<grp_id>.jpg, so the frame id is the
# third path component from the end. Sorting makes the frame order reproducible.

# Train
train_img_dir = '/home/jayson/gcn_clustering/data/all_extracted/train'
train_img_paths = glob.glob(os.path.join(train_img_dir, '*', '*', '*.jpg'))
train_frame_ids = sorted({p.split('/')[-3] for p in train_img_paths})
train_set = CustomDataset(train_img_dir, train_frame_ids, json_path, inp_size)

# Val
val_img_dir = '/home/jayson/gcn_clustering/data/all_extracted/val'
val_img_paths = glob.glob(os.path.join(val_img_dir, '*', '*', '*.jpg'))
val_frame_ids = sorted({p.split('/')[-3] for p in val_img_paths})
val_set = CustomDataset(val_img_dir, val_frame_ids, json_path, inp_size)

# Test
test_img_dir = '/home/jayson/gcn_clustering/data/all_extracted/test'
test_img_paths = glob.glob(os.path.join(test_img_dir, '*', '*', '*.jpg'))
test_frame_ids = sorted({p.split('/')[-3] for p in test_img_paths})
test_set = CustomDataset(test_img_dir, test_frame_ids, json_path, inp_size)

/home/jayson/gcn_clustering/triplet_models/triplet_l2_512/feat_agg_13300.0943.pth


In [6]:
def extract_save_graph_data(dataset, data_type):
    """Embed every detection of every frame, build per-frame kNN graphs and save them.

    Reads the notebook-level globals ``feat_agg``, ``device``, ``dist``,
    ``dist_metric``, ``k1`` and ``feat_dim``.

    Args:
        dataset: Iterable of items shaped like ``CustomDataset.__getitem__`` output
            (six parallel per-group lists of tensors or ``None``).
        data_type: Name of the split; used as the last component of the output directory.

    Returns:
        ``(features, knn_graphs, labels, obj_types)``; the same four objects are
        written to ``feat.json``, ``knn_graph.json``, ``label.json`` and ``obj_type.json``.

    Raises:
        ValueError: if ``dist_metric`` is neither ``'cosine'`` nor ``'l2'``.
    """
    if dist_metric not in ('cosine', 'l2'):
        raise ValueError(f"Unsupported dist_metric {dist_metric!r}; expected 'cosine' or 'l2'")

    features = []  # frame node features  [n_frames, n_nodes, f_dim]
    labels = []  # group index of each node within the same frame  [n_frames, n_nodes]
    # kNN graphs: for each node, up to k1+1 node indices sorted by ascending distance.
    # The node itself is among the candidates.  [n_frames, n_nodes, <=k1+1]
    knn_graphs = []
    obj_types = []  # indicating what object type the feature node belongs to (0-jk, 1-sdcl, 2-cap)  [n_frames, n_nodes]
    for data in tqdm(dataset):
        frame_f = []
        frame_grp_ids = []
        frame_graph = []
        frame_obj = []
        for grp_id, (jk_img, sdcl_img, cap_img, jk_bbox, sdcl_bbox, cap_bbox) in enumerate(zip(*data)):
            # Enumeration order fixes the object-type code: 0-jk, 1-sdcl, 2-cap.
            detections = ((jk_img, jk_bbox), (sdcl_img, sdcl_bbox), (cap_img, cap_bbox))
            for obj_type, (img, bbox) in enumerate(detections):
                if img is None:
                    continue
                with torch.no_grad():
                    node_feat = feat_agg(img.unsqueeze(0).to(device), bbox.unsqueeze(0).to(device)).cpu()
                frame_f.append(node_feat.squeeze(0).tolist())
                frame_grp_ids.append(grp_id)
                frame_obj.append(obj_type)

        # Build the candidate matrix once per frame instead of once per node.
        candidates = torch.tensor(frame_f, device=device)
        for i in range(len(frame_f)):
            node_f = candidates[i].unsqueeze(0)
            if dist_metric == 'cosine':
                node_dists = 1 - dist(node_f, candidates)
            else:
                node_dists = dist(node_f, candidates)
            # reshape(-1) always yields a 1-D vector. squeeze(-1) collapsed the
            # single-node cosine case to 0-d, which broke the list handling below.
            sorted_neighbors = node_dists.reshape(-1).sort(descending=False).indices.tolist()
            frame_graph.append(sorted_neighbors[:k1+1])

        features.append(frame_f)
        labels.append(frame_grp_ids)
        knn_graphs.append(frame_graph)
        obj_types.append(frame_obj)

    dst = f'/home/jayson/gcn_clustering/features/{dist_metric}_{feat_dim}_k{k1}/{data_type}'
    os.makedirs(dst, exist_ok=True)

    outputs = (
        ('feat.json', features),
        ('label.json', labels),
        ('knn_graph.json', knn_graphs),
        ('obj_type.json', obj_types),
    )
    for filename, payload in outputs:
        with open(os.path.join(dst, filename), 'w') as f:
            json.dump(payload, f)

    return features, knn_graphs, labels, obj_types

In [None]:
# Extract and persist graph data for every split. The function writes its
# results to disk, so the loop deliberately avoids echoing the (very large)
# return value of the last call into the cell output.
for split_set, split_name in ((train_set, 'train'), (val_set, 'val'), (test_set, 'test')):
    extract_save_graph_data(split_set, split_name)

  0%|          | 0/8016 [00:00<?, ?it/s]

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [35]:
# NOTE(review): in the visible cells `knn_graphs` is only assigned as a local inside
# extract_save_graph_data, so this cell presumably relies on leftover kernel state -- confirm.
knn_graphs

[[[0, 0, 1, 3, 7, 6, 5, 4, 2],
  [1, 0, 1, 11, 10, 9, 7, 4, 8],
  [2, 1, 0, 3, 7, 6, 4, 5, 2],
  [3, 2, 3, 1, 0, 7, 6, 5, 4],
  [4, 2, 3, 0, 1, 7, 6, 4, 5],
  [5, 3, 1, 0, 7, 6, 5, 4, 2],
  [6, 3, 0, 1, 7, 6, 4, 2, 5],
  [7, 4, 6, 5, 7, 0, 1, 3, 2],
  [8, 9, 8, 7, 6, 11, 10, 0, 1],
  [9, 5, 7, 6, 4, 1, 0, 3, 2],
  [10, 4, 6, 5, 7, 0, 1, 3, 2],
  [11, 5, 6, 7, 4, 1, 0, 3, 2],
  [12, 7, 6, 5, 0, 4, 1, 3, 2],
  [13, 1, 7, 6, 4, 5, 0, 3, 2]]]

## Feature Visualization

In [22]:
def get_features(dataset):
    """Embed one randomly chosen frame of ``dataset`` and build its kNN graph.

    Reads the notebook-level globals ``feat_agg``, ``device``, ``dist``,
    ``dist_metric`` and ``k1``.

    Args:
        dataset: Indexable collection whose items are shaped like
            ``CustomDataset.__getitem__`` output.

    Returns:
        ``(frame_f, frame_grp_ids, frame_graph, frame_obj)``: node features,
        the group index of each node, each node's up-to-``k1`` nearest other
        nodes (self excluded), and the object-type code of each node
        (0-jk, 1-sdcl, 2-cap).

    Raises:
        ValueError: if ``dist_metric`` is neither ``'cosine'`` nor ``'l2'``.
    """
    if dist_metric not in ('cosine', 'l2'):
        raise ValueError(f"Unsupported dist_metric {dist_metric!r}; expected 'cosine' or 'l2'")

    # Sample from the dataset that was passed in (previously the global train_set was used).
    idx = np.random.randint(len(dataset))
    data = dataset[idx]
    frame_f = []
    frame_grp_ids = []
    frame_graph = []
    frame_obj = []
    for grp_id, (jk_img, sdcl_img, cap_img, jk_bbox, sdcl_bbox, cap_bbox) in enumerate(zip(*data)):
        # Enumeration order fixes the object-type code: 0-jk, 1-sdcl, 2-cap.
        detections = ((jk_img, jk_bbox), (sdcl_img, sdcl_bbox), (cap_img, cap_bbox))
        for obj_type, (img, bbox) in enumerate(detections):
            if img is None:
                continue
            with torch.no_grad():
                node_feat = feat_agg(img.unsqueeze(0).to(device), bbox.unsqueeze(0).to(device)).cpu()
            frame_f.append(node_feat.squeeze(0).tolist())
            frame_grp_ids.append(grp_id)
            frame_obj.append(obj_type)

    # Build the candidate matrix once instead of once per node.
    candidates = torch.tensor(frame_f, device=device)
    for i in range(len(frame_f)):
        node_f = candidates[i].unsqueeze(0)
        if dist_metric == 'cosine':
            node_dists = 1 - dist(node_f, candidates)
        else:
            node_dists = dist(node_f, candidates)
        # Flatten before sorting: with keepdim=True the l2 distances have shape
        # (n, 1), and sorting along the last axis would return all-zero indices.
        sorted_neighbors = node_dists.reshape(-1).sort(descending=False).indices.tolist()
        top_k1_nn = [n for n in sorted_neighbors if n != i][:k1]
        frame_graph.append(top_k1_nn)

    return frame_f, frame_grp_ids, frame_graph, frame_obj

In [23]:
# Test set
test_img_dir = '/home/jayson/gcn_clustering/data/all_extracted/test'
test_img_paths = glob.glob('/home/jayson/gcn_clustering/data/all_extracted/test/*/*/*.jpg')
# Frame ids must be derived from the *test* paths (train_img_paths was used here
# before, pairing the test image directory with train frame ids).
# Sorted so the frame order is reproducible across runs.
test_frame_ids = sorted({p.split('/')[-3] for p in test_img_paths})
test_set = CustomDataset(test_img_dir, test_frame_ids, json_path, inp_size)

In [29]:
import plotly.express as px
from sklearn.decomposition import PCA

# Embed one randomly sampled frame of the test set.
frame_f, frame_grp_ids, frame_graph, frame_obj = get_features(test_set)

X = []
ids = []
# Preferred pivot node is index 15. The frame is sampled at random and may hold
# fewer nodes, so clamp to the last available node instead of raising IndexError.
pivot = min(15, len(frame_f) - 1)
pivot_obj = frame_obj[pivot]
X.append(frame_f[pivot])
ids.append(-1)  # the pivot is coloured with the sentinel id -1

# Compare the pivot against nodes of one randomly chosen *other* object type.
non_pivot_obj = np.random.choice([obj_code for obj_code in (0, 1, 2) if obj_code != pivot_obj])

non_pivot_i = np.where(np.array(frame_obj) == non_pivot_obj)[0]
X += [list(arr) for arr in np.array(frame_f)[non_pivot_i]]
ids += list(np.array(frame_grp_ids)[non_pivot_i])

print(f'Pivot grp id: {frame_grp_ids[pivot]}')

X = np.array(X)

# Project the pivot and the selected nodes onto their first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

print(pca.explained_variance_ratio_)

fig = px.scatter(components, x=0, y=1, color=ids)
fig.show()

Pivot grp id: 5
[0.9691971  0.02436394]


In [18]:
import plotly.express as px
from sklearn.decomposition import PCA
import chart_studio.plotly as py


# 3-D PCA scatter. `X` and `ids` are not defined in this cell; they must already
# exist in the kernel (the 2-D PCA cell assigns both).
X = np.array(X)

pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Share of variance captured by the three components, as a percentage.
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=ids,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()
# Disabled chart_studio call; it is the only use of the `py` import above.
# py.iplot(fig, filename='jupyter-parametric_plot')