<a href="https://colab.research.google.com/github/jo1jun/Human-Action-Recog-VIBE/blob/main/HMAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive Mount

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Prepare VIBE

In [None]:
# Clone the repo
# NOTE(review): re-running this cell in the same runtime clones again inside the
# current directory (VIBE/VIBE) because of the %cd below — run it once per runtime.
!git clone https://github.com/mkocabas/VIBE.git
# Later cells import from `lib.` and therefore rely on the working directory
# being the VIBE checkout.
%cd VIBE/
# Install the other requirements
# torch is pinned to 1.4.0; if another torch version was already loaded, the
# runtime has to be restarted for the pinned version to take effect.
!pip install torch==1.4.0 numpy==1.17.5
!pip install git+https://github.com/giacaglia/pytube.git --upgrade
!pip install -r requirements.txt
# Download pretrained weights and SMPL data
!source scripts/prepare_data.sh

**Caution**

The device must be `cuda` and the torch version must be `1.4.0`.

If the torch version is not 1.4.0, restart the runtime and re-run starting from the cell below.

In [None]:
import torch
# Pick the GPU when CUDA is available, otherwise fall back to the CPU, then
# report the selected device and the installed torch version.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device, torch.__version__, sep='\n')

# Custom Dataset & DataLoader

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
# from PIL import Image
import cv2
import json

# Custom dataset: one item per video (a sequence of frames plus annotations).
class UcfDataset(Dataset):
    """Video-level dataset built from a JSON index of frames and bounding boxes.

    The JSON index is expected to map ``action -> video_num -> {'images': [...],
    'bboxes': [...]}``. Videos whose entry is empty (not annotated yet) are skipped.
    Action labels are assigned by the order of the action keys in the JSON file
    (first key -> 0, second key -> 1, ...).

    Args:
        base_dir: Root directory; frames are read from
            ``base_dir/<action>/<video_num>/<image name>``.
        json_path: Path of the JSON index. Defaults to the location this
            notebook has always used, so existing calls keep working.
    """

    def __init__(self, base_dir,
                 json_path='/content/drive/MyDrive/HAT/walk_run/walk_run.json'):
        with open(json_path) as json_file:
            json_data = json.load(json_file)

        self.img_names = []    # per video: list of frame file paths
        self.annotations = []  # per video: the 'bboxes' entry from the index
        self.actions = []      # per video: integer action label

        # Label = position of the action key in the JSON file
        # (original note: 0 -> Run-Side, 1 -> Walk-Front).
        for action_type, action in enumerate(json_data):
            action_dir = os.path.join(base_dir, action)
            for video_num, video in json_data[action].items():
                # Videos that are not annotated yet have an empty entry; skip them.
                if len(video) == 0:
                    continue

                cur_dir = os.path.join(action_dir, video_num)
                self.img_names.append(
                    [os.path.join(cur_dir, f) for f in video['images']])
                self.annotations.append(video['bboxes'])
                self.actions.append(action_type)

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.img_names)

    def __getitem__(self, index):
        """Load every frame of one video.

        Returns:
            A tuple ``(images, bboxes, action)`` where ``images`` is a float32
            array of the stacked RGB frames, ``bboxes`` is the annotation list
            converted to an array and ``action`` is the integer label.

        Raises:
            FileNotFoundError: If a frame listed in the index cannot be read.
        """
        frames = []
        for image_name in self.img_names[index]:
            bgr = cv2.imread(image_name)
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail with the offending path instead of an opaque TypeError.
            if bgr is None:
                raise FileNotFoundError(f'Could not read image: {image_name}')
            # Reverse the channel axis: BGR (OpenCV order) -> RGB.
            frames.append(bgr[:, :, ::-1].astype(np.float32))

        images = np.stack(frames)

        return images, np.array(self.annotations[index]), self.actions[index]

In [None]:
from torch.utils.data import DataLoader
# Build the video-level dataset and iterate over it one video at a time
# (batch_size=1 is the DataLoader default, spelled out here), in random order.
dataset = UcfDataset(base_dir="/content/drive/MyDrive/HAT/walk_run")
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# Pretrained VIBE model

In [None]:
import os
os.environ['PYOPENGL_PLATFORM'] = 'egl'

import cv2
import time
import torch
import joblib
import shutil
import colorsys
import argparse
import numpy as np
import json
from tqdm import tqdm
from torch.utils.data import DataLoader

from lib.models.vibe import VIBE_Demo
from lib.utils.renderer import Renderer
from lib.dataset.inference import Inference
from lib.utils.smooth_pose import smooth_pose
from lib.data_utils.kp_utils import convert_kps
from lib.utils.pose_tracker import run_posetracker

from lib.utils.demo_utils import (
    download_youtube_clip,
    smplify_runner,
    convert_crop_coords_to_orig_img,
    convert_crop_cam_to_orig_img,
    prepare_rendering_results,
    video_to_images,
    images_to_video,
    download_ckpt,
)

# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# the minimum clip length carried over from the VIBE demo — confirm before relying on it.
MIN_NUM_FRAMES = 16

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# ========= Define VIBE model ========= #
model = VIBE_Demo(
    seqlen=16,
    n_layers=2,
    hidden_size=1024,
    add_linear=True,
    use_residual=True,
)

model = model.to(device)

# ========= Load pretrained weights ========= #
pretrained_file = download_ckpt(use_3dpw=False)
# map_location makes the checkpoint load onto whichever device was selected above,
# so the CPU fallback also works for a checkpoint that was saved from a GPU.
ckpt = torch.load(pretrained_file, map_location=device)
print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
ckpt = ckpt['gen_state_dict']
# strict=False: keys that do not match between checkpoint and model are ignored.
model.load_state_dict(ckpt, strict=False)
# VIBE is only used as a frozen feature extractor here, so keep it in eval mode.
model.eval()
print(f'Loaded pretrained weights from \"{pretrained_file}\"')

# Classifier (Prototype)

In [None]:
import torch.nn as nn

class Classifier(nn.Module):
  """Binary action classifier over a flattened window of per-frame parameters.

  The input is a 1-D (or batched) vector of length
  ``seqlen * pose_dim + seqlen * shape_dim``; the output is a sigmoid
  probability with a trailing dimension of size 1.

  Args:
    seqlen: Number of frames in the input window.
    pose_dim: Per-frame pose feature size (72 in this notebook).
    shape_dim: Per-frame shape feature size (10 in this notebook).
  """

  def __init__(self, seqlen = 16, pose_dim = 72, shape_dim = 10):
    super().__init__()
    self.fc1 = nn.Linear(seqlen * pose_dim + seqlen * shape_dim, 1024)
    self.fc2 = nn.Linear(1024, 1024)
    self.fc3 = nn.Linear(1024, 256)
    self.fc4 = nn.Linear(256, 1)
    # Without a non-linearity between them, the stacked Linear layers are
    # equivalent to one affine map. ReLU has no parameters, so the
    # state_dict keys are unchanged.
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, parameters):
    """Map the flattened parameter vector to a probability in [0, 1]."""
    x = self.relu(self.fc1(parameters))
    x = self.relu(self.fc2(x))
    x = self.relu(self.fc3(x))
    out = self.fc4(x)
    return self.sigmoid(out)

classifier = Classifier()

# Loss function

In [None]:
# Binary cross-entropy on probabilities (the classifier already applies a sigmoid);
# 'mean' is the default reduction, written out explicitly.
criterion = nn.BCELoss(reduction='mean')

# Hyperparameters

In [None]:
import torch.optim as optim

# Adam over the classifier weights only (VIBE stays frozen); lr=1e-3 is Adam's
# default learning rate, written out so it is visible as a tunable hyperparameter.
optimizer = optim.Adam(params=classifier.parameters(), lr=1e-3)

# Trainer (prototype)

In [None]:
from lib.data_utils.img_utils import get_single_image_crop_demo
from google.colab.patches import cv2_imshow

bbox_scale = 1.1
crop_size = 224
# Number of leading frames fed to the classifier (Classifier's default seqlen).
seq_len = 16

classifier = classifier.to(device)
classifier.train()

# One pass over the dataset, one video per step: crop every frame around its
# bounding box, run the frozen VIBE model to get per-frame parameters, then
# train the classifier on the first `seq_len` frames.
for i, (images, bboxes, action) in enumerate(dataloader):

  # The classifier input has a fixed size, so a clip with fewer than seq_len
  # frames cannot be used; skip it instead of crashing inside the first Linear layer.
  num_frames = images.shape[1]
  if num_frames < seq_len:
    print(f'skipping clip {i}: {num_frames} frames < {seq_len}')
    continue

  norm_imgs = []

  # squeeze(0) drops the batch dimension added by the DataLoader (batch size 1).
  for img, bbox in zip(images.squeeze(0), bboxes.squeeze(0)):

    norm_img, _, _ = get_single_image_crop_demo(
        img,
        bbox,
        None,
        bbox_scale,
        crop_size)

    norm_imgs.append(norm_img.unsqueeze(0))

  norm_imgs = torch.cat(norm_imgs)

  # VIBE is used as a fixed feature extractor: no gradients are needed for it.
  with torch.no_grad():

    norm_imgs = norm_imgs.unsqueeze(0)
    norm_imgs = norm_imgs.to(device)

    vibe_output = model(norm_imgs)[-1]

    # VIBE outputs (parameters) that serve as the classifier input.
    poses = vibe_output['theta'][:, :, 3:75].squeeze(0)
    betas = vibe_output['theta'][:, :, 75:].squeeze(0)
    joints3d = vibe_output['kp_3d'].squeeze(0)
    joints2d = vibe_output['kp_2d'].squeeze(0)

    # poses : [num_frames, 72]
    # betas.shape : [num_frames, 10]
    # joints3d : [num_frames, 49, 3]
    # joints2d : [num_frames, 49, 2]

    # Show the cropped & normalized first frame of the current video.
    cv2_imshow(np.array(norm_imgs[0,0].permute(1,2,0).cpu()))
    print('run' if action == 0 else 'walk')

  # TODO: sample sequences properly; for now only the first seq_len frames are used.
  parameters = torch.cat([poses[:seq_len].flatten(), betas[:seq_len].flatten()])

  # Clear gradients left over from the previous step; otherwise backward()
  # keeps adding to them and every update uses the sum over all past steps.
  optimizer.zero_grad()

  output = classifier(parameters)

  action = action.to(device)

  # Give prediction and target the same [1, 1] shape for BCELoss.
  action = action.unsqueeze(0)
  output = output.unsqueeze(0)

  print(action)
  print(output)

  loss = criterion(output, action.float())

  print(loss)

  loss.backward()

  optimizer.step()



In [None]:
# !python demo.py --vid_file /content/drive/MyDrive/HAT/walk_run/Run-Side/009/3687-17_70245.avi --output_folder output/