# 2-5. Pose Estimation, Action Recognition

In [None]:
# Install/upgrade the `mxnet-cu101` build of MXNet and GluonCV for this notebook.
# pip's stdout is redirected to /dev/null to keep the cell output short.
# NOTE(review): versions are not pinned, so a later re-run may pull different
# releases — consider pinning once a known-good pair is identified.
!pip3 install --upgrade mxnet-cu101 > /dev/null
!pip3 install --upgrade gluoncv > /dev/null

## Pose Estimation

### Pose Estimation Framework
<img src='https://res.infoq.com/articles/human-pose-estimation-ai-powered-fitness-apps/en/resources/30image001-1602703271382.jpg' width=100% />

- heatmap

<img src='https://res.infoq.com/articles/human-pose-estimation-ai-powered-fitness-apps/en/resources/25image003-1602703274021.jpg' />

### Simple Pose Estimation
- input size : 256*192

In [None]:
from matplotlib import pyplot as plt
from gluoncv import model_zoo, data, utils
from gluoncv.data.transforms.pose import detector_to_simple_pose, heatmap_to_coord

In [None]:
# Load two pretrained networks from the GluonCV model zoo: the
# 'yolo3_mobilenet1.0_coco' detector and the 'simple_pose_resnet18_v1b' pose network.
detector = model_zoo.get_model('yolo3_mobilenet1.0_coco', pretrained=True)
pose_net = model_zoo.get_model('simple_pose_resnet18_v1b', pretrained=True)

# Note that we can reset the classes of the detector to only include
# human, so that the NMS process is faster.

detector.reset_class(["person"], reuse_weights=['person'])

In [None]:
# Download the sample image to 'soccer.png' in the working directory.
im_fname = utils.download('https://github.com/dmlc/web-data/blob/master/' +
                          'gluoncv/pose/soccer.png?raw=true',
                          path='soccer.png')
# `x` is the pre-processed input fed to the detector; `img` is kept for the
# pose pre-processing and plotting cells below.
x, img = data.transforms.presets.ssd.load_test(im_fname, short=512)
print('Shape of pre-processed image:', x.shape)

# Run the person detector; the three outputs are consumed together below.
class_IDs, scores, bounding_boxs = detector(x)

In [None]:
# Convert the detector outputs into the pose-network input plus the boxes
# that are later passed to heatmap_to_coord.
pose_input, upscale_bbox = detector_to_simple_pose(img, class_IDs, scores, bounding_boxs)

In [None]:
# Run the pose network, then convert its heatmap output into keypoint
# coordinates and per-keypoint confidences using the boxes from the previous cell.
predicted_heatmap = pose_net(pose_input)
pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)

In [None]:
# Set the figure size BEFORE plotting: rcParams only apply to figures created
# afterwards, so setting it after plot_keypoints leaves this figure at the
# previous (default) size.
plt.rcParams["figure.figsize"] = (20,15)
# Draw detected boxes and predicted keypoints on the original image.
ax = utils.viz.plot_keypoints(img, pred_coords, confidence,
                              class_IDs, bounding_boxs, scores,
                              box_thresh=0.5, keypoint_thresh=0.2)
plt.show()

### AlphaPose Estimation
- input size : 320*256

In [None]:
from matplotlib import pyplot as plt
from gluoncv import model_zoo, data, utils
from gluoncv.data.transforms.pose import detector_to_alpha_pose, heatmap_to_coord_alpha_pose

In [None]:
# Load two pretrained networks from the GluonCV model zoo: the
# 'yolo3_mobilenet1.0_coco' detector and the 'alpha_pose_resnet101_v1b_coco' pose network.
detector = model_zoo.get_model('yolo3_mobilenet1.0_coco', pretrained=True)
pose_net = model_zoo.get_model('alpha_pose_resnet101_v1b_coco', pretrained=True)

# Note that we can reset the classes of the detector to only include
# human, so that the NMS process is faster.

detector.reset_class(["person"], reuse_weights=['person'])

In [None]:
# Download the sample image to 'soccer.png' in the working directory.
im_fname = utils.download('https://github.com/dmlc/web-data/blob/master/' +
                          'gluoncv/pose/soccer.png?raw=true',
                          path='soccer.png')
# `x` is the pre-processed input fed to the detector; `img` is kept for the
# pose pre-processing and plotting cells below.
x, img = data.transforms.presets.yolo.load_test(im_fname, short=512)
print('Shape of pre-processed image:', x.shape)

# Run the person detector; the three outputs are consumed together below.
class_IDs, scores, bounding_boxs = detector(x)

In [None]:
# Convert the detector outputs into the AlphaPose input plus the boxes that
# are later passed to heatmap_to_coord_alpha_pose.
pose_input, upscale_bbox = detector_to_alpha_pose(img, class_IDs, scores, bounding_boxs)


In [None]:
# Run the pose network, then convert its heatmap output into keypoint
# coordinates and per-keypoint confidences using the boxes from the previous cell.
predicted_heatmap = pose_net(pose_input)
pred_coords, confidence = heatmap_to_coord_alpha_pose(predicted_heatmap, upscale_bbox)

In [None]:
# Set the figure size BEFORE plotting: rcParams only apply to figures created
# afterwards, so this cell no longer depends on an earlier cell having set it.
plt.rcParams["figure.figsize"] = (20,15)
# Draw detected boxes and predicted keypoints on the original image.
ax = utils.viz.plot_keypoints(img, pred_coords, confidence,
                              class_IDs, bounding_boxs, scores,
                              box_thresh=0.5, keypoint_thresh=0.2)

plt.show()

## Action Recognition

### TSN (UCF101)
- Temporal Segment Networks

#### TSN Framework
<img src='https://blog.kakaocdn.net/dn/cPmbiR/btqI4UwbnXz/c6UOHFYP5ia2hdWbKH5lhk/img.png' width=100% />

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

In [None]:
# Download a single sample frame (ThrowDiscus.png) and read it with mxnet.image.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/ThrowDiscus.png'
im_fname = utils.download(url)

img = image.imread(im_fname)

# Convert to a NumPy array for matplotlib and display the raw frame.
plt.imshow(img.asnumpy())
plt.show()

In [None]:
# Pre-processing pipeline applied to a list of frames: center crop to 224,
# convert to tensor layout, then normalize with per-channel mean/std
# (presumably the ImageNet statistics — confirm against the model's training setup).
transform_fn = transforms.Compose([
    video.VideoCenterCrop(size=224),
    video.VideoToTensor(),
    video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [None]:
# The transform expects a list of frames, so wrap the single image in a list.
img_list = transform_fn([img.asnumpy()])
# Move axis 0 to the end for display (assumes the transformed frame is
# channel-first — TODO confirm). Values are normalized, so colors look off.
plt.imshow(np.transpose(img_list[0], (1,2,0)))
plt.show()

In [None]:
# Load the pretrained 'vgg16_ucf101' action-recognition model with 101 output classes.
net = get_model('vgg16_ucf101', nclass=101, pretrained=True)

#### TSN Layer

In [None]:
# Show the layer structure of the network. The original `net.summary` only
# referenced the method without calling it (summary() needs an input batch);
# printing the block lists its layers without requiring a forward pass.
print(net)

In [None]:
# Forward the single transformed frame; expand_dims adds the leading batch axis.
pred = net(nd.array(img_list[0]).expand_dims(axis=0))

classes = net.classes
topK = 5
# Indices of the topK highest-scoring classes for the (single) batch item.
ind = nd.topk(pred, k=topK)[0].astype('int')
# Softmax does not change inside the loop, so compute it once.
probs = nd.softmax(pred)[0]
print('The input video frame is classified to be')
for i in range(topK):
    print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))

In [None]:
from gluoncv.utils import try_import_cv2
cv2 = try_import_cv2()

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/v_Basketball_g01_c01.avi'
video_fname = utils.download(url)

# Keep every 25th decoded frame of the video.
cap = cv2.VideoCapture(video_fname)
cnt = 0
video_frames = []
try:
    while cap.isOpened():
        ret, frame = cap.read()
        cnt += 1
        if not ret:
            break
        if cnt % 25 == 0:
            # OpenCV decodes frames as BGR, while the normalization in
            # transform_fn is given in RGB channel order, so convert first.
            video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
finally:
    # Always free the decoder handle, even if reading raises.
    cap.release()
print('We evenly extract %d frames from the video %s.' % (len(video_frames), video_fname))

In [None]:
# Classify the video by averaging the per-frame network outputs.
# Skipped entirely when no frame could be extracted.
if video_frames:
    video_frames_transformed = transform_fn(video_frames)
    final_pred = 0
    for frame_img in video_frames_transformed:
        # Each frame is forwarded on its own with a leading batch axis.
        pred = net(nd.array(frame_img).expand_dims(axis=0))
        final_pred += pred
    final_pred /= len(video_frames)

    classes = net.classes
    topK = 5
    ind = nd.topk(final_pred, k=topK)[0].astype('int')
    # Softmax does not change inside the loop, so compute it once.
    probs = nd.softmax(final_pred)[0]
    print('The input video is classified to be')
    for i in range(topK):
        print('\t[%s], with probability %.3f.'%
              (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))

### I3D (Kinetics400)

#### I3D Framework
- Inflated 3D ConvNet
<img src='https://www.researchgate.net/profile/Jamil_Ahmad13/publication/321352236/figure/fig6/AS:668725271351305@1536447937705/Framework-of-the-proposed-DB-LSTM-for-action-recognition-action-recognition-The-output.png' width=100% />

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

In [None]:
from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

# Fetch the sample clip and open it with decord.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)

# Sample every second frame among the first 64 (32 frames in total).
frame_id_list = range(0, 64, 2)
video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the batch along its first axis into one array per sampled frame.
clip_input = [video_data[pos] for pos in range(len(frame_id_list))]

In [None]:
# Validation-time transform applied to the whole list of frames.
transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
clip_input = transform_fn(clip_input)
clip_input = np.stack(clip_input, axis=0)
# Derive the clip length from the sampled frame ids instead of hard-coding 32,
# so changing the sampling in the previous cell does not break the reshape.
clip_input = clip_input.reshape((-1, len(frame_id_list), 3, 224, 224))
# Swap axes 1 and 2 (frames <-> channels) into the order passed to the network.
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
print('Video data is downloaded and preprocessed.')

In [None]:
# Load the pretrained I3D model with 400 output classes.
model_name = 'i3d_inceptionv1_kinetics400'
net = get_model(model_name, nclass=400, pretrained=True)
print('%s model is successfully loaded.' % model_name)

#### I3D layer

In [None]:
# Show the layer structure of the network. The original `net.summary` only
# referenced the method without calling it (summary() needs an input batch);
# printing the block lists its layers without requiring a forward pass.
print(net)

In [None]:
# Forward the preprocessed clip through the network.
pred = net(nd.array(clip_input))

classes = net.classes
topK = 5
# Indices of the topK highest-scoring classes for the (single) batch item.
ind = nd.topk(pred, k=topK)[0].astype('int')
# Softmax does not change inside the loop, so compute it once.
probs = nd.softmax(pred)[0]
print('The input video clip is classified to be')
for i in range(topK):
    print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))

### SlowFast (Kinetics400)

#### SlowFast Framework
<img src='https://miro.medium.com/max/700/0*WLTSCRGi1DNfqyyi.png' width=100% />

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

In [None]:
from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

# Fetch the sample clip and open it with decord.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)

# Two samplings of the first 64 frames: a dense one (every 2nd frame) and a
# sparse one (every 16th frame); the dense ids come first, then the sparse ids.
fast_frame_id_list = range(0, 64, 2)
slow_frame_id_list = range(0, 64, 16)
frame_id_list = [*fast_frame_id_list, *slow_frame_id_list]
video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the batch along its first axis into one array per sampled frame.
clip_input = [video_data[pos] for pos in range(len(frame_id_list))]

In [None]:
# Validation-time transform applied to the whole list of frames.
transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
clip_input = transform_fn(clip_input)
clip_input = np.stack(clip_input, axis=0)
# Derive the clip length (fast + slow frames) from the sampled frame ids
# instead of hard-coding 36, so changing the sampling does not break the reshape.
clip_input = clip_input.reshape((-1, len(frame_id_list), 3, 224, 224))
# Swap axes 1 and 2 (frames <-> channels) into the order passed to the network.
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
print('Video data is downloaded and preprocessed.')

In [None]:
# Load the pretrained SlowFast model with 400 output classes.
model_name = 'slowfast_4x16_resnet50_kinetics400'
net = get_model(model_name, nclass=400, pretrained=True)
print('%s model is successfully loaded.' % model_name)

#### slowfast Layer

In [None]:
# Show the layer structure of the network. The original `net.summary` only
# referenced the method without calling it (summary() needs an input batch);
# printing the block lists its layers without requiring a forward pass.
print(net)

In [None]:
# Forward the preprocessed clip through the network.
pred = net(nd.array(clip_input))

classes = net.classes
topK = 5
# Indices of the topK highest-scoring classes for the (single) batch item.
ind = nd.topk(pred, k=topK)[0].astype('int')
# Softmax does not change inside the loop, so compute it once.
probs = nd.softmax(pred)[0]
print('The input video clip is classified to be')
for i in range(topK):
    print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))

## Object Tracking

### SiamRPN

In [None]:
import os
import argparse
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from gluoncv import model_zoo, utils
from gluoncv.model_zoo.siamrpn.siamrpn_tracker import SiamRPNTracker as build_tracker
from gluoncv.model_zoo.siamrpn.siamrpn_tracker import get_axis_aligned_bbox
from gluoncv.utils.filesystem import try_import_cv2
cv2 = try_import_cv2()

In [None]:
from gluoncv import utils
# Download the tracking demo video.
# NOTE(review): `im_video` is not read by any later cell shown here —
# read_data() downloads the video itself from `video_path`.
video_path = 'https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/tracking/Coke.mp4'
im_video = utils.download(video_path)
# Initial target box for the first frame; the same value is assigned again
# right before inference() is called. Presumably [x, y, w, h] — TODO confirm.
gt_bbox = [298, 160, 48, 80]

In [None]:
def read_data(video_path):
    """Download a video and decode all of its frames.

    Args:
        video_path: URL of the video; it is fetched with ``utils.download``.

    Returns:
        list: the decoded frames, in order, exactly as returned by
        ``cv2.VideoCapture.read``. Empty if no frame could be read.
    """
    im_video = utils.download(video_path)
    cap = cv2.VideoCapture(im_video)
    video_frames = []
    try:
        while True:
            ret, img = cap.read()
            if not ret:
                # End of stream (or the file could not be decoded).
                break
            video_frames.append(img)
    finally:
        # The original never released the capture, leaking the decoder handle.
        cap.release()

    return video_frames

In [None]:
def inference(video_frames, tracker, gt_bbox, save_dir):
    """Track one target through ``video_frames`` and save annotated frames.

    The tracker is initialised on the first frame from ``gt_bbox`` and asked
    to track on every following frame. For each frame a copy with the
    predicted box drawn on it is written to ``save_dir`` as ``0001.jpg``,
    ``0002.jpg``, ... The input frames themselves are left untouched, so the
    function can be re-run on the same list.

    Args:
        video_frames: sequence of image arrays (e.g. the output of ``read_data``).
        tracker: object providing ``init(frame, bbox, ctx=...)`` and
            ``track(frame, ctx=...)``; ``track`` must return a mapping with a
            ``'bbox'`` entry.
        gt_bbox: initial target box for the first frame; values are cast to int.
            Presumably ``[x, y, w, h]`` — confirm against get_axis_aligned_bbox.
        save_dir: output directory; created if it does not exist.

    Returns:
        None. Results are written to ``save_dir``.
    """
    gt_bbox = list(map(int, gt_bbox))
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)
    for ind, frame in enumerate(video_frames):
        if ind == 0:
            # Convert the initial box to the [left, top, w, h] form given to the tracker.
            cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
            pred_bbox = [cx-(w-1)/2, cy-(h-1)/2, w, h]
            tracker.init(frame, pred_bbox, ctx=mx.cpu())
        else:
            pred_bbox = tracker.track(frame, ctx=mx.cpu())['bbox']
        pred_bbox = list(map(int, pred_bbox))
        # Draw on a copy: cv2.rectangle modifies its image argument in place,
        # which would otherwise leave rectangles on the caller's frames.
        annotated = frame.copy()
        cv2.rectangle(annotated, (pred_bbox[0], pred_bbox[1]),
                      (pred_bbox[0]+pred_bbox[2], pred_bbox[1]+pred_bbox[3]),
                      (0, 255, 255), 3)
        cv2.imwrite(os.path.join(save_dir, '%04d.jpg'%(ind+1)), annotated)

In [None]:
# Load the pretrained 'siamrpn_alexnet_v2_otb15' model on the CPU context.
net = model_zoo.get_model('siamrpn_alexnet_v2_otb15', ctx=mx.cpu(), pretrained=True)


In [None]:
# Wrap the network in a SiamRPNTracker (imported above under the name build_tracker).
tracker = build_tracker(net)


In [None]:
# URL of the demo video passed to read_data() below.
video_path = 'https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/tracking/Coke.mp4'

In [None]:
# Download the video and decode every frame into a list.
video_frames = read_data(video_path)


In [None]:
# Show the first frame. The channel axis is reordered with [2, 1, 0]
# (BGR -> RGB) so matplotlib displays the colors correctly.
plt.imshow(video_frames[0][:,:,[2,1,0]])
plt.show()


In [None]:
# Initial target box for the first frame (presumably [x, y, w, h] — TODO confirm)
# and the directory where the annotated frames will be written.
gt_bbox = [298, 160, 48, 80]
save_dir = './predictions'

In [None]:
# Run the tracker over all frames; annotated frames are saved as JPEGs in save_dir.
inference(video_frames, tracker, gt_bbox, save_dir) 

## 저장된 이미지 확인

In [None]:
import os

In [None]:
# Collect the names of everything in save_dir.
# NOTE(review): this also picks up any non-image file that happens to be there.
file_list = os.listdir(save_dir)

In [None]:
# os.listdir gives no ordering guarantee; the zero-padded names ('0001.jpg', ...)
# sort lexicographically into frame order.
file_list.sort()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import imageio
import os

In [None]:
def img_file_to_gif(img_root, img_files, output_file_name):
    """Combine image files into an animated GIF.

    Args:
        img_root: directory that contains the image files.
        img_files: file names inside ``img_root``; frames are written in the
            order given, so pass them already sorted.
        output_file_name: path of the GIF file to create.
    """
    # os.path.join builds the path portably instead of concatenating with '/'.
    imgs_array = [np.array(imageio.imread(os.path.join(img_root, img_file)))
                  for img_file in img_files]
    imageio.mimsave(output_file_name, imgs_array, format='GIF', fps=24)

In [None]:
# Build the GIF from the sorted, annotated frames written by inference().
img_file_to_gif(save_dir, file_list, "coke_tracking.gif")


- Downsize (optimize) the generated GIF with gifsicle

In [None]:
# Install the gifsicle system package (presumably required by pygifsicle,
# which is installed in the next cell — confirm in the pygifsicle docs).
!apt-get install -y gifsicle

In [None]:
# Install the pygifsicle Python package used below for optimize().
# NOTE(review): version is not pinned.
!pip3 install pygifsicle

In [None]:
from pygifsicle import optimize


In [None]:
# Optimize the GIF; only a source path is given, so the file is presumably
# overwritten in place — confirm in the pygifsicle docs.
optimize("./coke_tracking.gif")

![coke](coke_tracking.gif)