# 3-5. Action Recognition

## DataSet

### 1. [Kinetics400/700 Dataset](https://deepmind.com/research/open-source/kinetics)
- 인간-물체 상호작용
- 인간-인간 상호작용
- 650,000 개의 비디오 클립을 제공

<img src='https://miro.medium.com/max/2400/1*k3kk3deV6tFY4lopJ7alJA.png' >


### 2. [UCF101 Dataset](https://www.crcv.ucf.edu/data/UCF101.php)
- YouTube에서 수집한 실제 액션 동영상
  - 인간-물체 상호작용
  - 신체-동작
  - 인간-인간 상호작용
  - 악기 연주
  - 스포츠
- 101개의 카테고리, 13,320 개의 비디오

<img src='https://www.researchgate.net/profile/Khurram_Soomro/publication/233815759/figure/fig1/AS:669565927297037@1536648365593/Sample-frames-for-6-action-classes-of-UCF101.png' width=100%>



### 3. [HMDB51 Dataset](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/)
- 주로 영화에서 수집
- HMDB, Prelinger 아카이브, Youtube, Google 비디오 등 공개 데이터베이스 수집
  - 사람 얼굴 행동
  - 물체를 통한 얼굴 행동
  - 신체움직임
  - 물체 - 사람 상호작용
  - 사람 - 사람 상호작용

<img src='https://www.researchgate.net/publication/290181771/figure/fig4/AS:318089251049478@1452849795740/Examples-from-HMDB51-Kuehne-et-al-2011-dataset-for-a-few-of-51-classes.png' width=100%>

### 4. [The 20BN-something-something-V2 Dataset](https://20bn.com/datasets/something-something)
- 인간이 일상적인 사물로 미리 정의된 기본 동작을 수행하는 행위 정의
  - 220,847개의 영상
  - 무언가를 무언가에 넣기 와 같은 물체-물체 주석 포함
  - 영상의 행위를 설명하는 용도로 사용가능

<img src='https://miro.medium.com/max/600/0*udOMDyKgeiWfN8Aa.' width=100%>

## 라이브러리

In [None]:
!pip3 install --upgrade mxnet-cu101 > /dev/null
!pip3 install --upgrade gluoncv > /dev/null

In [None]:
!pip3 install --upgrade youtube-dl > /dev/null

In [None]:
def get_youtube_video(youtube_url):
  """Download a YouTube video with youtube-dl and return its local file name.

  Args:
    youtube_url: URL of the video to download.

  Returns:
    The file name youtube-dl prepared for the downloaded video.
  """
  from youtube_dl import YoutubeDL

  # Format selector passed to youtube-dl: best MP4 stream with height <= 480.
  options = dict(format='best[height<=480][ext=mp4]')
  with YoutubeDL(options) as downloader:
      video_info = downloader.extract_info(youtube_url, download=True)
      return downloader.prepare_filename(video_info)

## i3d action recognition example

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model


In [None]:
from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

In [None]:
# Download a sample clip and open it with decord.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)
# Every 2nd frame index in [0, 64) -> 32 frame indices.
frame_id_list = range(0, 64, 2)
video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the decoded batch into a list with one array per sampled frame.
clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]

In [None]:
# Validation-time transform applied to the whole list of frames
# (size 224, with the given per-channel mean/std).
transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
clip_input = transform_fn(clip_input)
clip_input = np.stack(clip_input, axis=0)
# Group the frames into clips of 32 frames: (-1, 32, 3, 224, 224) ...
clip_input = clip_input.reshape((-1,) + (32, 3, 224, 224))
# ... then swap axes 1 and 2 so the layout becomes (-1, 3, 32, 224, 224).
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
print('Video data is downloaded and preprocessed.')

In [None]:
# Load the pretrained model used by every recognition cell below (400 classes).
model_name = 'i3d_nl10_resnet101_v1_kinetics400'
net = get_model(model_name, nclass=400, pretrained=True)

# Alternative pretrained models for the other datasets, kept for reference.
# model_i3d_ucf101 = 'i3d_resnet50_v1_ucf101'
# net_i3d_ucf101 = get_model(model_i3d_ucf101, nclass=101, pretrained=True)

# model_i3d_hmdb51 = 'i3d_resnet50_v1_hmdb51'
# net_i3d_hmdb51 = get_model(model_i3d_hmdb51, nclass=51, pretrained=True)

# model_i3d_sthsthv2 = 'i3d_resnet50_v1_sthsthv2'
# net_i3d_sthsthv2 = get_model(model_i3d_sthsthv2, nclass=174, pretrained=True)
print('%s model is successfully loaded.' % model_name)

In [None]:
# Run the network on the preprocessed clip and print the top-K classes.
pred = net(nd.array(clip_input))

classes = net.classes
topK = 5
# Indices of the K highest scores for the first (only) clip in the batch.
ind = nd.topk(pred, k=topK)[0].astype('int')
# The softmax does not depend on the loop variable: compute it once
# instead of once per printed class.
probs = nd.softmax(pred)[0]
print('The input video clip is classified to be')
for i in range(topK):
    print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))

### 영상 이미지로 저장

In [None]:
def read_video(filename, output_path, start_ms = 0, end_ms = None, step=1):
  """Dump frames of a video file into ``output_path`` as numbered JPEG files.

  Args:
    filename: path of the video, opened with ``cv2.VideoCapture``.
    output_path: directory that receives the images; created when missing.
    start_ms: position in milliseconds the capture is moved to before reading.
    end_ms: stop as soon as the capture position reaches this many
      milliseconds; ``None`` reads until no more frames are returned.
    step: save one frame, then skip ahead so that every ``step``-th frame
      is written.

  Returns:
    Tuple ``(width, height, fps, frame_count)`` as reported by the capture.
  """
  from tqdm.notebook import tqdm
  import os
  from gluoncv.utils.filesystem import try_import_cv2
  cv2 = try_import_cv2()

  os.makedirs(output_path, exist_ok=True)

  cap = cv2.VideoCapture(filename)
  try:
    cap.set(cv2.CAP_PROP_POS_MSEC, start_ms)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Duration is frames / fps (not frames * fps); guard against a capture
    # that reports fps == 0.
    total_ms = frame_count / fps * 1000 if fps else 0
    print('Frame width:', width)
    print('Frame height:', height)
    print('Frame count:', frame_count)
    print('FPS:', fps)
    print('total sec:', int(total_ms/1000))

    capture_count = 0

    # Expected number of frames, used only as the progress-bar total.
    if end_ms is not None:
      time_gap_sec = (end_ms - start_ms)/1000
      pbar_count = round(time_gap_sec*fps)
    else:
      pbar_count = round(frame_count)

    with tqdm(total=pbar_count) as pbar:
      while cap.isOpened():
        ret, img = cap.read()
        if not ret:
          break
        # Position properties are queried after the read; the frame number
        # obtained here is what names the output file.
        # NOTE(review): OpenCV presumably reports the index of the *next*
        # frame here, so names start at 00001 - read_image_batch relies on
        # that +1 offset; confirm before changing.
        frame_no = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        frame_msec = int(cap.get(cv2.CAP_PROP_POS_MSEC))

        if end_ms is not None and frame_msec >= end_ms:
          break

        cv2.imwrite(os.path.join(output_path, '%05d.jpg'%(frame_no)), img)
        capture_count += 1
        pbar.update(step)
        if step > 1:
          # Jump ahead so that only every step-th frame is decoded and saved.
          cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no + (step-1))
  finally:
    # Release the capture on every exit path (end of stream, end_ms reached,
    # or an exception), not only when the stream runs out.
    cap.release()

  print('capture_count:', capture_count)

  return width, height, fps, frame_count

## 학습데이터 분석

In [None]:
from gluoncv import utils
video_path = 'http://crcv.ucf.edu/THUMOS14/UCF101/UCF101/v_GolfSwing_g01_c01.avi'
train_data_swing = utils.download(video_path)

In [None]:
width, height, fps, frame_count = read_video(train_data_swing, './train_data_swing')

## 영상 준비

In [None]:
kpga = get_youtube_video('https://www.youtube.com/watch?v=f3KhsQq7VDw')
print(kpga)

In [None]:
# width, height, fps, frame_count = read_video(kpga, './kpga')

### 영상 분석 코드


In [None]:
# Create the decord video reader for the downloaded video.
vr = decord.VideoReader(kpga)

In [None]:
# Number of frames exposed by the video reader.
video_length = len(vr)
print('frame length:{}'.format(video_length))

In [None]:
# Analyse the video in windows of 64 frames.
window_size = 64
image_size = 64

In [None]:
from gluoncv.data.transforms import video
transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

In [None]:
print('total frame count : {}'.format(len(vr)))
print('total recognition step : {}'.format(round(len(vr)/window_size)))

In [None]:
def action_recognition_and_print(start_index, clip_input, topK=5):
  """Classify one preprocessed clip with the global ``net`` and print the top-K classes.

  Args:
    start_index: label printed in the header line to identify the clip.
    clip_input: preprocessed clip data, passed to ``nd.array`` and then to ``net``.
    topK: number of highest-scoring classes to print.
  """
  pred = net(nd.array(clip_input))

  classes = net.classes
  ind = nd.topk(pred, k=topK)[0].astype('int')
  # Loop-invariant: compute the softmax once instead of once per class.
  probs = nd.softmax(pred)[0]
  print('{} clip is classified to be'.format(start_index))
  for i in range(topK):
      print('\t[%s], with probability %.3f.'%
            (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))



In [None]:
# for start_index in range(0, video_length, window_size): 
#   # image_size 64개 확보가 되지않는경우(영상 종료시점 체크)
#   if video_length - window_size < image_size:
#     break
  
#   # 영상분석 이미지 추출
#   frame_id_list = range(0+start_index, 64+start_index, 2)
#   video_data = vr.get_batch(frame_id_list).asnumpy()
#   clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]
  
#   # 이미지 보정
#   clip_input = transform_fn(clip_input)
#   clip_input = np.stack(clip_input, axis=0)
#   clip_input = clip_input.reshape((-1,) + (32, 3, 224, 224))
#   clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

#   # 영상 분석
#   action_recognition_and_print(start_index, clip_input)

### 사람이 있는 영상만 분석
- 64개의 이미지 묶음 중 처음과 중간 이미지를 분석하여
  - clip_input[0], clip_input[image_size/2]
- 사람이 detect되는 이미지 묶음만 처리하자

### person detect function

In [None]:
# Reuse the detector code from the pose estimation section.
detector = get_model('yolo3_mobilenet1.0_coco', pretrained=True) 
# Restrict the detector to the single class "person", reusing its pretrained weights.
detector.reset_class(["person"], reuse_weights=['person'])

In [None]:
def person_detect(im_fname):
  """Tell whether the global ``detector`` finds a person in an image file.

  The detector is reset to the single class ``"person"`` at module level,
  so class id 0 is the person class.

  Args:
    im_fname: path of the image file to analyse.

  Returns:
    True when at least one candidate box is labelled with class id 0,
    False otherwise.
  """
  from gluoncv import data

  x, _ = data.transforms.presets.yolo.load_test(im_fname)

  class_IDs, _scores, _bounding_boxs = detector(x)

  # Copy the ids of the first (only) image to host memory once, instead of
  # one asscalar() round trip per candidate box.
  # NOTE(review): no score threshold is applied - any candidate labelled
  # class 0 counts as a detection; confirm that is intended.
  return bool((class_IDs[0].asnumpy() == 0).any())
  

In [None]:
import os

In [None]:
image_path = './kpga'

In [None]:
file_list = os.listdir(image_path)
file_list.sort()

In [None]:
print(video_length)

In [None]:
def read_image_batch(frame_id_list):
  """Load the saved frame images for the given frame indices.

  Frame ``index`` is read from ``<image_path>/<index+1, 5 digits>.jpg``.

  Args:
    frame_id_list: iterable of zero-based frame indices.

  Returns:
    List with, per index, the second value returned by the YOLO
    ``load_test`` preset for that file.
  """
  from gluoncv import data

  load_test = data.transforms.presets.yolo.load_test

  def _load(frame_index):
    # File names are offset by one relative to the frame index.
    path = os.path.join(image_path, '%05d.jpg'%(frame_index+1))
    return load_test(path)[1]

  return [_load(frame_index) for frame_index in frame_id_list]

In [None]:
def action_recognition(clip_input, topK=5):
  """Classify one preprocessed clip with the global ``net``.

  Args:
    clip_input: preprocessed clip data, passed to ``nd.array`` and then to ``net``.
    topK: number of highest-scoring classes to report.

  Returns:
    List of ``topK`` strings formatted as ``'<class name>, <probability>.'``,
    in the order returned by ``nd.topk``.
  """
  pred = net(nd.array(clip_input))

  classes = net.classes
  ind = nd.topk(pred, k=topK)[0].astype('int')
  # Loop-invariant: compute the softmax once instead of once per class.
  probs = nd.softmax(pred)[0]

  return ['%s, %.3f.'%(classes[ind[i].asscalar()], probs[ind[i]].asscalar())
          for i in range(topK)]

In [None]:
# Run action recognition on every 64-frame window that contains a person.
# NOTE(review): assumes file_list holds only the frame images written by
# read_video (00001.jpg, 00002.jpg, ...) - confirm for the target folder.
for start_index in range(0, len(file_list), window_size):
  # End-of-video check: stop once fewer than window_size frames remain.
  # (The previous condition did not depend on start_index, so the last
  # partial window was only stopped by a file-not-found exception.)
  if start_index + window_size > len(file_list):
    break

  try:
    # Person detection on the first and the middle frame of the window.
    # Done before loading the frames so windows without a person cost only
    # the detector calls.
    filename1 = os.path.join(image_path, '%05d.jpg'%(start_index+1))
    filename2 = os.path.join(image_path, '%05d.jpg'%(start_index + 32 +1))
    if not (person_detect(filename1) or person_detect(filename2)):
      continue

    # Load every 2nd frame of the window (32 frames).
    frame_id_list = range(0+start_index, 64+start_index, 2)
    org_image_list = read_image_batch(frame_id_list)
    video_data = np.array(org_image_list)
    clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]

    # Preprocess: transform, stack, group into clips of 32 frames and swap
    # axes 1 and 2.
    clip_input = transform_fn(clip_input)
    clip_input = np.stack(clip_input, axis=0)
    clip_input = clip_input.reshape((-1,) + (32, 3, 224, 224))
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    # Classify the clip and print the results.
    action_result = action_recognition(clip_input)
    print(start_index)
    for result_ in action_result:
      print(result_)

  except Exception as ex:
    # Best effort: report the failing window and keep going.
    print(ex)
    continue

### 이미지에 text write

In [None]:
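# frame : image to draw on
# str : text string
# (x, y) : coordinates where the text is drawn
# cv2.FONT_HERSHEY_SCRIPT_SIMPLEX : font face
# 1 : font scale
# (0, 255, 0) : color, interpreted in the image's channel order (BGR for OpenCV images, not r,g,b)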

# cv2.putText(img, '{:s} {:s}'.format(class_name, score),
#                         (xmin, y), cv2.FONT_HERSHEY_SIMPLEX, min(scale/2, 2),
#                         bcolor)

In [None]:
from gluoncv.utils.filesystem import try_import_cv2
cv2 = try_import_cv2()

In [None]:
# !rm -rf kpga_detect*

In [None]:
output_path = './kpga_detect'
if not os.path.exists(output_path):
      os.makedirs(output_path)

In [None]:
# Run action recognition on every 64-frame window that contains a person and
# save the first frame of the window with the results drawn on it.
# NOTE(review): assumes file_list holds only the frame images written by
# read_video (00001.jpg, 00002.jpg, ...) - confirm for the target folder.
for start_index in range(0, len(file_list), window_size):
  # End-of-video check: stop once fewer than window_size frames remain.
  # (The previous condition did not depend on start_index, so the last
  # partial window was only stopped by a file-not-found exception.)
  if start_index + window_size > len(file_list):
    break

  try:
    # Person detection on the first and the middle frame of the window.
    # Done before loading the frames so windows without a person cost only
    # the detector calls.
    filename1 = os.path.join(image_path, '%05d.jpg'%(start_index+1))
    filename2 = os.path.join(image_path, '%05d.jpg'%(start_index + 32 +1))
    if not (person_detect(filename1) or person_detect(filename2)):
      continue

    # Load every 2nd frame of the window (32 frames).
    frame_id_list = range(0+start_index, 64+start_index, 2)
    org_image_list = read_image_batch(frame_id_list)
    video_data = np.array(org_image_list)
    clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]

    # Preprocess: transform, stack, group into clips of 32 frames and swap
    # axes 1 and 2.
    clip_input = transform_fn(clip_input)
    clip_input = np.stack(clip_input, axis=0)
    clip_input = clip_input.reshape((-1,) + (32, 3, 224, 224))
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    # Classify the clip.
    action_result = action_recognition(clip_input)

    # Swap the R and B channels of the first frame before drawing/saving.
    # NOTE(review): presumably the loaded frames are RGB and cv2.imwrite
    # expects BGR - confirm.
    output_img = org_image_list[0]
    output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2RGB)

    # Window start index in the top-left corner.
    print(start_index)
    cv2.putText(output_img, '%05d'%(start_index),
                    (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (255,0,255) , 1, cv2.LINE_AA)

    # One text line per result, 25 px apart.
    for result_idx, result_ in enumerate(action_result):
      print(result_)
      cv2.putText(output_img, result_,
                        (50, 80 + result_idx*25), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                        (255,0,0) , 1, cv2.LINE_AA)

    cv2.imwrite(os.path.join(output_path, '%05d.jpg'%(start_index)), output_img)

  except Exception as ex:
    # Best effort: report the failing window and keep going.
    print(ex)
    continue

In [None]:
def _progress(iterable):
  """Wrap ``iterable`` in a tqdm notebook progress bar when one can be created.

  Falls back to the plain iterable when creating the bar raises
  ImportError (e.g. tqdm is not installed).
  """
  try:
    from tqdm.notebook import tqdm
    return tqdm(iterable)
  except ImportError:
    return iterable


def compress_folder(input_path):
  """Archive every file below ``input_path`` into ``<input_path>.zip``.

  Files are stored under their path relative to ``input_path``, so files
  with the same name in different sub-folders no longer collide inside the
  archive. Files directly inside ``input_path`` are still stored under
  their bare file name.

  Args:
    input_path: directory to archive; the zip file is written next to it.
  """
  import os
  from zipfile import ZipFile

  with ZipFile('{}.zip'.format(input_path), 'w') as zipObj:
    # Walk the whole tree, one progress bar per directory.
    for folderName, _subfolders, filenames in os.walk(input_path):
        for filename in _progress(filenames):
            filePath = os.path.join(folderName, filename)
            # Archive name relative to the root keeps sub-folder structure.
            zipObj.write(filePath, os.path.relpath(filePath, input_path))

In [None]:
compress_folder('kpga_detect')

# 실습

## 1. 모든 이미지를 저장한다.
- 같은 묶음의 이미지에는 동일한 detect 결과를 출력한다

## 2. 저장된 이미지를 이용하여 동영상을 생성한다