# Tutorial

Run this notebook in Google Colab.

## Install MMAction2

In [None]:
# Log in to Weights & Biases so training metrics can be uploaded.
# NOTE(review): this was originally marked "not required", but the training
# config further down registers a WandbLoggerHook — confirm whether the
# login can really be skipped before removing this cell.
import wandb
wandb.login()

In [None]:
# install dependencies: (use cu111 because colab has CUDA 11.1)
%pip install -qqq torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

# install mmcv-full thus we could use CUDA operators
%pip install -qqq mmcv-full==1.5.0 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html

# Install mmaction2
# !rm -rf mmaction2
!git clone -q https://github.com/open-mmlab/mmaction2.git
%cd mmaction2

%pip install -qqq -e .

# Install some optional requirements
%pip install -qqq -r requirements/optional.txt

Found existing installation: mmcv-full 1.5.0
Uninstalling mmcv-full-1.5.0:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/mmcv/*
    /usr/local/lib/python3.7/dist-packages/mmcv_full-1.5.0.dist-info/*
Proceed (y/n)? y
  Successfully uninstalled mmcv-full-1.5.0
/content/mmaction2/mmaction2/mmaction2


In [None]:
# PyTorch: print the installed version and whether CUDA is usable.
import torch, torchvision
torch_version = torch.__version__
cuda_available = torch.cuda.is_available()
print(torch_version, cuda_available)

# MMAction2: print the installed version.
import mmaction
print(mmaction.__version__)

# MMCV: print the CUDA version and the compiler the ops were built with.
from mmcv.ops import get_compiling_cuda_version, get_compiler_version
for report in (get_compiling_cuda_version, get_compiler_version):
    print(report())

# Silence warnings only after the imports above have run.
import warnings
warnings.filterwarnings('ignore')
import gc
from tqdm.auto import tqdm

1.9.0+cu111 True
0.24.0
11.1
GCC 7.3


## Download dataset from OneDrive

In [None]:
import base64

def download_onedrive(link):
    """Build a direct-download URL for a OneDrive sharing link.

    Despite its name, this function performs no network access: it only
    converts the sharing link into the matching
    ``https://api.onedrive.com/v1.0/shares/u!<token>/root/content`` URL,
    which can then be handed to a downloader such as ``wget``.

    The token is the sharing link encoded as URL-safe base64 with the
    trailing ``=`` padding removed.

    Args:
        link (str): OneDrive sharing link, e.g. ``https://1drv.ms/u/s!...``.
            Leading and trailing whitespace is ignored.

    Returns:
        str: The direct-download URL.

    Raises:
        ValueError: If ``link`` is empty or only whitespace.
    """
    link = link.strip()
    if not link:
        raise ValueError('link must be a non-empty OneDrive sharing URL')
    # UTF-8 gives the same bytes as ASCII for plain links but does not fail
    # on links that contain non-ASCII characters.
    encoded = base64.urlsafe_b64encode(link.encode('utf-8')).decode('ascii')
    token = encoded.rstrip('=')  # share tokens are unpadded base64url
    return f'https://api.onedrive.com/v1.0/shares/u!{token}/root/content'


In [None]:
# RepCountA
onedrive_link = download_onedrive('https://1drv.ms/u/s!AiohV3HRf-34ipwACYfKSHhkZzebrQ?e=T7PKHN')
!wget $onedrive_link -O rawframes.zip > /dev/null

!unzip rawframes.zip > /dev/null

--2022-05-18 00:28:26--  https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3UvcyFBaW9oVjNIUmYtMzRpcHdBQ1lmS1NIaGtaemViclE_ZT1UN1BLSE4=/root/content
Resolving api.onedrive.com (api.onedrive.com)... 13.107.42.12
Connecting to api.onedrive.com (api.onedrive.com)|13.107.42.12|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://b32b3w.bl.files.1drv.com/y4mY-S--iClqUx5BDHVHLBMpiUV2A6_wEkGylCdMHjSMhWPRdGtHlRRsbaZe4BrLyGXWFMSrj1Pm8TpIfyxhwdrH-SRAIjsEoroWCzgmGh8XiWjeKhF89-rM43utCRE_b5saIjEvqf3rFGrKZXAdZFeu47kj5Q2uTM0c1wiv1O-8NDDggfVfRiYxVLk9aDs4RrhcIcCO8TaST7eiCD8Nl3ucA/rawf.zip [following]
--2022-05-18 00:28:26--  https://b32b3w.bl.files.1drv.com/y4mY-S--iClqUx5BDHVHLBMpiUV2A6_wEkGylCdMHjSMhWPRdGtHlRRsbaZe4BrLyGXWFMSrj1Pm8TpIfyxhwdrH-SRAIjsEoroWCzgmGh8XiWjeKhF89-rM43utCRE_b5saIjEvqf3rFGrKZXAdZFeu47kj5Q2uTM0c1wiv1O-8NDDggfVfRiYxVLk9aDs4RrhcIcCO8TaST7eiCD8Nl3ucA/rawf.zip
Resolving b32b3w.bl.files.1drv.com (b32b3w.bl.files.1drv.com)... 13.107.42.12
Con

In [None]:
# Action labels; the position in this list is the integer class id.
classes = [
    'front_raise',
    'pull_up',
    'squat',
    'bench_pressing',
    'jump_jack',
    'situp',
    'push_up',
    'others',
    'battle_rope',
    'pommelhorse',
]

## Train a recognizer on customized dataset

In [None]:
%%capture
%%sh
# Create the target directory first: wget -O fails if it does not exist,
# and %%capture would hide that failure.
mkdir -p checkpoints
wget -c https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth \
    -O checkpoints/tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth

In [None]:
# Model recipe to start from, relative to the MMAction2 checkout.
config = (
    'configs/recognition/tsn/'
    'tsn_r50_320p_1x1x8_100e_kinetics400_rgb.py'
)
# Pre-trained weights fetched by the download cell above.
checkpoint = (
    'checkpoints/'
    'tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth'
)

### Modify the config

Next, we modify the config for training.
To speed up training, we fine-tune from a pre-trained recognizer instead of training from scratch.

In [None]:
import os.path as osp

from mmcv import Config
from mmcv.runner import set_random_seed

cfg = Config.fromfile(config)

DATASET = '/content/mmaction2/rawframes/'
NUM_CLASSES = len(classes)
SEED = 0

# Dataset type and locations.
cfg.dataset_type = 'RawframeDataset'
cfg.data_root = DATASET + 'train'
cfg.data_root_val = DATASET + 'val'
cfg.ann_file_train = DATASET + 'rawframes_train.txt'
cfg.ann_file_val = DATASET + 'rawframes_val.txt'
cfg.ann_file_test = DATASET + 'rawframes_test.txt'

# Setting the top-level names above does not update the per-split entries
# under cfg.data, so write the type, annotation file and frame directory
# into every split explicitly.
for split, ann_file, data_prefix in (
        ('train', cfg.ann_file_train, cfg.data_root),
        ('val', cfg.ann_file_val, cfg.data_root_val),
        ('test', cfg.ann_file_test, DATASET + 'test')):
    cfg.data[split].type = cfg.dataset_type
    cfg.data[split].ann_file = ann_file
    cfg.data[split].data_prefix = data_prefix

cfg.setdefault('omnisource', False)

# Resize the classification head to our label set.
cfg.model.cls_head.num_classes = NUM_CLASSES

# The original learning rate (LR) is set for 8-GPU training. We train on a
# single GPU with an 8x smaller per-GPU batch, and the LR is divided by 8
# and then by a further 16.
# NOTE(review): the extra factor of 16 is not explained by the batch-size
# change alone — confirm it is an intentional fine-tuning choice.
cfg.data.videos_per_gpu = max(1, cfg.data.videos_per_gpu // 8)
cfg.optimizer.lr = cfg.optimizer.lr / 8 / 16
cfg.total_epochs = 100
cfg.load_from = checkpoint  # start from the pre-trained weights
cfg.checkpoint_config.interval = 5

# Seed both the config and the RNGs with the same value.
cfg.seed = SEED
set_random_seed(SEED, deterministic=False)
cfg.gpu_ids = range(1)

# Save the best
cfg.evaluation.save_best = 'auto'

# Log to TensorBoard and Weights & Biases. The W&B run records a snapshot of
# the config as it stands at this point.
cfg.log_config = dict(
    interval=50,
    hooks=[
        dict(type='TensorboardLoggerHook'),
        dict(type='WandbLoggerHook', init_kwargs=dict(project='RepCount-cleaned',
                                                      config={**cfg})),
])

print(cfg.pretty_text)


model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet2Plus1d',
        depth=34,
        pretrained=None,
        pretrained2d=False,
        norm_eval=False,
        conv_cfg=dict(type='Conv2plus1d'),
        norm_cfg=dict(type='SyncBN', requires_grad=True, eps=0.001),
        conv1_kernel=(3, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(1, 1, 1, 1),
        spatial_strides=(1, 2, 2, 2),
        temporal_strides=(1, 2, 2, 2),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=10,
        in_channels=512,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
checkpoint_config = dict(interval=5)
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TensorboardLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project='RepCount-cle

### Train a new recognizer

In [None]:
import mmcv
from mmaction.apis import train_model
from mmaction.datasets import build_dataset
from mmaction.models import build_model

# Training data: a single dataset, wrapped in a list for train_model.
train_dataset = build_dataset(cfg.data.train)
datasets = [train_dataset]

# Recognizer described by the (modified) config.
model = build_model(
    cfg.model,
    train_cfg=cfg.get('train_cfg'),
    test_cfg=cfg.get('test_cfg'))

# Make sure the output directory exists, then train with validation enabled.
work_dir = osp.abspath(cfg.work_dir)
mmcv.mkdir_or_exist(work_dir)
train_model(model, datasets, cfg, distributed=False, validate=True)

## Test the trained recognizer

After finetuning the recognizer, let's check the prediction results!

In [None]:
from mmaction.apis import single_gpu_test
from mmaction.datasets import build_dataloader
from mmcv.parallel import MMDataParallel

# Build a test dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))
data_loader = build_dataloader(
        dataset,
        videos_per_gpu=1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=False,
        shuffle=False)

# Wrap the model only once so that re-running this cell does not nest
# MMDataParallel wrappers.
if not isinstance(model, MMDataParallel):
    model = MMDataParallel(model, device_ids=[0])
outputs = single_gpu_test(model, data_loader)

# Work on a copy: popping from cfg.evaluation itself would raise KeyError when
# this cell is re-run and would drop the evaluation interval from cfg if
# training is started again.
eval_config = dict(cfg.evaluation)
eval_config.pop('interval', None)
eval_res = dataset.evaluate(outputs, **eval_config)

[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 152/152, 2.4 task/s, elapsed: 64s, ETA:     0s
Evaluating top_k_accuracy ...

top1_acc	0.8487
top5_acc	0.9737

Evaluating mean_class_accuracy ...

mean_acc	0.7613
top1_acc: 0.8487
top5_acc: 0.9737
mean_class_accuracy: 0.7613


In [None]:
# Test on best model
import glob

from mmcv.runner import load_checkpoint

# With save_best enabled, training writes a `best_*.pth` checkpoint into the
# work dir; use the most recent one and fall back to `latest.pth` otherwise.
best_candidates = sorted(
    glob.glob(osp.join(cfg.work_dir, 'best_*.pth')), key=osp.getmtime)
if best_candidates:
    best_model_path = best_candidates[-1]
else:
    best_model_path = osp.join(cfg.work_dir, 'latest.pth')
print(f'Evaluating checkpoint: {best_model_path}')

# build_model only constructs the network; the weights must be loaded
# explicitly, otherwise an untrained model is evaluated. (Setting
# cfg.load_from has no effect here and would only change what a later
# re-run of training starts from.)
model_best = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
load_checkpoint(model_best, best_model_path, map_location='cpu')
model_best = MMDataParallel(model_best, device_ids=[0])
outputs = single_gpu_test(model_best, data_loader)

# Evaluate with a copy of the settings so cfg.evaluation stays untouched.
eval_config = dict(cfg.evaluation)
eval_config.pop('interval', None)
eval_res = dataset.evaluate(outputs, **eval_config)

### Inference on YouTube videos

In [None]:
%pip install yt-dlp -qqq
import yt_dlp

import os
def download_ytb(url, folder):
    # vid = url.split('?v=')[1]
    # link = url + vid
    ydl_opts = {
        'outtmpl': '%(id)s.%(ext)s',
        'quiet': True,
        'ignoreerrors': True,
        'format': '136',  # 136 for mp4 1280x720 25fps no audio. 137: 1080P
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

[K     |████████████████████████████████| 2.6 MB 5.3 MB/s 
[K     |████████████████████████████████| 357 kB 48.3 MB/s 
[K     |████████████████████████████████| 2.0 MB 35.1 MB/s 
[K     |████████████████████████████████| 218 kB 54.8 MB/s 
[K     |████████████████████████████████| 112 kB 49.3 MB/s 
[?25h

Modify the config so the recognizer reads a video file directly instead of extracted raw frames.

In [None]:
from mmaction.apis import inference_recognizer, init_recognizer

# Start from a fresh copy of the base config instead of aliasing `cfg`:
# the assignments below replace `data` and `dataset_type`, which would
# otherwise clobber the training/testing config used by the earlier cells.
# Only the model change made for training (the number of classes) has to be
# repeated here; keep this in sync if more model settings are modified above.
test_cfg = Config.fromfile(config)
test_cfg.model.cls_head.num_classes = len(classes)

test_cfg.dataset_type = 'VideoDataset'
test_cfg.img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Decode frames straight from the video file with Decord.
test_cfg.test_pipeline = [
    dict(type='DecordInit', num_threads=1),
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=16,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **test_cfg.img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]

test_cfg.data = dict(
    videos_per_gpu=1,
    workers_per_gpu=2,
    test=dict(
        type=test_cfg.dataset_type,
        ann_file=None,
        data_prefix=None,
        pipeline=test_cfg.test_pipeline))

# The training run writes `latest.pth` (the previous `lastest.pth` spelling
# does not exist in the work dir).
ckpt = '/content/mmaction2/work_dirs/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/latest.pth'
model_video = init_recognizer(test_cfg, ckpt, device='cuda:0')

load checkpoint from local path: /content/mmaction2/work_dirs/tsm_r50_1x1x16_50e_sthv2_rgb/best_top1_acc_epoch_58.pth


In [None]:
from urllib.parse import parse_qs, urlparse

url = 'https://www.youtube.com/watch?v=kRX2NfqM90g'
# The download is saved as `<video id>.mp4`. Read the id from the `v` query
# parameter so that URLs with extra parameters (e.g. `&t=10s`) still work;
# fall back to the last 11 characters for URLs without a `v` parameter.
video_id = parse_qs(urlparse(url).query).get('v', [url[-11:]])[0]
FILENAME = video_id + '.mp4'
download_ytb(url, 'demo')

[download] 100% of 5.83MiB in 00:00                                                  

### Run demo.

In [None]:
# Classify the downloaded clip and print each returned (class id, score)
# pair as "<label>, <score>".
results = inference_recognizer(model_video, FILENAME)
for class_idx, score in results:
    label = classes[class_idx]
    print(f'{label}, {score}')

[(2, 0.7956046), (3, 0.20423073), (8, 4.4019278e-05), (6, 3.4168923e-05), (1, 3.166302e-05)]
0.7956045866012573, squat
0.20423072576522827, bench_pressing
4.401927799335681e-05, battle_rope
3.416892286622897e-05, push_up
3.1663021218264475e-05, pull_up


In [None]:
# Check the video
from IPython.display import HTML
from base64 import b64encode
mp4 = open( FILENAME,'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=600 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)