-
Notifications
You must be signed in to change notification settings - Fork 0
/
actionRecognitionPytorch.py
109 lines (92 loc) · 3.29 KB
/
actionRecognitionPytorch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
"""
@file actionRecognitionPytorch.py .
@author Ayush Saini
@email saini712@gmail.com
@brief Action inference (Pytorch) from video folder.
@version 0.0.1
@date 2021-07-25
@copyright Copyright (c) 2021
"""
import argparse
import os
import albumentations as A
import cv2
import numpy as np
import torch
# construct the argument parser
parser = argparse.ArgumentParser()
# model path
parser.add_argument('-m', '--model-path', dest='model_path',
help='path to .pt model')
# path of folder containing videos for inference
parser.add_argument('-f', '--folder-path', dest='folder_path',
help='path of inference videos')
args = vars(parser.parse_args())
# define transforms according to model input
transform = A.Compose([
A.Resize(129, 171, always_apply=True),
A.CenterCrop(112, 112, always_apply=True),
A.Normalize(mean=[0.43216, 0.394666, 0.37645],
std=[0.22803, 0.22145, 0.216989],
always_apply=True)
])
def load_video(path):
"""Extract all frames from video and arrange in an array.
Args:
path ([String]): [path to video]
"""
all_frames = []
cap = cv2.VideoCapture(path)
if (cap.isOpened() is False):
print('Error while trying to read video. Please check path again')
# read until end of video
while(cap.isOpened()):
# capture each frame of the video
ret, frame = cap.read()
if ret is True:
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = transform(image=frame)['image']
# append to array
all_frames.append(frame)
else:
break
cap.release()
return all_frames
def predict(frames, model):
"""Predict top 2 actions taking in array of frames.
Args:
frames ([array]): [array of frames]
"""
with torch.no_grad(): # we do not want to backprop any gradients
input_frames = np.array(frames)
# add an extra dimension
input_frames = np.expand_dims(input_frames, axis=0)
# transpose to get [1, 3, num_clips, height, width]
input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3))
# convert the frames to tensor
input_frames = torch.tensor(input_frames, dtype=torch.float32)
input_frames = input_frames.to(device)
# forward pass to get the predictions
outputs = model(input_frames)
# softmax layer to get probablities
percentage = torch.nn.functional.softmax(outputs, dim=1)[0]
# get the prediction index
_, preds = torch.sort(outputs.data, descending=True)
result = [(
class_names[idx].strip(), f'{percentage[idx] * 100:5.2f}%')
for idx in preds[0][:2]]
return result
# read the class names from labels.txt
with open('labels.txt', 'r') as f:
class_names = f.readlines()
f.close()
# get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# We are using pretrained Resnet3D18 https://arxiv.org/abs/1711.11248
r3d = torch.load(args['model_path'])
r3d = r3d.eval().to(device)
video_folder_path = args['folder_path']
# iterate through all video and print top 2 inference
for video in os.listdir(video_folder_path):
print(video + ' '+str(predict(load_video(video_folder_path+video), r3d)))