In [1]:
# matplotlib
import matplotlib.pyplot as plt

# numpy
import numpy as np

# torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms

# misc
import time
from datetime import datetime
from torchinfo import summary

# preprocessing 
import av
import pandas as pd
import matplotlib.pyplot as plt
import cv2


# helper classes and functions
from train_model import train
from model import CNN_LSTM, VGG_LSTM
from Conv3D import r2plus1d_18
from collections import OrderedDict

# logging
import os
import logging
from torch.utils.tensorboard import SummaryWriter

In [2]:
# create relevant directories

# One folder each for logs ("log3d"), runs ("runs3d") and saved models ("saved_models3d").
for out_dir in ("log3d", "runs3d", "saved_models3d"):
    # leave anything that already exists at that path untouched
    if os.path.exists(out_dir):
        continue
    os.mkdir(out_dir)

In [3]:
# One timestamp shared by the log file and the tensorboard run so their names always match
# (two separate datetime.now() calls can straddle a second boundary).
run_stamp = "{:%Y-%m-%d_%H-%M-%S}".format(datetime.now())

# Write into the "log3d" / "runs3d" folders used by this notebook. The previous
# "log/" prefix pointed at a folder this notebook never creates, and
# logging.FileHandler does not create missing parent folders, so a fresh
# checkout failed with FileNotFoundError.
log_path = "log3d/cnnlstm_{}.log".format(run_stamp)
sum_path = "runs3d/cnnlstm_{}".format(run_stamp)

# Make this cell self-sufficient: ensure both parent folders exist before they are used.
for parent_dir in (os.path.dirname(log_path), os.path.dirname(sum_path)):
    os.makedirs(parent_dir, exist_ok=True)

# Log to file & tensorboard writer
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, so re-running this cell in the same kernel keeps writing to the first log file.
logging.basicConfig(level=logging.INFO, format='%(message)s', handlers=[logging.FileHandler(log_path), logging.StreamHandler()])
logger = logging.getLogger('signo-lingo')
logger.info('Logging to file...')
writer = SummaryWriter(sum_path)

Logging to file...


In [4]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [5]:
# Root folder of the dataset and its three split sub-folders.
data_dir = "dataset"
train_dir, val_dir, test_dir = (
    f'{data_dir}/{split}' for split in ('train', 'val', 'test')
)

In [6]:
root_path = "dataset"


def _list_split_files(split):
    """Return one joined path per directory entry of '<root_path>/<split>'."""
    split_dir = '{}/{}'.format(root_path, split)
    return [os.path.join(split_dir, entry) for entry in os.listdir(split_dir)]


# Every file found in each split folder (order is whatever os.listdir returns).
train_path = _list_split_files('train')
val_path = _list_split_files('val')
test_path = _list_split_files('test')

In [7]:
# All labels
def _read_label_csv(csv_name):
    """Read a header-less CSV located directly under root_path."""
    return pd.read_csv(r'{}/{}'.format(root_path, csv_name), header=None)


def _to_label_map(label_df):
    """Map the first column of every row to the second column of that row."""
    return {row[0]: row[1] for row in label_df.values.tolist()}


train_label_df = _read_label_csv('train_labels.csv')
test_label_df = _read_label_csv('test_labels.csv')
val_label_df = _read_label_csv('val_labels.csv')

# convert all into hashmap - key = u_vid_name , value = label
train_label = _to_label_map(train_label_df)
test_label = _to_label_map(test_label_df)
val_label = _to_label_map(val_label_df)

In [8]:
# Table of all classes; the number of distinct 'ClassId' values is the label count.
total_label = pd.read_csv(r'{}/ClassId.csv'.format(root_path))
class_ids = total_label['ClassId']
u_len_label = len(class_ids.unique())
print("total unique label:", u_len_label)

total unique label: 226


In [9]:
# pick fix frames
def fix_frame(input_frame: int, output_frame: int) -> set:
    '''
    Choose which 1-based frame numbers to keep so that a clip with
    ``input_frame`` frames is reduced to ``output_frame`` spread-out frames.

    Worked example (input_frame = 5, output_frame = 3)
        step 1. repeat every frame number ``output_frame`` times
            [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5]
        step 2. cut that list into ``output_frame`` sections of ``input_frame`` entries
            [1, 1, 1, 2, 2, | 2, 3, 3, 3, 4, | 4, 4, 5, 5, 5]
        step 3. find the position to pick inside a section
            5 // 2 = 2
        step 4. take the entry at that position from each section
            [1, 1*, 1, 2, 2, | 2, 3*, 3, 3, 4, | 4, 4*, 5, 5, 5]
        step 5. return those frame numbers as a set
            set([1, 3, 4])

    input
        - input_frame: number of input frames
        - output_frame: number of output frames
    output
        - a set of 1-based frame numbers; when input_frame < output_frame a
          message is printed and every frame number 1..input_frame is returned
    raises
        - ValueError when the number of picked frames differs from output_frame
    '''
    if input_frame < output_frame:
        print('Spotted video that have input frame: {} < output frame: {}'.format(input_frame, output_frame))
        return set(range(1, input_frame + 1))

    # 1-based position picked inside every section (step 3). For a single-frame
    # clip input_frame // 2 is 0, a position that never exists, which used to make
    # fix_frame(1, 1) raise; clamp to the first position instead.
    ind = max(input_frame // 2, 1)

    # The repeated list of step 1 holds frame number (p // output_frame) + 1 at
    # 0-based position p, so each pick is computed directly instead of building
    # and scanning input_frame * output_frame entries.
    output = {
        (ind - 1 + section * input_frame) // output_frame + 1
        for section in range(output_frame)
    }

    if len(output) != output_frame:
        raise ValueError('output does not have the same frame requirements. output: {}, required: {}'.format(len(output), output_frame))
    return output

In [10]:
def extract_frames(vid, transforms = None, frames_cap = 30):
    """
    Reduce (or pad) a sequence of frames to ``frames_cap`` entries.

    input
        - vid: indexable sequence of frames (supports len(), iteration and vid[-1])
        - transforms: accepted but not used by this function
        - frames_cap: number of frames wanted in the result
    output
        - numpy array built from the kept frames; when the clip is shorter than
          frames_cap the last frame is repeated to fill the gap
    """
    # 1-based frame numbers chosen by fix_frame
    keep = fix_frame(len(vid), frames_cap)
    picked = [frame for number, frame in enumerate(vid, start=1) if number in keep]

    # edge case: clip shorter than requested -> pad with copies of the last frame
    missing = frames_cap - len(vid)
    if missing > 0:
        picked.extend([vid[-1]] * missing)

    return np.array(picked)

In [11]:
# mask rbg image
def _read_video_frames(video_path):
    """Decode every frame of the first video stream of ``video_path`` into a list of numpy arrays."""
    # The context manager closes the container (and its file handle) even when
    # decoding raises; previously the containers were never closed.
    with av.open(video_path) as container:
        # decode(video=0) yields only frames of the first video stream, so a file
        # that also carries e.g. an audio track no longer breaks on to_image().
        return [np.array(frame.to_image()) for frame in container.decode(video=0)]


def masking(rbg_vid, depth_vid):
    """
    Mask every colour frame with the matching depth frame.

    input
        - path for rbg
        - path for depth
    output
        - numpy array with one (channels, 256, 256) entry per frame pair;
          pixels whose grayscale depth value is 0 are blanked
    """
    rbg_arr = _read_video_frames(rbg_vid)
    depth_arr = _read_video_frames(depth_vid)

    overlay_arr = []
    # zip stops at the shorter clip. Extra depth frames were already ignored
    # before; a shorter depth clip used to raise IndexError part-way through.
    for rbg_frame, depth_frame in zip(rbg_arr, depth_arr):
        # NOTE(review): frame.to_image() gives a PIL image, presumably already in
        # RGB order, so this conversion swaps the channels rather than "correcting"
        # them. Left unchanged because trained weights may depend on it - confirm.
        c = cv2.cvtColor(rbg_frame, cv2.COLOR_BGR2RGB)
        gray = cv2.cvtColor(depth_frame, cv2.COLOR_BGR2GRAY)

        # keep colour pixels only where the depth mask is non-zero
        overlay = cv2.bitwise_and(c, c, mask=gray)

        # resize and reshape
        overlay = cv2.resize(overlay, (256, 256))

        # convert from (h, w, c) to (c, h, w)
        overlay_arr.append(np.transpose(overlay, (2, 0, 1)))

    return np.array(overlay_arr)

In [12]:
class Turkish_Dataset(Dataset):
    """
    Dataset of paired colour/depth sign videos.

    Files are expected to be named ``<video name>_color.<ext>`` and
    ``<video name>_depth.<ext>``; ``labels`` maps ``<video name>`` to an
    integer class index.

    input
        - paths: iterable of file paths (either "\\" or "/" separators)
        - labels: mapping video name -> class index
        - n_frames: number of frames every returned clip is cut/padded to
        - n_classes: number of valid class indices (labels must be in [0, n_classes))
    """

    def __init__(self, paths, labels, n_frames=30, n_classes=226):
        self.paths = paths
        self.labels = labels
        self.n_frames = n_frames
        self.n_classes = n_classes
        self._get_unique()

    def _get_unique(self):
        """Group ``self.paths`` by video name into colour and depth lookups.

        raises
            - ValueError when a file name ends in neither ``_color`` nor ``_depth``
        """
        u_vid_depth = {}
        u_vid_color = {}
        u_vid = set()
        for path in self.paths:
            # Accept both separators: os.path.join yields "\\" on Windows and "/"
            # elsewhere. Splitting on "\\" only left the folder in the video name
            # on POSIX, so the label lookup could never match.
            file_name = os.path.basename(path.replace("\\", "/"))
            vid = file_name.split(".")[0]  # signer0_sample1_color or signer0_sample1_depth
            # text after the last "_" is the type, everything before it is the name
            vid_name, _, vid_type = vid.rpartition("_")  # signer0_sample1, color
            if vid_type == "color":
                u_vid_color[vid_name] = path
            elif vid_type == "depth":
                u_vid_depth[vid_name] = path
            else:
                raise ValueError(
                    'Detected vid type as neither color nor depth. type is {} (path: {})'.format(vid_type, path)
                )
            u_vid.add(vid_name)
        self.u_vid_depth = u_vid_depth
        self.u_vid_color = u_vid_color
        # Sorted so an index refers to the same video in every run; the iteration
        # order of a set of strings can change between interpreter sessions.
        # NOTE(review): a name with only a colour or only a depth file stays in this
        # list and fails with KeyError when it is indexed.
        self.u_vid = sorted(u_vid)

    def __getitem__(self, index):
        """Return ``(video, label)`` for the index-th video name.

        ``video`` is a float tensor whose first two axes of the preprocessed
        clip are swapped by ``permute(1, 0, 2, 3)``; ``label`` is a 0-dim int64 tensor.

        raises
            - KeyError when the label or one of the two files is missing
            - IndexError when the label is outside [0, n_classes)
        """
        vid_name = self.u_vid[index]
        vid_label = self.labels[vid_name]

        vid_color = self.u_vid_color[vid_name]
        vid_depth = self.u_vid_depth[vid_name]

        # preprocessing
        vid_arr = masking(vid_color, vid_depth)
        # pass the cap by keyword: the second positional parameter of
        # extract_frames is ``transforms``, not the frame count
        vid_arr = extract_frames(vid_arr, frames_cap=self.n_frames)

        # The class index is returned directly; building a one-hot vector only to
        # argmax it again produced the same value.
        if not 0 <= vid_label < self.n_classes:
            raise IndexError(
                'label {} of {} is outside the valid range [0, {})'.format(vid_label, vid_name, self.n_classes)
            )

        # convert arr to tensors
        vid_arr = torch.from_numpy(vid_arr).float()
        vid_arr = vid_arr.permute(1, 0, 2, 3)
        label = torch.tensor(vid_label, dtype=torch.long)

        # return masked video array and label
        return vid_arr, label

    def __len__(self):
        """Number of distinct video names found in ``paths``."""
        return len(self.u_vid)

In [13]:
n_frames = 30

# Image pipeline: resize to 256, convert to a tensor, then normalise with mean 0.5 / std 0.5.
transforms_compose = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

In [14]:
# create train dataset
ld_train = Turkish_Dataset(paths=train_path, labels=train_label)

# show image but clip rbg values


In [15]:
# create test dataset
ld_test = Turkish_Dataset(paths=test_path, labels=test_label)
# indexing item 0 runs the preprocessing once, so this doubles as a quick sanity check
print("shape of first array", ld_test[0][0].shape)

# show image but clip rbg values


shape of first array torch.Size([3, 30, 256, 256])


In [16]:
# create val dataset
ld_val = Turkish_Dataset(paths=val_path, labels=val_label)
# indexing item 0 runs the preprocessing once, so this doubles as a quick sanity check
print("shape of first array", ld_val[0][0].shape)


shape of first array torch.Size([3, 30, 256, 256])


In [19]:
# batch sizes (one clip per batch)
bs_train = 1
bs_test = 1
bs_val = 1

# Only the training data is shuffled. Evaluation loaders keep a fixed order so
# per-sample results are comparable between runs.
train_loader = DataLoader(ld_train, batch_size = bs_train, shuffle = True)
test_loader = DataLoader(ld_test, batch_size = bs_test, shuffle = False)
val_loader = DataLoader(ld_val, batch_size = bs_val, shuffle = False)

In [20]:
# Build the network with the 500-way head that the checkpoint below is loaded into.
model = r2plus1d_18(pretrained=True, num_classes=500)

# load pretrained
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint saved from a GPU load on a machine without one;
# load_state_dict copies the values into the model's own parameters afterwards.
checkpoint = torch.load('slr_resnet2d+1.pth', map_location='cpu')

prefix = 'module.'
new_state_dict = OrderedDict()
for k, v in checkpoint.items():
    # remove 'module.' only when the key really starts with it; slicing seven
    # characters unconditionally mangles keys that have no such prefix
    name = k[len(prefix):] if k.startswith(prefix) else k
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)

# swap the classification head for one with 226 outputs
model.fc1 = nn.Linear(model.fc1.in_features, 226)

In [21]:
# Input size handed to torchinfo: a leading batch entry of 1 followed by the
# [3, 30, 256, 256] shape of one dataset clip.
summary_input_size = (1, 3, 30, 256, 256)
summary(model, input_size=summary_input_size)

Layer (type:depth-idx)                             Output Shape              Param #
r2plus1d_18                                        [1, 226]                  --
├─Sequential: 1-1                                  [1, 512, 1, 1, 1]         --
│    └─R2Plus1dStem: 2-1                           [1, 64, 30, 128, 128]     --
│    │    └─Conv3d: 3-1                            [1, 45, 30, 128, 128]     6,615
│    │    └─BatchNorm3d: 3-2                       [1, 45, 30, 128, 128]     90
│    │    └─SiLU: 3-3                              [1, 45, 30, 128, 128]     --
│    │    └─Conv3d: 3-4                            [1, 64, 30, 128, 128]     8,640
│    │    └─BatchNorm3d: 3-5                       [1, 64, 30, 128, 128]     128
│    │    └─SiLU: 3-6                              [1, 64, 30, 128, 128]     --
│    └─Sequential: 2-2                             [1, 64, 30, 128, 128]     --
│    │    └─BasicBlock: 3-7                        [1, 64, 30, 128, 128]     222,016
│    │    └─BasicBlock:

In [22]:
# Training settings: number of epochs and optimizer learning rate.
no_of_epochs, optimizer_lr = 1000, 1e-5
# Location handed to the training helper as save_dir.
save_dir = "saved_models3d/final_masked2"

In [23]:
import gc

# Empty the CUDA cache, run the Python garbage collector, then empty the cache again.
for release in (torch.cuda.empty_cache, gc.collect, torch.cuda.empty_cache):
    release()

In [24]:
# Keyword options forwarded to train(), gathered in one place so they can be read at a glance.
train_options = dict(
    save_dir=save_dir,
    device=device,
    patience=10,
    optimizer_lr=optimizer_lr,
    use_scheduler=True,
)
train(model, train_loader, val_loader, no_of_epochs, logger, writer, **train_options)

######################Training Started######################
Epoch 1
100%|██████████| 28123/28123 [5:49:44<00:00,  1.34batch/s, accuracy=0, loss=5.25]      
Average Training Loss of Epoch 1: 5.324275 | Acc: 0.90%
100%|██████████| 4413/4413 [23:05<00:00,  3.19batch/s, accuracy=0, loss=5.44]
Average Validation Loss of Epoch 1: 5.457649 | Acc: 0.25%
####################Epoch 1 Model Saved#####################
Epoch 2
100%|██████████| 28123/28123 [5:56:39<00:00,  1.31batch/s, accuracy=0, loss=4.19]     
Average Training Loss of Epoch 2: 4.636559 | Acc: 4.19%
100%|██████████| 4413/4413 [22:15<00:00,  3.30batch/s, accuracy=0, loss=5.22]
Average Validation Loss of Epoch 2: 5.566875 | Acc: 0.50%
####################Epoch 2 Model Saved#####################
Increment early stopper to 1 because val loss (5.566875257217611) is greater than threshold (5.4576494889872)
Epoch 3
100%|██████████| 28123/28123 [4:41:08<00:00,  1.67batch/s, accuracy=0, loss=4.99]   
Average Training Loss of Epoch 3: 3.936

Epoch     5: reducing learning rate of group 0 to 1.0000e-06.


100%|██████████| 28123/28123 [5:04:18<00:00,  1.54batch/s, accuracy=1, loss=1.86]     
Average Training Loss of Epoch 6: 2.243220 | Acc: 44.38%
100%|██████████| 4413/4413 [22:44<00:00,  3.23batch/s, accuracy=0, loss=6.86]
Average Validation Loss of Epoch 6: 5.759455 | Acc: 0.16%
####################Epoch 6 Model Saved#####################
Increment early stopper to 5 because val loss (5.759455168946865) is greater than threshold (5.4576494889872)
Epoch 7
100%|██████████| 28123/28123 [4:42:27<00:00,  1.66batch/s, accuracy=0, loss=2.51]   
Average Training Loss of Epoch 7: 2.127816 | Acc: 48.69%
100%|██████████| 4413/4413 [22:26<00:00,  3.28batch/s, accuracy=0, loss=4.07]
Average Validation Loss of Epoch 7: 5.682339 | Acc: 0.11%
####################Epoch 7 Model Saved#####################
Increment early stopper to 6 because val loss (5.6823389318398325) is greater than threshold (5.4576494889872)
Epoch 8
100%|██████████| 28123/28123 [6:51:29<00:00,  1.14batch/s, accuracy=0, loss=2.34]  

Epoch     9: reducing learning rate of group 0 to 1.0000e-07.


100%|██████████| 28123/28123 [4:41:05<00:00,  1.67batch/s, accuracy=0, loss=2.85]    
Average Training Loss of Epoch 10: 1.893262 | Acc: 56.92%
100%|██████████| 4413/4413 [22:36<00:00,  3.25batch/s, accuracy=0, loss=5.25]
Average Validation Loss of Epoch 10: 5.753560 | Acc: 0.05%
####################Epoch 10 Model Saved####################
Increment early stopper to 9 because val loss (5.753559931967498) is greater than threshold (5.4576494889872)
Epoch 11
100%|██████████| 28123/28123 [4:41:54<00:00,  1.66batch/s, accuracy=0, loss=2.43]    
Average Training Loss of Epoch 11: 1.882564 | Acc: 57.78%
100%|██████████| 4413/4413 [22:44<00:00,  3.24batch/s, accuracy=0, loss=5.03]
Average Validation Loss of Epoch 11: 5.780108 | Acc: 0.11%
####################Epoch 11 Model Saved####################
Increment early stopper to 10 because val loss (5.780107849826895) is greater than threshold (5.4576494889872)
Model has overfit, early stopping...
#####################Training Finished###########

r2plus1d_18(
  (r2plus1d_18): Sequential(
    (0): R2Plus1dStem(
      (0): Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
      (1): BatchNorm3d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
      (3): Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (4): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv2Plus1D(
            (0): Conv3d(64, 144, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
            (1): BatchNorm3d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
            (3): Conv3d(144, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
          )
          (1): BatchNorm3d(64, eps=