In [29]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
from tqdm import tqdm
import sys
import av
import pandas as pd
import os
import cv2
from collections import OrderedDict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [30]:
data_dir = "dataset"
test_dir = f'{data_dir}/test'

In [31]:
test_label_df = pd.read_csv(f'{data_dir}/test_labels.csv', header=None)

In [32]:
test_label = {f"{test_dir}/{k[0]}": k[1] for k in test_label_df.values.tolist()}

In [33]:
total_label = pd.read_csv(f'{data_dir}/ClassId.csv')
n_classes = len(total_label['ClassId'].unique())
print("total unique label:", n_classes)

total unique label: 226


In [34]:
class_id_to_label = {k[0]: k[2] for k in total_label.values.tolist()}

In [35]:
print(class_id_to_label)

{0: 'sister', 1: 'hurry', 2: 'hungry', 3: 'enjoy_your_meal', 4: 'brother', 5: 'tree', 6: 'heavy', 7: 'cry', 8: 'family', 9: 'wise', 10: 'unwise', 11: 'kin', 12: 'shopping', 13: 'key', 14: 'mother', 15: 'friend', 16: 'ataturk', 17: 'shoe', 18: 'mirror', 19: 'same', 20: 'father', 21: 'garden', 22: 'look', 23: 'honey', 24: 'glass', 25: 'flag', 26: 'feast', 27: 'baby', 28: 'single', 29: 'wait', 30: 'I', 31: 'petrol', 32: 'together', 33: 'inform', 34: 'we', 35: 'work', 36: 'wednesday', 37: 'fork', 38: 'tea', 39: 'teapot', 40: 'hammer', 41: 'ugly', 42: 'child', 43: 'soup', 44: 'friday', 45: 'saturday', 46: 'wallet', 47: 'minute', 48: 'grandfather', 49: 'change', 50: 'topple', 51: 'government', 52: 'doctor', 53: 'full', 54: 'wedding', 55: 'yesterday', 56: 'enemy', 57: 'wall', 58: 'pharmacy', 59: 'glove', 60: 'labor', 61: 'retired', 62: 'male', 63: 'meal', 64: 'house', 65: 'yes', 66: 'married', 67: 'memorize', 68: 'elephant', 69: 'photograph', 70: 'football', 71: 'past', 72: 'get_well', 73: 'b

In [36]:
def extract_frames(vid_path, frames_cap, transforms=None):
    """Extract and transform video frames

    Parameters:
    vid_path (str): path to video file
    frames_cap (int): number of frames to extract, evenly spaced
    transforms (torchvision.transforms, optional): transformations to apply to frame

    Returns:
    list of numpy.array: vid_arr

    """
    vid_arr = []
    with av.open(vid_path) as container:
        stream = container.streams.video[0]
        n_frames = stream.frames
        remainder = n_frames % frames_cap
        interval = n_frames // frames_cap
        take_frame_idx = 0
        for frame_no, frame in enumerate(container.decode(stream)):
            if frame_no == take_frame_idx:
                img = frame.to_image()
                if transforms:
                    img = transforms(img)
                vid_arr.append(np.array(img))
                if remainder > 0:
                    take_frame_idx += 1
                    remainder -= 1
                take_frame_idx += interval
    if len(vid_arr) < frames_cap:
        raise ValueError(f"video with path '{vid_path}' is too short, please make sure that video has >={frames_cap} frames")
    return vid_arr

In [37]:
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [38]:
from Conv3D import r2plus1d_18

In [44]:
model = r2plus1d_18(pretrained=True, num_classes=226)
checkpoint = torch.load("saved_models3d/final_masked2/11.pt", map_location=torch.device("cpu"))
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

KeyError: 'model_state_dict'

In [None]:
#model = r2plus1d_18(pretrained=True, num_classes=226)
#checkpoint = torch.load("saved_models/cnn_lstm_512_6_1_512_drop80_weightd8/18-checkpoint.pt")
#model.load_state_dict(checkpoint['model_state_dict'])
#model.eval()

In [None]:
def extract_frames(vid, transforms = None, frames_cap = 30):
    
    selector = fix_frame(len(vid), frames_cap)
    output = []
    for e,frame in enumerate(vid):
        if e+1 in selector:
            output.append(frame)
    
    # edge case
    if len(vid) < frames_cap:
        remainder = frames_cap - len(vid)
        # take last frame
        last_frame = vid[-1]
        for _ in range(remainder):
            output.append(last_frame)
        
    return np.array(output)

In [None]:
def masking(rbg_vid, depth_vid):
    """
    input
        - path for rbg
        - path for depth
    output
        - array of numpy arrays
    """
    rbg_arr = []
    container_rbg = av.open(rbg_vid)

    for packet in container_rbg.demux():
        for frame in packet.decode():
            rbg_arr.append(np.array(frame.to_image()))

    depth_arr = []
    container_depth = av.open(depth_vid)

    for packet in container_depth.demux():
        for frame in packet.decode():
            depth_arr.append(np.array(frame.to_image()))
            
    # pose estimation
    #rbg_arr = pose_styling(rbg_arr)

    # display - correct color orientation
    overlay_arr = []
    for i in range(len(rbg_arr)):
        c = cv2.cvtColor(rbg_arr[i], cv2.COLOR_BGR2RGB)
        gray = cv2.cvtColor(depth_arr[i], cv2.COLOR_BGR2GRAY)
        overlay = cv2.bitwise_and(c,c, mask= gray)
        
        # resize and reshape
        overlay = cv2.resize(overlay, (256,256))
        
        # convert from (h , w, c) to (c, h, w)
        overlay_reshape = np.transpose(overlay, (2, 0, 1))
        
        overlay_arr.append(overlay_reshape)
        
    return np.array(overlay_arr)

In [None]:
def fix_frame(input_frame: int, output_frame: int) -> set:
    '''
    input
        - number of input frames
        - number of output frames
    output
        - a set of frames
    '''
    if input_frame < output_frame:
        print('Spotted video that have input frame: {} < output frame: {}'.format(input_frame, output_frame))
        return set([i for i in range(1, input_frame+1)])
    
    # create array to pick from
    pick_arr = []
    for i in range(1,input_frame+1):
        for r in range(output_frame):
            pick_arr.append(i)
            
    # decide on index to capture
    # e.g. frame 58//2 = 29
    ind = input_frame//2
    
    # capture frame
    output = set()
    i = 1
    batch = 0
    while (i + (batch * input_frame)) < len(pick_arr):
        if i == ind:
            output.add(pick_arr[i + (batch * input_frame) - 1])
        i+=1
        if i == input_frame + 1:
            i = 1
            batch += 1
    if len(output) != output_frame:
        raise ValueError('output does not have the same frame requirements. output: {}, required: {}'.format(len(output), output_frame))
    return output

In [16]:
def load_and_test_video(vid_name):
    """Load video from dataset and pass video to model

    Parameters:
    vid_name (str): video name to display

    """
    transforms_compose = transforms.Compose([transforms.Resize(256), 
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean=[0.5], std=[0.5])])
    vid_color_path = f"{test_dir}/{vid_name}_color.mp4"
    vid_depth_path = f"{test_dir}/{vid_name}_depth.mp4"
    rgb_arr = extract_frames(vid_color_path, 30, transforms=transforms_compose)
    vid_arr = np.array(rgb_arr)
    vid_arr = vid_arr/255
    #vid_arr = masking(vid_color_path, vid_depth_path)
    #vid_arr = extract_frames(vid_arr, 30)
    vid_arr = torch.from_numpy(vid_arr).float()
    #vid_arr = vid_arr.permute(1, 0, 2, 3)
    vid_arr = vid_arr.unsqueeze(0)
    predict_id = model.forward(vid_arr)
    predict_id = torch.max(predict_id, 1)[1].item()
    ground_truth_id = test_label[f"{test_dir}/{vid_name}"]
    return predict_id, class_id_to_label[predict_id], ground_truth_id, class_id_to_label[ground_truth_id]

In [17]:
# Load the test set and perform predictions on each sample. Add their ground truths and predictions to lists
df = pd.read_csv("dataset/test_labels.csv", names=["video_name", "class_id"])
predict_ids = []
predict_labels = []
ground_truth_ids = []
ground_truth_labels = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    predict_id, predict_label, ground_truth_id, ground_truth_label = load_and_test_video(row['video_name'])
    predict_ids.append(predict_id)
    predict_labels.append(predict_label)
    ground_truth_ids.append(ground_truth_id)
    ground_truth_labels.append(ground_truth_label)

# Load the test set and perform predictions on each sample. Add their ground truths and predictions to lists

df['predict_id'] = predict_ids
df['predict_label'] = predict_labels
df['ground_truth_id'] = ground_truth_ids
df['ground_truth_label'] = ground_truth_labels

100%|██████████| 3739/3739 [52:01<00:00,  1.20it/s]


In [18]:
df['predict_id'] = predict_ids
df['predict_label'] = predict_labels
df['ground_truth_id'] = ground_truth_ids
df['ground_truth_label'] = ground_truth_labels

In [19]:
df

Unnamed: 0,video_name,class_id,predict_id,predict_label,ground_truth_id,ground_truth_label
0,signer34_sample1,133,175,you,133,guest
1,signer34_sample2,61,164,request,61,retired
2,signer34_sample3,32,40,hammer,32,together
3,signer34_sample4,169,39,teapot,169,champion
4,signer34_sample5,77,77,show,77,show
...,...,...,...,...,...,...
3734,signer30_sample658,125,125,salary,125,salary
3735,signer30_sample659,191,191,ceiling,191,ceiling
3736,signer30_sample660,96,220,absent,96,medicine
3737,signer30_sample661,59,142,wood,59,glove


In [20]:
target_names = [class_id_to_label[key] if key in class_id_to_label.keys() else 0 for key in range(226)]
print(classification_report(ground_truth_ids, predict_ids, target_names=target_names))
report = classification_report(ground_truth_ids, predict_ids,target_names=target_names, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
df_classification_report.to_csv('normal_report.csv', encoding='utf-8')

                 precision    recall  f1-score   support

         sister       0.40      0.25      0.31        16
          hurry       0.00      0.00      0.00        16
         hungry       0.00      0.00      0.00        17
enjoy_your_meal       0.14      0.18      0.16        17
        brother       0.38      0.18      0.24        17
           tree       0.19      0.53      0.28        17
          heavy       0.21      0.18      0.19        17
            cry       0.00      0.00      0.00        17
         family       0.05      0.12      0.07        17
           wise       0.00      0.00      0.00        17
         unwise       0.03      0.12      0.05        17
            kin       0.10      0.06      0.08        16
       shopping       0.04      0.13      0.06        15
            key       0.00      0.00      0.00        16
         mother       0.14      0.06      0.08        17
         friend       0.56      0.33      0.42        15
        ataturk       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(f'Total test data:{len(df)}')
cm = confusion_matrix(ground_truth_ids, predict_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.array(target_names))
fig, ax = plt.subplots(figsize=(14,14))
disp.plot(ax=ax)

In [None]:
df.to_csv('masked_CNN_prediction.csv', encoding='utf-8')