In [1]:
%load_ext autoreload
%autoreload 2
from rekall import Interval, IntervalSet, IntervalSetMapping, Bounds3D
from rekall.predicates import *
from vgrid import VGridSpec, VideoMetadata, VideoBlockFormat, FlatFormat, SpatialType_Bbox, SpatialType_Keypoints, Metadata_Keypoints
from vgrid_jupyter import VGridWidget
import os, json
import pandas as pd
from const import *


### Load in video data.

In [42]:
SINGLE_VIDEO = True
test_id = 2
# load in video metadata
video_collection_intel = [
    {'num_frames': 3053, 'height': 720, 'width': 406, 'fps': 29.84, 'filename': 'dally_sy.mp4', 'id': 0},
    {'num_frames': 1488, 'height': 720, 'width': 1280, 'fps': 30, 'filename': 'hip_emily.mp4', 'id': 1},
    {'num_frames': 2062, 'height': 720, 'width': 1280, 'fps': 30, 'filename': '7thsense_mayee.mp4', 'id': 2}
    ]
if SINGLE_VIDEO:
    video_collection_intel = [video_collection_intel[2]]
video_metadata_intel = [
    VideoMetadata(v["filename"], v["id"], v["fps"], int(v["num_frames"]), v["width"], v["height"])
    for v in video_collection_intel
]

In [3]:
# load in openpose output data
json_dir_sy = "C:/Users/heidi/Documents/seniorproject/openpose-1.5.1-binaries-win64-only_cpu-python-flir-3d/openpose-1.5.1-binaries-win64-only_cpu-python-flir-3d/openpose/long_output/"
json_dir_emily = "C:/Users/heidi/Documents/seniorproject/data/hip_output/"
json_dir_mayee = "C:/Users/heidi/Documents/seniorproject/data/7thsense_output/"
json_dirs = [json_dir_sy, json_dir_emily, json_dir_mayee]
# json_dirs = [json_dir_mayee]

if SINGLE_VIDEO:
    json_dirs = [json_dirs[test_id]]

def load_openpose_data(keypoint_files, vm):
    """
    Loads openpose data for single video.
    Arguments:
        keypoint_files: list of keypoint.json file names for one video
            each json file contains dictionary of openpose output info for one frame. 
        vm: VideoMetadata object for one video
    Returns:
        frame_list: list of frames where each frame is a dictionary of part idxs to points {0: [x y conf], 1: etc.} 
    """
    frame_list = []
    for js in keypoint_files:
        with open(js) as json_file:
            keypoint_data = json.load(json_file)
#             if (len(keypoint_data['people']) > 1):
#                 print (js)
            if (len(keypoint_data['people']) != 0 and len(keypoint_data['people'][0]) != 0):
                pose_keypoints_2d = keypoint_data['people'][0]['pose_keypoints_2d']
            else: # fill in empty frames w/ 0s
                pose_keypoints_2d = [0 for i in range(75)]

            part_data = {}
            for index in BODY25_MAPPING:
                keypoint_index = index * POINTS_PER_PART
                part_data[index] = pose_keypoints_2d[keypoint_index : keypoint_index + POINTS_PER_PART]
                if len(part_data[index]) != 0: #normalize
                    part_data[index][0] /= vm.width
                    part_data[index][1] /= vm.height

            frame_list.append(part_data)
    return frame_list
    
data_list = []
for path_to_json, vm in zip(json_dirs, video_metadata_intel):
    keypoint_files = [os.path.join(path_to_json, pos_json) for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
    data_list.append(load_openpose_data(keypoint_files, vm))
    
#TODO expand for more than one vid
frame_list = data_list[-1]


### Annotate videos with open pose data, bounding boxes.

In [8]:
def get_op_bbox(frame):
    """
    Arguments: 
        frame: dictionary of joint indices to normalized coords [x, y, conf]. ie {0: [.5, .5, .98]}
    Returns 4 normalized bounding box coordinates x1, x2, y1, y2
    """
    x1 = 1
    x2 = 0
    y1 = 1
    y2 = 0
    for key in frame:
        joint = frame[key]
        if len(joint) != 0:
            if (joint[0] != 0):
                x1 = min(x1, joint[0])
                x2 = max(x2, joint[0])
            if (joint[1] != 0):
                y1 = min(y1, joint[1])
                y2 = max(y2, joint[1])
        
    return x1, x2, y1, y2
        
        

In [9]:
# create intervalset mapping w/ pose visualizing data

vm = video_metadata_intel[test_id]
bboxes = [get_op_bbox(frame) for frame in frame_list]
interval_mapping = IntervalSetMapping({
        vm.id: IntervalSet([
            Interval(
                Bounds3D(
                    t1 = frame_num / vm.fps,
                    t2 = (frame_num + 1) / vm.fps,
                    x1 = bboxes[frame_num][0],
                    x2 = bboxes[frame_num][1],
                    y1 = bboxes[frame_num][2],
                    y2 = bboxes[frame_num][3]
                ),
                
                {'spatial_type': SpatialType_Keypoints(),
                    'metadata': {
                        # This function can also parse faces and hands
                        'pose': Metadata_Keypoints(pose, BODY25_EDGES)
                    }
                }
                
            )
            for frame_num, pose in enumerate(frame_list)
        ])
})



### Visualize OP data


In [10]:
# visualize
def visualize_helper(video_metadata_intel, interval_mapping):
    vgrid_spec = VGridSpec(
        video_meta = video_metadata_intel,
        vis_format = VideoBlockFormat(imaps = [
            ('bboxes', interval_mapping)
        ]),
        video_endpoint = 'http://localhost:8000'
    )
    return VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())
        
visualize_helper(video_metadata_intel, interval_mapping)

VGridWidget(vgrid_spec={'compressed': True, 'data': b'x\x9c\xec\xbd\xdb\x8el\xc9\x8d\xa6\xf9*\x85\xbcN\x08v>\x…

### Generate interval mapping for example dance move: hands up

In [13]:
def get_coord(interval, joint_idx, coord_idx):
    md = interval['payload']['metadata']['pose'].to_json()
    return md['args']['keypoints'][joint_idx][coord_idx]
              
Rwrist = 4
Lwrist = 7
Neck = 15 # this is actually eyes but ... forget it
hands_up = interval_mapping.filter(lambda interval: 
                                   get_coord(interval, Rwrist, 1) < get_coord(interval, Neck, 1)
                                  and  get_coord(interval, Lwrist, 1) < get_coord(interval, Neck, 1))


0

In [14]:
visualize_helper(video_metadata_intel, hands_up)

VGridWidget(vgrid_spec={'compressed': True, 'data': b'x\x9c\xec\xbd\xdd\xae4\xd7\x91\x1c\xfa*\x03^\x13\xc2\xfa…

### Hand annotate video for this move.

In [15]:
#HAND ANNOTATE
vgrid_spec = VGridSpec(
        video_meta = video_metadata_intel,
        vis_format = VideoBlockFormat(imaps = [
            ('bboxes', interval_mapping)
        ]),
        video_endpoint = 'http://localhost:8000'
    )
widget =  VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed()) 


In [16]:
widget

VGridWidget(vgrid_spec={'compressed': True, 'data': b'x\x9c\xec\xbd\xdb\x8el\xc9\x8d\xa6\xf9*\x85\xbcN\x08v>\x…

In [18]:
# widget_output = hands_up[0].get_intervals()[0]['payload']
output_widget = True
widget_file = '7th_sense_mayee_annotated.json'
if output_widget:
    with open(widget_file, 'w') as f:
        json.dump(widget.label_state, f)

In [19]:
# TIME TO START EVALUATING YO
# with open(widget_file, 'w') as f:
#     widget_labels = json.loads(f)



{'blocks_selected': {},
 'block_labels': {'0': {'captions_selected': [],
   'new_intervals': [{'bounds': {'t1': 17.502203,
      't2': 17.725549,
      'bbox': {'x1': 0, 'x2': 1, 'y1': 0, 'y2': 1}},
     'data': {'spatial_type': {'args': {}}, 'metadata': {}}},
    {'bounds': {'t1': 21.075739,
      't2': 21.745777,
      'bbox': {'x1': 0, 'x2': 1, 'y1': 0, 'y2': 1}},
     'data': {'spatial_type': {'args': {}}, 'metadata': {}}},
    {'bounds': {'t1': 24.22267,
      't2': 24.22267001864799,
      'bbox': {'x1': 0, 'x2': 1, 'y1': 0, 'y2': 1}},
     'data': {'spatial_type': {'args': {}}, 'metadata': {}}},
    {'bounds': {'t1': 51.055636,
      't2': 52.160523,
      'bbox': {'x1': 0, 'x2': 1, 'y1': 0, 'y2': 1}},
     'data': {'spatial_type': {'args': {}}, 'metadata': {}}},
    {'bounds': {'t1': 53.896773,
      't2': 54.528137,
      'bbox': {'x1': 0, 'x2': 1, 'y1': 0, 'y2': 1}},
     'data': {'spatial_type': {'args': {}}, 'metadata': {}}},
    {'bounds': {'t1': 65.261324,
      't2': 66.

## Evaluate results.

In [74]:
import math
import numpy as np 
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score #inputs: y_truth, y_predict

def get_action_segments(interval_list, vm, segment_length = 1):
    action_segments = np.zeros(math.ceil((vm.num_frames / vm.fps) / segment_length))
    for interval in interval_list:
        bounds = interval['bounds']
        t1 = bounds['t1'] 
        t2 = bounds['t2'] 
        while (t1 <= t2):
            segment_idx = math.floor(t1 / segment_length)
            action_segments[segment_idx] = 1 #mark as 1 for an event!
            t1 += segment_length
    return action_segments

def evaluate(intervals_predict, intervals_truth, vm, segment_length):
    y_predict = get_action_segments(intervals_predict, vm, segment_length)
    y_truth = get_action_segments(intervals_truth, vm, segment_length)
    print("Evaluating with segment length = {}s".format(segment_length))
    print("Recall: {}".format(recall_score(y_truth, y_predict)))
    print("Precision: {}".format(precision_score(y_truth, y_predict)))
    print("F1 score: {}".format(f1_score(y_truth, y_predict)))
    print("Accuracy: {}".format(accuracy_score(y_truth, y_predict)))
    print()
    
interval_list = widget.label_state['block_labels']['0']['new_intervals']
vm = video_metadata_intel[-1]
segment_length = .1 # in terms of seconds / times instead of # frames, since time is a constant measure across vids and is more intuitive

rekall_labels = hands_up[test_id].get_intervals()
hand_labels = interval_list

for i in range(1, 20, 2):
    segment_length = i / 10.0
    evaluate(rekall_labels, hand_labels, vm, segment_length)

Evaluating with segment length = 0.1s
Recall: 0.30357142857142855
Precision: 0.34
F1 score: 0.32075471698113206
Accuracy: 0.8953488372093024

Evaluating with segment length = 0.3s
Recall: 0.391304347826087
Precision: 0.3103448275862069
F1 score: 0.34615384615384615
Accuracy: 0.8521739130434782

Evaluating with segment length = 0.5s
Recall: 0.6666666666666666
Precision: 0.4166666666666667
F1 score: 0.5128205128205129
Accuracy: 0.8623188405797102

Evaluating with segment length = 0.7s
Recall: 0.75
Precision: 0.375
F1 score: 0.5
Accuracy: 0.8181818181818182

Evaluating with segment length = 0.9s
Recall: 0.5454545454545454
Precision: 0.3333333333333333
F1 score: 0.41379310344827586
Accuracy: 0.7792207792207793

Evaluating with segment length = 1.1s
Recall: 0.7272727272727273
Precision: 0.42105263157894735
F1 score: 0.5333333333333333
Accuracy: 0.7777777777777778

Evaluating with segment length = 1.3s
Recall: 0.8
Precision: 0.5
F1 score: 0.6153846153846154
Accuracy: 0.8113207547169812

Eval

### Now, try it with Gaussian smoothing

In [102]:
from scipy.ndimage import gaussian_filter
import matplotlib.pyplot as plt

def evaluate_gaussian(intervals_predict, intervals_truth, vm, segment_length=1, sigma=.5, visualize=True):
    y_predict_raw = get_action_segments(intervals_predict, vm, segment_length)
    y_truth_raw = get_action_segments(intervals_truth, vm, segment_length)
    
    #smooth
    y_predict = gaussian_filter(y_predict_raw, sigma)
    y_truth = gaussian_filter(y_truth_raw, sigma)
    
    if visualize:
        plt.scatter(range(len(y_predict_raw)), y_predict_raw, label="original")
        plt.scatter(range(len(y_predict)), y_predict, label="smoothed")
        plt.legend()
        plt.title("Predictions")
        plt.show()
    
    #calculate recall
    print("Evaluating with segment length = {}s, sigma = {}s".format(segment_length, sigma))
    relative_score = y_predict.dot(y_truth)
    print ("Relative score (cont. dot product): {}".format(relative_score))
        
    print()
#         print([round(num,3) for num in y_predict])
    
#     print("Evaluating with segment length = {}s and gaussian weighting".format(segment_length))
#     print("Recall: {}".format(recall_score(y_truth, y_predict)))
#     print("Precision: {}".format(precision_score(y_truth, y_predict)))
#     print("F1 score: {}".format(f1_score(y_truth, y_predict)))
#     print()
                              
for i in range(1, 20, 2):
    segment_length = i / 10.0
    evaluate_gaussian(rekall_labels, hand_labels, vm, segment_length, visualize=False)


Evaluating with segment length = 0.1s, sigma = 0.5s
Relative score (cont. dot product): 16.952506113138625

Evaluating with segment length = 0.3s, sigma = 0.5s
Relative score (cont. dot product): 8.605778758347089

Evaluating with segment length = 0.5s, sigma = 0.5s
Relative score (cont. dot product): 8.506300740621437

Evaluating with segment length = 0.7s, sigma = 0.5s
Relative score (cont. dot product): 7.709284480112926

Evaluating with segment length = 0.9s, sigma = 0.5s
Relative score (cont. dot product): 5.594144875584039

Evaluating with segment length = 1.1s, sigma = 0.5s
Relative score (cont. dot product): 7.223811368694495

Evaluating with segment length = 1.3s, sigma = 0.5s
Relative score (cont. dot product): 6.386163555918522

Evaluating with segment length = 1.5s, sigma = 0.5s
Relative score (cont. dot product): 7.218645117420377

Evaluating with segment length = 1.7s, sigma = 0.5s
Relative score (cont. dot product): 7.074620546078005

Evaluating with segment length = 1.9

### Experiment w/ preprocessing / smoothing prediction data to human capabilities.