## Notes

"As I explained before: you have to do detection with py-faster-rcnn and then a tracking using deep sort."

## Use mask-r-cnn to get detections in ROI format

In [3]:
import os
import sys
import random
import math
import numpy as np
import skimage.io
import matplotlib
import matplotlib.pyplot as plt

import coco
import utils
import model as modellib
import visualize

%matplotlib inline 

# Root directory of the project
ROOT_DIR = os.getcwd()

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Download COCO trained weights from Releases if needed
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

# Directory of images to run detection on
IMAGE_DIR = os.path.join(ROOT_DIR, "images")

########################################


class InferenceConfig(coco.CocoConfig):
    # Set batch size to 1 since we'll be running inference on
    # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    NUM_CLASSES = 1

config = InferenceConfig()
config.display()


"""Create Model and Load Trained Weights"""

# Create model object in inference mode.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)

# Load weights trained on MS-COCO
model.load_weights(COCO_MODEL_PATH, by_name=True)

# COCO Class names
# Index of the class in the list is its ID. For example, to get ID of
# the teddy bear class, use: class_names.index('teddy bear')

# class_names = ['person']

class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']


# Load a random image from the images folder
# file_names = next(os.walk(IMAGE_DIR))[2]
# image = skimage.io.imread('os.path.join(IMAGE_DIR, random.choice(file_names))')
image = skimage.io.imread('images/1045023827_4ec3e8ba5c_z.jpg')

# Run detection
results = model.detect([image], verbose=1)

# Visualize results
r = results[0]
print(r['rois'].shape)
print(r['scores'].shape)
print(r['scores'])

visualize.display_instances(image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'])


Configurations:
BACKBONE_SHAPES                [[256 256]
 [128 128]
 [ 64  64]
 [ 32  32]
 [ 16  16]]
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     1
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
GPU_COUNT                      1
IMAGES_PER_GPU                 1
IMAGE_MAX_DIM                  1024
IMAGE_MIN_DIM                  800
IMAGE_PADDING                  True
IMAGE_SHAPE                    [1024 1024    3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
MASK_POOL_SIZE                 14
MASK_SHAPE                     [28, 28]
MAX_GT_INSTANCES               100
MEAN_PIXEL                     [123.7 116.8 103.9]
MINI_MASK_SHAPE                (56, 56)
NAME                           coco
NUM_CLASSES                    1
POOL_SIZE                      7
POST_NMS_ROIS_INFERENCE        1000
POST_NMS_ROIS_TRAINING      

ValueError: Dimension 1 in both shapes must be equal, but are 4 and 324. Shapes are [1024,4] and [1024,324]. for 'Assign_682' (op: 'Assign') with input shapes: [1024,4], [1024,324].

# Main

## Generate Detections

In [None]:
import os
import csv
import skvideo.io
import numpy as np
import coco
import utils
import model as modellib
import visualize

# Mask-R-CNN
MASKRCNN_DIR = 'maskrcnn'
MODEL_DIR = os.path.join("logs")
COCO_MODEL_PATH = os.path.join("mask_rcnn_coco.h5")

VIDEO_DIR = 'videos'
VIDEO_FILE = 'transition.mp4'
video = os.path.join(VIDEO_DIR, VIDEO_FILE)

# Save video frames
FRAMES_DIR = 'frames'

class InferenceConfig(coco.CocoConfig):
    # Set batch size to 1 since we'll be running inference on
    # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

config = InferenceConfig()

# Create model object in inference mode.
model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)

# Load weights trained on MS-COCO
model.load_weights(COCO_MODEL_PATH, by_name=True)

def get_detections_frame(model, image, frame_idx):
    results = model.detect([image], verbose=0)
    rois = results[0]['rois']
    confs = results[0]['scores']
    
    detections = np.zeros([len(rois), 10])
    
    for idx, coord in enumerate(rois):
        conf = confs[idx]
        detections[idx] = to_mot_format(frame_idx, coord, conf)
    
    return detections


def write_to_csv(csv_file, row_titles = None, new_row = None, new_file = 'no'):
    if new_file == 'yes':
        rwa = 'w'
    else:
        rwa = 'a'
    with open(csv_file, rwa) as csvfile:
        writer = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
        if row_titles:
            writer.writerow(row_titles)
        if new_row:
            writer.writerow(new_row)
    csvfile.close()

def to_mot_format(frame_idx, coord, conf):
    """
    Input coordinates: 
    (y1, x1, y2, x2)
    
    Output coordinates: 
    (frame, id, bb_left, bb_top, bb_width, bb_height, -1, -1, -1, -1)
    """
    filler = -1
    bb_left = coord[1]
    bb_top = coord[0]
    bb_width = coord[3] - coord[1]
    bb_height = coord[2] - coord[0]

    # Rearrange coordinates
    coord = np.array([frame_idx + 1, 
                      filler,
                      bb_left,
                      bb_top,
                      bb_width,
                      bb_height,
                      conf,
                      filler,
                      filler,
                      filler])
    
    return coord

def get_detections_video(video):
    """
    Get ROI detections from video using Mask-R-CNN and save in 
    MOTChallenge format
    """    
    videodata = skvideo.io.vread(video)
    num_frames = len(videodata)
    det_file = open('detections.txt', 'ab')
    
    for idx, frame in enumerate(videodata):
        try:
            print("PROCESSING IMAGE {} / {}".format(idx, num_frames))
            detection = get_detections_frame(model, frame, idx)
            np.savetxt(det_file, detection, delimiter=',', fmt='%1.2f')
            print("DONE")
        except:
            print("FRAME {} NOT PROCESSED".format(idx))

        det_file.flush()
    
    det_file.close()
    
    print("FINISHED!")

get_detections_video(video)

## Generate detection features using Deep SORT and append to detections file

In [None]:
import os
import generate_detections
import deep_sort.detection
import numpy as np
import skvideo.io

DETECTION_DIR = 'detections'

VIDEO_DIR = 'videos'
VIDEO_FILE = 'transition.mp4'

CHECKPOINT_PATH = os.path.join('resources', 'networks', 'mars-small128.pb')

encoder = generate_detections.create_box_encoder(CHECKPOINT_PATH)
video = os.path.join(VIDEO_DIR, VIDEO_FILE)
name_out = 'generated_detections'
detection_file = 'detections.txt'

def generate_detections_features(encoder, video, name_out, detection_file):

    """Generate detections with features. Modification with video"""
    detections_in = np.loadtxt(detection_file, delimiter=',')
    detections_out = []

    frame_indices = detections_in[:, 0].astype(np.int)
    min_frame_idx = frame_indices.astype(np.int).min()
    max_frame_idx = frame_indices.astype(np.int).max()

    videodata = skvideo.io.vread(video)
    

    for frame_idx in range(min_frame_idx, max_frame_idx + 1):
        print("Frame %05d/%05d" % (frame_idx, max_frame_idx))
        mask = frame_indices == frame_idx
        rows = detections_in[mask]

        if frame_idx not in frame_indices:
            print("WARNING could not find image for frame %d" % frame_idx)
            continue
            
#         camera.set(cv2.CAP_PROP_POS_FRAMES, frame_idx-1);
#         (grabbed, bgr_image) = camera.read()
#         bgr_image = cv2.imread(image_filenames[frame_idx], cv2.IMREAD_COLOR)  

        bgr_image = videodata[frame_idx]

        features = encoder(bgr_image, rows[:, 2:6].copy())
        detections_out += [np.r_[(row, feature)] for row, feature
                           in zip(rows, features)]

    # output_filename = os.path.join(output_dir, "%s.npy" % sequence)
    np.save(name_out, np.asarray(detections_out), allow_pickle=False)
    
generate_detections_features(encoder, video, name_out, detection_file)

## Run Deep SORT tracking

In [None]:
from matplotlib import pyplot as plt
from deep_sort import nn_matching
from deep_sort.deep_sort_app import create_detections
from deep_sort.application_util import preprocessing
from deep_sort.tracker import Tracker


def tracking(detections_file):
    """ Track objects"""
    detections_file = np.load(detections_file)
    min_frame_idx = int(detections_file[:, 0].min())
    max_frame_idx = int(detections_file[:, 0].max())
    min_confidence = 0.3
    min_detection_height = 0
    nms_max_overlap = 0.3
    
    videodata = skvideo.io.vread(video)
    
    if (display):
        plt.ion()
        fig = plt.figure()

    for frame_idx in range(min_frame_idx, max_frame_idx + 1):
        
        frame = videodata[frame_idx]
        
        print("Processing frame {}".format(frame_idx + 1))

        # Load image and generate detections.
        detections = create_detections(detections_file, frame_idx, min_detection_height)
        detections = [d for d in detections if d.confidence >= min_confidence]

        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # Update tracker.
        max_cosine_distance = 0.05
        nn_budget = 1
        
        metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", max_cosine_distance, nn_budget)
        
        tracker = Tracker(metric)
        tracker.predict()
        tracker.update(detections)

        # Update visualization.
        if display:
            ax1 = fig.add_subplot(111, aspect='equal')
            # fn = 'mot_benchmark/%s/%s/img1/%06d.jpg'%(phase,seq,frame)
            ax1.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            plt.title(' Tracked Targets')

        # Store results.
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlwh()
            results.append([
                frame_idx+1, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3]])

         
            if (display):
                ax1.add_patch(patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, lw=3,ec=colours[track.track_id % 32, :]))
                ax1.set_adjustable('box-forced')
                plt.text(bbox[0], bbox[1], str(track.track_id))

        if(display):
            fig.canvas.flush_events()
            plt.draw()
            ax1.cla()


In [None]:
tracking('generated_detections.npy')

In [13]:
import numpy as np

arr = np.load('generated_detections.npy')

print(arr[:2])

[[ 0.00000000e+00 -1.00000000e+00  5.35000000e+02  1.48000000e+02
   4.60000000e+01  8.70000000e+01 -1.00000000e+00 -1.00000000e+00
  -1.00000000e+00 -1.00000000e+00  6.87755123e-02  5.70516512e-02
  -3.92585294e-03 -9.21779424e-02  7.79442713e-02  2.81216837e-02
  -1.10583819e-01 -8.56326222e-02 -1.28503721e-02 -6.98725358e-02
  -6.12861365e-02 -1.27887800e-01  9.91246477e-03 -5.79381622e-02
   1.49324173e-02  3.30629870e-02 -6.68136925e-02  7.01870397e-02
  -2.13439222e-02  8.82290825e-02  2.19001293e-01  1.97677523e-01
   3.15135322e-03  3.09841838e-02  1.04944520e-01 -6.00195937e-02
  -2.49438472e-02  1.26875460e-01 -4.59383279e-02 -9.58584175e-02
   8.22890624e-02 -2.12550499e-02 -6.88390210e-02 -3.21026482e-02
   1.69488013e-01  4.72694598e-02  5.60176745e-02 -9.21147987e-02
   2.78272867e-01 -7.85716847e-02 -4.80469204e-02 -4.89752255e-02
  -7.50424117e-02  1.77075639e-01  1.62325446e-02  3.47578302e-02
   3.63305137e-02 -1.00752436e-01  7.15530440e-02 -3.28774899e-02
  -1.23983