In [1]:
from keras import backend as K
from keras.models import load_model
from keras.preprocessing import image
from keras.optimizers import Adam
from imageio import imread
import numpy as np
from matplotlib import pyplot as plt

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast

from keras_layers.keras_layer_L2Normalization import L2Normalization

from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast


from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

%matplotlib inline
img_height = 300
img_width = 300
# 1: Build the Keras model

K.clear_session() # Clear previous models from memory.

model = ssd_300(image_size=(img_height, img_width, 3),
                n_classes=20,
                mode='inference',
                l2_regularization=0.0005,
                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05], # The scales for MS COCO are [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]],
                two_boxes_for_ar1=True,
                steps=[8, 16, 32, 64, 100, 300],
                offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                clip_boxes=False,
                variances=[0.1, 0.1, 0.2, 0.2],
                normalize_coords=True,
                subtract_mean=[123, 117, 104],
                swap_channels=[2, 1, 0],
                confidence_thresh=0.5,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400)

# 2: Load the trained weights into the model.

# TODO: Set the path of the trained weights.
weights_path = 'VGG_VOC0712Plus_SSD_300x300_ft_iter_160000.h5'

model.load_weights(weights_path, by_name=True)

# 3: Compile the model so that Keras won't complain the next time you load it.

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)
colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()
classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [31]:
# import the necessary packages
from imutils.video import VideoStream
import argparse
import datetime
import imutils
import time
import cv2
import time
import json
import glob

# initialize the first frame in the video stream
firstFrame = None
frameCount =0
inferencesCount=0
inferencesCountFinal=0
framesCountFinal=0
frame_width = int(vs.get(3))
frame_height = int(vs.get(4))
print(frame_width)
print(frame_height)
out = cv2.VideoWriter('full_baseline_img.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 30, (frame_width,frame_height))
started = False
# your code
no_tracking_res= []
frames=glob.glob('../modd/*.jpg')
for path in frames:
    # grab the current frame and initialize the occupied/unoccupied
    # text
    
    
 
    # if the frame could not be grabbed, then we have reached the end
    # of the video
    
    frameCount+=1
    if(frameCount<0):
        continue
    elif started==False:
        start_time = time.time()
        started=True
    # resize the frame, convert it to grayscale, and blur it
    
    img = image.load_img(path, target_size=(img_height, img_width))
    img = image.img_to_array(img)
    input_images = []
    
    input_images.append(img)
    input_images = np.array(input_images)
    y_pred = model.predict(input_images)
    
    confidence_threshold = 0.4

    y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]
    # loop over the contours

    for box in y_pred_thresh[0]:
    # Transform the predicted bounding boxes for the 512x512 image to the original image dimensions.
        if(box[0]!=4):
            continue
        xmin = int(box[-4] * frame.shape[1] / img_width)
        ymin = int(box[-3] * frame.shape[0] / img_height)
        xmax =int(box[-2] * frame.shape[1] / img_width)
        ymax =int(box[-1] * frame.shape[0] / img_height)
        cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax),int(ymax)), (0, 255, 0), 2)
        cv2.putText(frame,'{}: {:.2f}'.format(classes[int(box[0])], box[1]), (xmin, ymin),cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
        
        no_tracking_res.append({"image_id" : frameCount, "category_id" : 1, "bbox" : [float(xmin),float(ymin),float(xmax-xmin),float(ymax-ymin)], "score" : int(box[1])})
   
    out.write(frame)
# cleanup the camera and close any open windows
elapsed_time = time.time() - start_time
print(elapsed_time)

cv2.destroyAllWindows()
out.release()
with open('full_baseline_img.json', 'w') as outfile:  
    json.dump(no_tracking_res, outfile)

640
464
643.2742085456848


In [9]:
# import the necessary packages
from imutils.video import VideoStream
import glob
import argparse
import datetime
import imutils
import time
import cv2
import time
import json
#vs = cv2.VideoCapture('../modd.avi')
# initialize the first frame in the video stream
firstFrame = None
frameCount =0
inferencesCount=0
inferencesCountFinal=0
framesCountFinal=0
frame_width = 1280
frame_height = 960
print(frame_width)
print(frame_height)
started = False
# your code
no_tracking_res= []
frames=glob.glob('../v/*.jpg')
f = open("v.txt","a")

for filePath in frames:
    print(filePath)
    frameCount+=1
    if(frameCount<501):
        continue
    if(frameCount>600):
        break
    print(frameCount)
    img = image.load_img(filePath, target_size=(img_height, img_width))
    
    img = image.img_to_array(img)
    input_images = []
    
    input_images.append(img)
    input_images = np.array(input_images)
    
    y_pred = model.predict(input_images)
    
    confidence_threshold = 0.3

    y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]
    # loop over the contours

    for box in y_pred_thresh[0]:
    # Transform the predicted bounding boxes for the 512x512 image to the original image dimensions.
        
        xmin = int(box[-4] * frame_width / img_width)
        ymin = int(box[-3] * frame_height / img_height)
        xmax =int(box[-2] * frame_width / img_width)
        ymax =int(box[-1] * frame_height / img_height)
        
        f.write("v/%s.jpg,%s,%d,%f,%f,%f,%f,%f\n"%(str(frameCount).zfill(5),classes[int(box[0])],box[0],box[1],xmin,ymin,xmax,ymax))
f.close()

1280
960
../v\00001.jpg
../v\00002.jpg
../v\00003.jpg
../v\00004.jpg
../v\00005.jpg
../v\00006.jpg
../v\00007.jpg
../v\00008.jpg
../v\00009.jpg
../v\00010.jpg
../v\00011.jpg
../v\00012.jpg
../v\00013.jpg
../v\00014.jpg
../v\00015.jpg
../v\00016.jpg
../v\00017.jpg
../v\00018.jpg
../v\00019.jpg
../v\00020.jpg
../v\00021.jpg
../v\00022.jpg
../v\00023.jpg
../v\00024.jpg
../v\00025.jpg
../v\00026.jpg
../v\00027.jpg
../v\00028.jpg
../v\00029.jpg
../v\00030.jpg
../v\00031.jpg
../v\00032.jpg
../v\00033.jpg
../v\00034.jpg
../v\00035.jpg
../v\00036.jpg
../v\00037.jpg
../v\00038.jpg
../v\00039.jpg
../v\00040.jpg
../v\00041.jpg
../v\00042.jpg
../v\00043.jpg
../v\00044.jpg
../v\00045.jpg
../v\00046.jpg
../v\00047.jpg
../v\00048.jpg
../v\00049.jpg
../v\00050.jpg
../v\00051.jpg
../v\00052.jpg
../v\00053.jpg
../v\00054.jpg
../v\00055.jpg
../v\00056.jpg
../v\00057.jpg
../v\00058.jpg
../v\00059.jpg
../v\00060.jpg
../v\00061.jpg
../v\00062.jpg
../v\00063.jpg
../v\00064.jpg
../v\00065.jpg
../v\00066.jpg
.

../v\00537.jpg
537
../v\00538.jpg
538
../v\00539.jpg
539
../v\00540.jpg
540
../v\00541.jpg
541
../v\00542.jpg
542
../v\00543.jpg
543
../v\00544.jpg
544
../v\00545.jpg
545
../v\00546.jpg
546
../v\00547.jpg
547
../v\00548.jpg
548
../v\00549.jpg
549
../v\00550.jpg
550
../v\00551.jpg
551
../v\00552.jpg
552
../v\00553.jpg
553
../v\00554.jpg
554
../v\00555.jpg
555
../v\00556.jpg
556
../v\00557.jpg
557
../v\00558.jpg
558
../v\00559.jpg
559
../v\00560.jpg
560
../v\00561.jpg
561
../v\00562.jpg
562
../v\00563.jpg
563
../v\00564.jpg
564
../v\00565.jpg
565
../v\00566.jpg
566
../v\00567.jpg
567
../v\00568.jpg
568
../v\00569.jpg
569
../v\00570.jpg
570
../v\00571.jpg
571
../v\00572.jpg
572
../v\00573.jpg
573
../v\00574.jpg
574
../v\00575.jpg
575
../v\00576.jpg
576
../v\00577.jpg
577
../v\00578.jpg
578
../v\00579.jpg
579
../v\00580.jpg
580
../v\00581.jpg
581
../v\00582.jpg
582
../v\00583.jpg
583
../v\00584.jpg
584
../v\00585.jpg
585
../v\00586.jpg
586
../v\00587.jpg
587
../v\00588.jpg
588
../v\00589.j

In [30]:
import json
f = open("ssd_300_baseline_modd_vid.txt","r")
tracking_res = []
line = f.readline()
while(line):
    comps = line.split(',')
    if(comps[2]=='4'):
        xmin = float(comps[4])
        ymin = float(comps[5])
        xmax = float(comps[6])
        ymax = float(comps[7])
        tracking_res.append({"image_id" : int(comps[0][5:10]), "category_id" : 1, "bbox" : [xmin,ymin,(xmax-xmin),(ymax-ymin)], "score" : float(comps[3])})
    line = f.readline()
with open('ssd300_baseline_modd_vid.json', 'w') as outfile:  
    json.dump(tracking_res, outfile)
f.close()

In [29]:
# import the necessary packages
from imutils.video import VideoStream
import glob
import argparse
import datetime
import imutils
import time
import cv2
import time
import json
#vs = cv2.VideoCapture('../modd.avi')
# initialize the first frame in the video stream
firstFrame = None
frameCount =0
vs = cv2.VideoCapture('../modd.avi')
inferencesCount=0
inferencesCountFinal=0
framesCountFinal=0
frame_width = int(vs.get(3))
frame_height = int(vs.get(4))
print(frame_width)
print(frame_height)
started = False
# your code
no_tracking_res= []

f = open("ssd_300_baseline_modd_vid.txt","a")

while frameCount<641:
    
    frameCount+=1
  
    
    frame = vs.read()[1]
    
    cv2.imwrite('temp.jpg',frame)
    img = image.load_img('temp.jpg', target_size=(img_height, img_width))
    
    img = image.img_to_array(img)
    input_images = []
    
    input_images.append(img)
    input_images = np.array(input_images)
    
    y_pred = model.predict(input_images)
    
    confidence_threshold = 0.1

    y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]
    # loop over the contours

    for box in y_pred_thresh[0]:
    # Transform the predicted bounding boxes for the 512x512 image to the original image dimensions.
        
        xmin = int(box[-4] * frame_width / img_width)
        ymin = int(box[-3] * frame_height / img_height)
        xmax =int(box[-2] * frame_width / img_width)
        ymax =int(box[-1] * frame_height / img_height)
        
        f.write("modd/%s.jpg,%s,%d,%f,%f,%f,%f,%f\n"%(str(frameCount).zfill(5),classes[int(box[0])],box[0],box[1],xmin,ymin,xmax,ymax))
f.close()

640
464
