In [6]:
import argparse
import os
import shutil

import cv2
import numpy
import numpy as np
import torch
import torchvision
from google.colab.patches import cv2_imshow
from PIL import Image
from torchvision import transforms

In [7]:
# Class names indexed by predicted label id: predict() translates a label `i`
# into `coco_names[i]`, so the ORDER of this list is load-bearing.
# 'N/A' entries are placeholders that keep later names at their positions
# (presumably ids the pretrained detector never emits -- TODO confirm).
# 'bird' sits at index 16, which is the label id crop() filters on.
coco_names = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [8]:
# One colour per class, one row per entry of coco_names (3 channel values in
# [0, 255)). A dedicated, fixed-seed generator is used so that every class
# keeps the same colour across kernel restarts and the global NumPy RNG state
# is left untouched.
COLOR_SEED = 42
COLORS = np.random.RandomState(COLOR_SEED).uniform(0, 255, size=(len(coco_names), 3))

# torchvision preprocessing applied to every image before it is fed to the
# detector: conversion to a tensor only.
transform = transforms.Compose([
    transforms.ToTensor(),
])

def predict(image, model, device, detection_threshold):
    """Run the detector on a single image and collect its predictions.

    Args:
        image: input accepted by the module-level ``transform`` (the notebook
            passes a PIL image).
        model: detection model; called with a one-image batch and expected to
            return a list whose first element is a dict with ``'boxes'``,
            ``'labels'`` and ``'scores'``.
        device: torch device the input tensor is moved to.
        detection_threshold: minimum score a box needs to be returned.

    Returns:
        tuple ``(boxes, pred_classes, labels)`` where
        * ``boxes`` is an int32 array holding only the boxes whose score is
          ``>= detection_threshold``;
        * ``pred_classes`` is the list of class names for ALL predictions
          (not filtered by the threshold);
        * ``labels`` is the raw label tensor for ALL predictions.

    NOTE(review): ``boxes`` is filtered while ``pred_classes``/``labels`` are
    not, so ``pred_classes[i]`` only matches ``boxes[i]`` if the model returns
    detections sorted by descending score -- confirm when swapping models.
    """
    # transform the image to a tensor and add a batch dimension
    batch = transform(image).to(device).unsqueeze(0)

    # Pure inference: disabling autograd avoids building (and holding on to)
    # a computation graph for every processed image.
    with torch.no_grad():
        outputs = model(batch)
    detections = outputs[0]

    # names of all the predicted classes
    pred_classes = [coco_names[i] for i in detections['labels'].cpu().numpy()]
    # scores and bounding boxes of all the predicted objects
    pred_scores = detections['scores'].detach().cpu().numpy()
    pred_bboxes = detections['boxes'].detach().cpu().numpy()
    # keep only the boxes that reach the threshold score
    boxes = pred_bboxes[pred_scores >= detection_threshold].astype(np.int32)
    return boxes, pred_classes, detections['labels']

def draw_boxes(boxes, classes, labels, image):
    """Draw labelled bounding boxes on a copy of ``image``.

    Args:
        boxes: iterable of ``(x1, y1, x2, y2)`` boxes.
        classes: sequence of class names; ``classes[i]`` labels ``boxes[i]``.
        labels: sequence of label ids; ``labels[i]`` selects the row of the
            module-level ``COLORS`` used for ``boxes[i]``.
        image: image convertible with ``np.asarray`` (the notebook passes an
            RGB PIL image).

    Returns:
        numpy array with swapped red/blue channels (BGR when the input is
        RGB) containing the drawn rectangles and class names.
    """
    # The input comes from PIL (RGB) while the OpenCV display/save helpers
    # work with BGR, so swap the channel order before drawing.
    image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.8
    thickness = 2

    for i, box in enumerate(boxes):
        # plain tuple of floats for the OpenCV colour argument
        color = tuple(float(channel) for channel in COLORS[labels[i]])
        x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
        cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)

        # The label normally sits 5 px above the box. For boxes touching the
        # top edge that would place the text outside the image, so clamp the
        # baseline to keep the whole glyph height visible.
        (_, text_height), _ = cv2.getTextSize(classes[i], font, font_scale, thickness)
        text_y = max(int(box[1] - 5), text_height + 2)
        cv2.putText(image, classes[i], (x1, text_y),
                    font, font_scale, color, thickness,
                    lineType=cv2.LINE_AA)
    return image

In [9]:
# construct the argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', help='path to input image/video')
# type=int so a value given as text is not handed to the model as a string
parser.add_argument('-m', '--min-size', dest='min_size', default=800, type=int,
                    help='minimum input size for the FasterRCNN network')

# A notebook kernel has no usable command line, so the arguments are passed to
# the parser explicitly. The result is the same dict as before:
# {'input': <path>, 'min_size': 800}.
args = vars(parser.parse_args([
    '--input',
    '/content/recvis20_a3/bird_dataset/test_images/mistery_category/002f61512a368e4c1434eedacf609957.jpg',
]))

# download or load the model from disk
# NOTE(review): recent torchvision releases prefer a `weights=` argument over
# `pretrained=True` -- confirm against the installed version before changing.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True,
                                                    min_size=args['min_size'])

In [None]:
from google.colab.patches import cv2_imshow


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Force 3-channel RGB: grayscale / RGBA / palette files would otherwise reach
# the detector and the colour conversion in draw_boxes with the wrong number
# of channels.
image = Image.open(args['input']).convert('RGB')
model.eval().to(device)
boxes, classes, labels = predict(image, model, device, 0.8)
image = draw_boxes(boxes, classes, labels, image)
cv2_imshow(image)

save_name = f"{args['input'].split('/')[-1].split('.')[0]}_{args['min_size']}"
# cv2.imwrite does not create missing directories and only reports failure
# through its return value, so create the folder and check the result.
os.makedirs("outputs", exist_ok=True)
if not cv2.imwrite(f"outputs/{save_name}.jpg", image):
    raise IOError(f"could not write outputs/{save_name}.jpg")
# No cv2.waitKey here: cv2_imshow renders inline in the notebook, there is no
# OpenCV window to wait on.

In [None]:
def crop(img_path,new_path,threshold=0.8):
    """Crop an image to its best bird detection and save the result as JPEG.

    The image is run through the module-level ``model`` (on ``device``, using
    ``transform``). Among the detections labelled 16 ('bird' in
    ``coco_names``) the highest scoring box is used for the crop. If there is
    no bird detection, the highest scoring box of any class is used instead.
    If the model returns no detection at all, or the selected box is empty
    once converted to whole pixels, the image is saved uncropped.

    Args:
        img_path: path of the image to read.
        new_path: path the (cropped) JPEG is written to.
        threshold: currently NOT applied -- every bird detection is
            considered regardless of its score. Kept so existing callers
            keep working. NOTE(review): decide whether low-score birds should
            count as "no match".

    Returns:
        bool: True when no bird was detected in the image, False otherwise.
    """
    bird_label = 16  # position of 'bird' in coco_names

    # Decode inside a context manager so the file handle is released, and
    # force RGB: the detector expects 3 channels and JPEG cannot store alpha.
    with Image.open(img_path) as source:
        original = source.convert('RGB')

    batch = transform(original).to(device).unsqueeze(0)  # add a batch dimension
    # Inference only: do not build an autograd graph for every image.
    with torch.no_grad():
        outputs = model(batch)  # get the predictions on the image

    pred_scores = outputs[0]['scores'].detach().cpu().numpy()
    pred_boxes = outputs[0]['boxes'].detach().cpu().numpy()
    labels = outputs[0]['labels'].detach().cpu().numpy()

    bird_mask = labels == bird_label
    no_match = not bird_mask.any()

    box = None
    if not no_match:
        # best scoring box among the bird detections (truncated to pixels)
        bird_boxes = pred_boxes[bird_mask].astype(np.int32)
        box = bird_boxes[np.argmax(pred_scores[bird_mask])]
    else:
        print("Found no matches in",img_path)
        # np.argmax raises on an empty array, so only fall back to the best
        # box of any class when the model produced at least one detection.
        if pred_scores.size != 0:
            box = pred_boxes[np.argmax(pred_scores)]

    # Crop the original picture directly instead of converting the input
    # tensor back to an image.
    cropped = original
    if box is not None:
        left, top, right, bottom = (int(round(v)) for v in box.tolist())
        # an empty crop cannot be encoded; keep the full image in that case
        if right > left and bottom > top:
            cropped = original.crop((left, top, right, bottom))
    cropped.save(new_path , "JPEG")

    return no_match


In [None]:
import os


# Default locations used by crop_dataset(): `old_path` is the split to read
# (it is listed as one sub-directory per class), `new_path` is the root that
# receives the cropped copies. Both are captured as default argument values
# when crop_dataset is defined.
old_path = '/content/recvis20_a3/bird_dataset/val_images'
new_path = '/content/bird_dataset_c/val_cropped'

def crop_dataset(old_path=old_path,new_path=new_path):
    """Crop every image of a class-per-folder dataset into a mirrored tree.

    For each sub-directory ``old_path/<class>`` a directory
    ``new_path/<class>`` is created and every file inside is passed to
    ``crop(old_img_path, new_img_path)``. Any existing ``new_path`` tree is
    deleted first. The number of images for which ``crop`` reported no bird
    is printed at the end.

    Args:
        old_path: root of the source dataset (one sub-directory per class).
        new_path: root of the destination dataset; removed and recreated.
    """
    # Start from an empty destination. shutil replaces the `!rm -r` shell
    # escape: no unquoted path interpolation, no error when the folder does
    # not exist yet, and the function stays valid plain Python.
    shutil.rmtree(new_path, ignore_errors=True)
    # makedirs also creates missing parent directories
    os.makedirs(new_path)

    no_match_cnt = 0
    # sorted() only makes the processing/log order deterministic
    for class_ in sorted(os.listdir(old_path)):
        old_class_path = os.path.join(old_path, class_)
        # skip stray files sitting next to the class folders
        if not os.path.isdir(old_class_path):
            continue

        print("Cropping class",class_)

        new_class_path = os.path.join(new_path, class_)
        os.mkdir(new_class_path)

        for img in sorted(os.listdir(old_class_path)):
            old_img_path = os.path.join(old_class_path, img)
            new_img_path = os.path.join(new_class_path, img)
            # crop() returns True when no bird was found; True counts as 1
            no_match_cnt += crop(old_img_path, new_img_path)

    print(f'Found {no_match_cnt} images with no birds')


# Crop the split using the default old_path / new_path defined in this cell.
crop_dataset()
