STREET VIEW HOUSE NUMBERS DETECTION AND RECOGNITION PROJECT, USING FASTER R-CNN IN PYTORCH

In [1]:
import os
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.autograd import Variable
import torchvision
from torchvision import transforms, datasets, models
from PIL import Image
import json
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from tqdm import tqdm

In [2]:
device = torch.device("cuda: 0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
TRAIN_DATA = 0 #Set to 1 to train the model

In [4]:
#Directories
ANNOTATION_DIR = {
    "train": "annotation/train.json",
    "val": "annotation/val.json"
}

DATA_DIR = {
    "train": "data/train",
    "val": "data/val",
    "test": "data/test"
}

In [5]:
#Data normalization and tensorization
def data_transform():
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    return transform

In [6]:
#Plot tensor images
def imshow(img):
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_show = img.cpu().numpy().squeeze().transpose((1,2,0))
    img_show = (img_show * std+mean)
    img_show = np.clip(img_show,0,1)
    return img_show

In [7]:
#Custom dataset
class SVHNDataset(data.Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.transforms = transforms
        self.annotation = annotation
        self.transforms = transforms
        self.ids = os.listdir(root)
        self.classes = [i for i in range(0,11)] #Classes: 0 - 9 digits and none
        with open(self.annotation, 'r') as anno:
            json_file = json.load(anno)
        self.json_file = json_file
    def __getitem__(self, index):
        
        #Open annotation file in json type
        obj_name = self.json_file[index]['filename']
        obj_boxes = self.json_file[index]['boxes']
        img_id = Image.open(os.path.join(self.root,obj_name))
        
        num_obj = len(obj_boxes)
        boxes = []
        labels = []
        
        #Read in bounding boxes cordinate and labels
        for i in range(num_obj):
            xmin =  obj_boxes[i]['left']
            ymin =  obj_boxes[i]['top']
            xmax =  xmin + obj_boxes[i]['width']
            ymax =  ymin + obj_boxes[i]['height']
            label = obj_boxes[i]['label']
            boxes.append([xmin,ymin,xmax,ymax])
            labels.append(label)
        
        #Tensorize 
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        labels = torch.LongTensor(labels)
        index = torch.Tensor([index])
        iscrowd = torch.zeros((num_obj,), dtype=torch.int64)
        
        #Create dict of annotation for easy access
        my_annotation = {}
        my_annotation["boxes"] = boxes
        my_annotation["labels"] = labels
        my_annotation["image_id"] = index
        my_annotation["iscrowd"] = iscrowd
        my_annotation["area"] = area
        
        if self.transforms is not None:
            img = self.transforms()(img_id)
        
        return img, my_annotation
    
    def __len__(self):
        return len(self.ids)
    

In [8]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [9]:
#Create dataset and dataloader
if TRAIN_DATA:
    dataset = SVHNDataset(DATA_DIR["train"],ANNOTATION_DIR["train"],transforms=data_transform)
    dataloader = data.DataLoader(dataset,batch_size=1, shuffle=True, collate_fn=collate_fn, num_workers = 0)

In [10]:
if TRAIN_DATA:
    CLASSES = dataset.classes
    NUM_CLASSES = len(CLASSES)
    TRAINING_SAMPLES =len(dataset)
    print(f"Number of training samples: {TRAINING_SAMPLES}")
    print(f"Number of classes: {NUM_CLASSES}")
    print(f"Classes: {CLASSES}") #label '10' is for digit '0' and '0' is for none digit

In [11]:
#Visualize 10 training samples
def visualize_samples():
    fig = plt.figure(figsize=(15,10))
    plt.axis("off")
    plt.title("Training Samples")
    for id,(img,anno) in enumerate(dataloader): 
        if id==9:
            break
        ax = fig.add_subplot(3,3,id+1)
        for label, box in zip(anno[0]["labels"], anno[0]["boxes"]):
            rect = patches.Rectangle((box[0],box[1]),box[2]-box[0],box[3]-box[1],fill=False,ec="red",lw=3)
            plt.text(box[0],box[1]-3,int(label.numpy()),color="red",fontsize=15, fontweight='bold')
            ax.add_patch(rect)
        img = imshow(img[0])
        plt.imshow(img)
    plt.show()
#visualize_samples()

In [12]:
#Create Faster R-CNN model
def instance_segmentation_model(num_classes):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    
    #Edit the last layer 
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

model = instance_segmentation_model(11)
model = model.to(device)
print(model)

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

Faster R-CNN model in Pytorch receives input as a list of image and a list of annotation dicts, then outputs losses when in training mode or the target annotation prediction when in evaluate mode. More detail on official Pytorch website.

In [13]:
#Training process
def train(model,dataloader):
    EPOCHS = 1
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_list = []
    print("Start training...")
    for epoch in range(EPOCHS):
        model.train()
        for id, (imgs,anno) in enumerate(dataloader):
            optimizer.zero_grad()
            
            #Put images and annotations in list type
            imgs = list(image.to(device) for image in imgs)
            targets = [{k: v.to(device) for k, v in t.items()} for t in anno]
            
            #Model outputs are losses when in training mode
            loss_dict = model(imgs, targets)
            losses = sum(loss for loss in loss_dict.values())

            
            losses.backward()
            optimizer.step()
            loss_list.append(losses)
            print(f'Iteration: [{id}/{len(dataloader)}], Loss: {losses}')
            
            #Save model at every k iterations
            if id %3000 == 0:
                torch.save(model.state_dict(), "model/model.pth")
        torch.save(model.state_dict(), "model/model.pth")
    print("Training Completed!")
    
    return model, loss_list

In evaluating mode, the model gives out predicted annotation (bounding boxes, labels, scores)

In [14]:
def predict(model, img):
    #Set model into evaluate mode
    model.eval()
    
    #Note: PIL image dimension is h,w
    h,w = img.size
    img = data_transform()(img)
    
    #Tensor view is w,h
    imgs = Variable(img).view(-1,3,w,h).to(device)
    
    #Predict
    with torch.no_grad():
        outputs = model(imgs)
    if torch.cuda.is_available():
        del imgs
        torch.cuda.empty_cache() 
    return outputs

In [15]:

if TRAIN_DATA:
    model, loss_list = train(model,dataloader)
else:
    #if not train, load trained model
    model.load_state_dict(torch.load("model/model.pth", map_location=device))

In [18]:
#Testing model
def visualize_test(NUM_PIC=4):
    thresh = 0.87 #Threshhold to ensure that is a digit
    fig = plt.figure(figsize=(15,15))
    a = np.random.randint(13000)
    path = os.listdir(DATA_DIR["test"])
    for id,i in enumerate(path[a:a+NUM_PIC]):

        fig.add_subplot(NUM_PIC/2,2,id+1)
        
        #Open image
        img = Image.open(os.path.join(DATA_DIR["test"],i))
        
        #Predict each image at a time
        prediction = predict(model,img)
        
        #Get top scores that pass the threshold
        scores = prediction[0]["scores"]
        scores = scores[scores >= thresh]
        num_objs = len(scores)
        
        #Get predicted bbox and label
        bboxes = prediction[0]["boxes"][:num_objs]
        labels = prediction[0]["labels"][:num_objs]
        plt.imshow(img)
        ax = plt.gca()
        plt.axis("off")
        plt.title(i)
        for box, label,score in zip(bboxes,labels,scores):
            x,y,x2,y2 = box
            label = str(label.cpu().numpy()) #+": " + "{:.3f}".format(score.cpu().numpy())
            
            #Draw bboxes
            rect = patches.Rectangle((x,y),x2-x,y2-y,color="red",lw=3,fill=False)
            #Print label
            plt.text(x,y-2,label,color="red",fontsize=10, fontweight='bold')
            ax.add_patch(rect)
            
            #These "del"s for flush cuda memory, prevent it from overloading
            if torch.cuda.is_available():
                del x,y,x2,y2,box,label,score
                torch.cuda.empty_cache()
        if torch.cuda.is_available():
            del prediction,bboxes, labels,scores
            torch.cuda.empty_cache() 
    plt.show()

In [19]:
visualize_test(2)

['1.jpg', '2.jpg']


<Figure size 1080x1080 with 0 Axes>