# Fast R-CNN

To make R-CNN faster, Girshick (2015) improved the training procedure by unifying three independent models into one jointly trained framework and by increasing the amount of shared computation; the resulting model is named Fast R-CNN. Instead of extracting CNN feature vectors independently for each region proposal, this model runs a single CNN forward pass over the entire image, and all region proposals share the resulting feature map. The same feature map is then branched out to learn both the object classifier and the bounding-box regressor. In short, sharing computation is what makes Fast R-CNN faster than R-CNN.

In [1]:
import os 
import glob
import pandas as pd
from collections import Counter 
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split


import torch

import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

from torchsummary import summary


from torchvision import models
from torchvision import transforms as T

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as patches

from dataload import xml_to_csv,PetData,Sub_region_train,Sub_region

from tqdm import tqdm
from utills import ssearch,misc
from utills.misc import create_label,balance_df
from metrics import iou

import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

In [2]:
## Loading data
# Alternative dataset root: "D:/Dataset/Pet_Data/"
root_path = "D:/Dataset/Oxford"
seed = 0  # shared random state for the per-class sampling and the train/valid split

img_path = os.path.join(root_path, "images")
annotation_path = os.path.join(root_path, "annotations/xmls")
annots = glob.glob(annotation_path + "/*.xml")
df = xml_to_csv(annots, img_path)
df.head()

## Make Balanced Dataset (to save time, but don't do this in real research!!)
g = df.groupby('target')


def _take_smallest_class_size(group):
    # Down-sample one class to the size of the rarest class.
    return group.sample(g.size().min(), random_state=seed).reset_index(drop=True)


balanced_df = pd.DataFrame(g.apply(_take_smallest_class_size))

## Hold out 30% of the balanced rows for validation.
train, valid = train_test_split(balanced_df, test_size=0.3, random_state=seed)

BATCH_SIZE = 1




## Updated Dataset Class (Added Selective Search "inside" original class)

In [3]:
class PetData(Dataset):
    """Pet detection dataset that can generate selective-search ROIs per image.

    Every dataframe row is unpacked as
    ``(filename, target, xmin, ymin, xmax, ymax)``. The image is resized to
    224x224 together with its ground-truth box, and the class label is shifted
    by one because label 0 is reserved for background.

    Args:
        dataframe: table with one annotated box per row, columns in the order
            listed above.
        train: when True (together with ``ssearch``), proposals are labelled
            against the ground truth and a fixed number of ROIs is sampled.
        ssearch: run selective search on the resized image to obtain ROIs.
        samples: number of foreground ROIs drawn per image (with replacement).

    ``__getitem__`` returns ``(image, gt_boxes, rois, labels, bbox_idx)``.
    ``labels`` and ``bbox_idx`` are ``None`` unless both ``train`` and
    ``ssearch`` are enabled; ``rois`` is an empty list when ``ssearch`` is off.
    """

    ROIS_PER_IMAGE = 64  # total number of ROIs sampled per training image
    ROI_PAD = 16         # pixels added on every side of a proposal before clamping

    def __init__(self, dataframe,train=False,ssearch=False,samples=16):
        self.df=dataframe
        self.ssearch=ssearch
        self.transform=iaa.Sequential([iaa.Resize((224,224))])
        self.torch_transform=T.Compose([T.ToTensor(),
                                        T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
        self.samples=samples
        self.train=train

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _unique_rects(regions):
        """Stack the 'rect' entry of every distinct region dict into a float64 array."""
        # dict.fromkeys de-duplicates while keeping the proposal order, so the
        # row order no longer depends on per-process string hashing.
        unique=dict.fromkeys(tuple(d.items()) for d in regions)
        return np.stack([np.array(dict(items)['rect'],dtype=np.float64) for items in unique])

    @staticmethod
    def _sample_indices(roi_labels, n_pos, n_total):
        """Draw ``n_total`` row indices: up to ``n_pos`` foreground, the rest background.

        Sampling is with replacement. When one of the two groups is empty, all
        indices come from the other one so the result always has ``n_total``
        entries.

        Raises:
            ValueError: if there are no rows to sample from at all.
        """
        pos=np.flatnonzero(roi_labels != 0).tolist()
        neg=np.flatnonzero(roi_labels == 0).tolist()
        if not pos and not neg:
            raise ValueError("No region proposals left to sample from")
        if not pos:
            n_pos=0
        elif not neg:
            n_pos=n_total
        else:
            n_pos=max(0,min(n_pos,n_total))
        n_neg=n_total-n_pos
        pos_idx=random.choices(pos,k=n_pos) if n_pos else []
        neg_idx=random.choices(neg,k=n_neg) if n_neg else []
        return pos_idx+neg_idx

    @staticmethod
    def _pad_and_clamp(boxes, img_shape, pad=16):
        """Grow the first four box columns by ``pad`` pixels and clamp them to the image."""
        height,width=img_shape[0],img_shape[1]
        return torch.stack([torch.clamp(boxes[:,0]-pad,0,width),
                            torch.clamp(boxes[:,1]-pad,0,height),
                            torch.clamp(boxes[:,2]+pad,0,width),
                            torch.clamp(boxes[:,3]+pad,0,height)],dim=1)

    def __getitem__(self, idx):
        fn,target,xmin,ymin,xmax,ymax=self.df.iloc[idx]
        im=cv2.imread(fn)
        if im is None:
            # cv2.imread signals failure with None instead of raising.
            raise FileNotFoundError(f"Could not read image: {fn}")
        im=cv2.cvtColor(im,cv2.COLOR_BGR2RGB)

        class_label=target+1  # class 0 represents background
        bbs=BoundingBoxesOnImage([BoundingBox(xmin,ymin,xmax,ymax,label=class_label)], shape=im.shape)
        image_aug, bbs_aug = self.transform(image=im, bounding_boxes=bbs)
        bbs_aug=torch.stack([torch.tensor([bb.x1,bb.y1,bb.x2,bb.y2,bb.label]) for bb in bbs_aug])

        region_np=[]
        labels=None
        bbox_idx=None
        img_shape=image_aug.shape
        if self.ssearch:
            regions=ssearch.selective_search(image_aug, scale=50, sigma=0.8, min_size=20)

            # NOTE(review): the training branch filters on the last 'rect' column
            # while the inference branch filters on column 5; the column layout is
            # defined by ssearch/create_label (not shown here) -- confirm both
            # refer to the same quantity.
            if self.train:
                regions=create_label(regions,bbs_aug,iou_threshold=0.5)
                region_np=self._unique_rects(regions)
                region_np=region_np[region_np[:,-1]>0.1]

                # Column 4 holds the ROI label (0 = background), column 5 the
                # index of the matched ground-truth box.
                keep=self._sample_indices(region_np[:,4],self.samples,self.ROIS_PER_IMAGE)
                region_np=torch.from_numpy(region_np[keep])
                labels=region_np[:,4].long()
                bbox_idx=region_np[:,5].long()
                region_np=self._pad_and_clamp(region_np,img_shape,self.ROI_PAD)
            else:
                region_np=self._unique_rects(regions)
                region_np=region_np[region_np[:,5]>0.1]

                region_np=torch.from_numpy(region_np)
                region_np=torch.cat([self._pad_and_clamp(region_np,img_shape,self.ROI_PAD),
                                     region_np[:,4:5]],dim=1)

        return self.torch_transform(image_aug), bbs_aug,region_np,labels,bbox_idx

In [4]:
BATCH_SIZE = 2

# Both splits are built in training mode, so every item carries sampled,
# labelled ROIs.
train_ds = PetData(dataframe=train, train=True, ssearch=True)
valid_ds = PetData(dataframe=valid, train=True, ssearch=True)
def collate_fn(batch):
    """Transpose a list of per-sample tuples into one tuple per field.

    A tuple is returned rather than the bare ``zip`` iterator, so a batch can
    be unpacked or inspected more than once (e.g. when a notebook cell that
    unpacks a stored batch is re-run).
    """
    return tuple(zip(*batch))
# Training batches are served in dataset index order (shuffling is off).
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)

## <center>    First, pre-train a convolutional neural network on image classification tasks. </center>


In [5]:
# Stand-alone pretrained VGG16, used below for the parameter count.
model = models.vgg16(pretrained=True)
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable
                              
# Report how many trainable parameters the VGG16 instance above has.
print(f'Num of parameters: {count_parameters(model)}')


Num of parameters: 138357544


## Fast RCNN Network Architecture


![ROI](https://i.imgur.com/yYIDM5h.png)

In [6]:
class roi_pooling(nn.Module):
    """ROI max-pooling: crop every ROI from the feature map and pool it to a fixed size.

    Args:
        ft_shape: side length of the feature map, used to scale ROI coordinates.
        img_shape: side length of the input image the ROI coordinates refer to.
        size: ``(height, width)`` of the pooled output for each ROI.
    """
    def __init__(self,ft_shape,img_shape,size):
        super(roi_pooling, self).__init__()
        self.size=size
        self.img_shape=img_shape
        self.ft_shape=ft_shape
        # The output size must be passed as ONE tuple: the second positional
        # argument of AdaptiveMaxPool2d is ``return_indices``, not the width.
        self.adaptivepool=nn.AdaptiveMaxPool2d((size[0], size[1]))

    def forward(self,ft,rois):
        """Pool every ROI of every image.

        Args:
            ft: feature maps; ``ft[i]`` is the ``(C, H, W)`` map of image ``i``.
            rois: per-image ROI tensors; the first four columns of ``rois[i]``
                are read as ``(x1, y1, x2, y2)`` in image coordinates. Every
                image must contribute the same number of ROIs.

        Returns:
            Tensor of shape ``(len(rois), R, C, size[0], size[1])``.
        """
        ft_h,ft_w=ft.shape[-2],ft.shape[-1]
        out=[]
        for i,roi in enumerate(rois):
            ft_img=ft[i].unsqueeze(0)

            # Project to feature-map cells: floor the top-left corner and ceil
            # the bottom-right one so the crop covers the whole ROI. Clamping
            # keeps indices inside the map (negative values would otherwise
            # wrap around when slicing).
            x1=torch.floor((roi[:,0]/self.img_shape)*self.ft_shape).long().clamp(0,ft_w-1)
            y1=torch.floor((roi[:,1]/self.img_shape)*self.ft_shape).long().clamp(0,ft_h-1)
            x2=torch.ceil((roi[:,2]/self.img_shape)*self.ft_shape).long().clamp(max=ft_w)
            y2=torch.ceil((roi[:,3]/self.img_shape)*self.ft_shape).long().clamp(max=ft_h)
            # A crop must span at least one cell, otherwise pooling fails on an
            # empty tensor (e.g. a zero-area ROI lying exactly on a cell border).
            x2=torch.max(x2,x1+1)
            y2=torch.max(y2,y1+1)

            pooled=[self.adaptivepool(ft_img[:,:,top:bottom,left:right])
                    for left,top,right,bottom in zip(x1.tolist(),y1.tolist(),x2.tolist(),y2.tolist())]
            out.append(torch.cat(pooled, dim=0))
        return torch.stack(out, dim=0)
    

In [7]:
class Fast_RCNN(nn.Module):
    """Fast R-CNN head on top of a pretrained VGG16.

    The image goes through the VGG16 convolutional layers once; every ROI is
    pooled to 7x7 from the shared feature map, passed through the VGG16
    fully-connected layers and scored by two linear heads.

    Args:
        ft_shape: side length of the feature map used for ROI scaling. When
            ``None`` it is derived as ``img_shape // 16`` (the VGG16 feature
            stack without its last max-pool halves the resolution four times),
            which gives 14 for the default 224-pixel input.
        img_shape: side length of the (square) input images.
        num_class: number of object classes, excluding background.
    """
    def __init__(self,ft_shape=None,img_shape=224,num_class=2):
        super(Fast_RCNN, self).__init__()

        self.num_class=num_class
        if ft_shape is None:
            ft_shape=img_shape//16

        # Load the pretrained network once and reuse it for both parts.
        vgg=models.vgg16(pretrained=True)
        self.ft_net=vgg.features[0:-1]          # conv stack without the final max-pool
        self.roi=roi_pooling(ft_shape=ft_shape,img_shape=img_shape,size=(7,7))
        self.classifier_net=vgg.classifier[0:-1]  # FC layers without the final Linear
        self.cls_score = nn.Linear(4096, num_class+1)     # +1 for background
        self.bbox = nn.Linear(4096, 4*(num_class+1))      # 4 offsets per class

    def forward(self,imgs,regions):
        """Score every ROI.

        Args:
            imgs: batch of images.
            regions: per-image ROI tensors, passed on to the ROI pooling layer.

        Returns:
            ``(cls_score, bbox)`` with shapes ``(B*R, num_class+1)`` and
            ``(B*R, num_class+1, 4)`` for ``B`` images with ``R`` ROIs each.
        """
        # The pooled ROI features were detached before the classifier, i.e. no
        # gradient ever reached the backbone. Running it under no_grad gives
        # the same result without building the unused autograd graph.
        # NOTE(review): this keeps the convolutional layers frozen -- confirm
        # that fine-tuning them is not intended.
        with torch.no_grad():
            fts=self.ft_net(imgs)
            o_roi=self.roi(fts,regions)

        batch_size=o_roi.shape[0]
        regions_num=o_roi.shape[1]

        o_roi=o_roi.view(batch_size*regions_num,-1)
        ft=self.classifier_net(o_roi)

        cls_score = self.cls_score(ft).view(batch_size*regions_num,-1)
        bbox = self.bbox(ft).view(batch_size*regions_num, self.num_class+1, 4)
        return cls_score,bbox
    

In [8]:
# Build the detector with its default settings and put it in eval mode for the
# single-batch walk-through below; the bare name displays the module structure.
fastRCNN=Fast_RCNN()
fastRCNN.eval()
fastRCNN

Fast_RCNN(
  (ft_net): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilati

In [9]:
## Single Batch Test

In [10]:
# Pull one collated batch from the training loader for a manual walk-through.
single_batch=next(iter(train_dl))

In [11]:
# Split the batch into its five per-field tuples ...
img, gt_box, rois, label, bbox_idx = single_batch

# ... and stack each tuple of per-image tensors into one batched tensor.
img = torch.stack(list(img))
gt_box = torch.stack(list(gt_box))
rois = torch.stack(list(rois))
rois_label = torch.stack(list(label)).reshape(-1)  # one label per ROI, flattened over the batch
bbox_idx = torch.stack(list(bbox_idx))

In [12]:
#Forward

In [13]:
# Forward pass: per-ROI class scores and per-class box offsets.
clf_score,offsets=fastRCNN(img,rois)

![image.png](https://i.imgur.com/oEktP0k.png)


![image.png](https://i.imgur.com/v9D5lTU.png)

![image.png](https://i.imgur.com/I7kpSxV.png)


In [14]:
from utills.misc import cvtScale

In [15]:
# Classification loss over all ROIs of the batch (label 0 is background).
clf_crit=nn.CrossEntropyLoss()
clf_loss=clf_crit(clf_score,rois_label)
clf_loss

tensor(1.3523, grad_fn=<NllLossBackward>)

In [16]:
# Build the box-regression targets for the single test batch.
# Flatten the stacked per-image ROIs to one row of four values per ROI.
rois=rois.reshape(-1,4)
tgt_box=[]

# lbl: gather index of shape (N, 1, 4) that selects, for each ROI, the four
# predicted offsets belonging to its labelled class.
lbl = rois_label.view(-1, 1, 1).expand(rois_label.size(0), 1, 4)
# mask: (N, 4), 1.0 for ROIs with a non-zero label and 0.0 for background.
mask = (rois_label != 0).float().view(-1, 1).expand(rois_label.size(0), 4)


# For every ROI, look up the ground-truth box it was matched to.
# NOTE(review): cvtScale comes from utills.misc and is not shown here --
# presumably it converts the box representation; confirm.
for img_idx,bbox_idx_image in enumerate(bbox_idx):
    tgt_box.append(torch.stack([cvtScale(gt_box[img_idx][bbox_idx])  for bbox_idx in bbox_idx_image]))
tgt_box=torch.cat(tgt_box,dim=0).detach()

# NOTE(review): the ROI rows appear to be corner coordinates (x1, y1, x2, y2)
# and are not passed through cvtScale, so the divisors rois[:,2] / rois[:,3]
# would be x2 / y2 rather than a width / height -- confirm this matches the
# intended offset parameterisation.
t_x=(tgt_box[:,0]-rois[:,0])/rois[:,2]
t_y=(tgt_box[:,1]-rois[:,1])/rois[:,3]
t_w=torch.log(tgt_box[:,2]/rois[:,2])
t_h=torch.log(tgt_box[:,3]/rois[:,3])

tgt_offset=torch.stack([t_x,t_y,t_w,t_h],dim=1)
# Zero the targets of background ROIs so they do not contribute to the loss.
tgt_offset=tgt_offset*mask.detach()

# Keep only the offsets predicted for each ROI's labelled class, masked the same way.
offsets=offsets.gather(1, lbl).squeeze(1)* mask                                                

In [17]:
# Box-regression loss between the masked predicted offsets and the masked targets.
reg_crit=nn.SmoothL1Loss()
reg_loss=reg_crit(offsets,tgt_offset)
reg_loss

tensor(0.0838, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)

In [18]:
# Adam over every parameter of the detector, learning rate 1e-4.
optimizer = torch.optim.Adam(fastRCNN.parameters(), lr=1e-4)

In [19]:
from utills.tboard import Writer,delete_files

In [20]:
# Folder that holds the experiment logs; created on first use.
base_folder="experiments"
try:
    os.mkdir(base_folder)
except FileExistsError:
    # Only "already there" is expected; permission or path errors must surface
    # instead of being reported as "Exists".
    print("Exists")

Exists


In [21]:
# Create the TensorBoard summary writer for the "FastRCNN" run under "experiments".

writer=Writer("FastRCNN","experiments")

Event are saved under experiments\FastRCNN\5


## Training

In [22]:
epochs=50
# Use the GPU when present; fall back to CPU so the cell still runs without CUDA.
device="cuda" if torch.cuda.is_available() else "cpu"
fastRCNN.to(device)
# NOTE(review): training uses plain L1 for box regression, whereas the
# single-batch check earlier in this notebook used SmoothL1Loss -- confirm
# which one is intended.
reg_crit=nn.L1Loss()
clf_crit=nn.CrossEntropyLoss()

# Make sure the checkpoint folder exists before training starts, so a finished
# epoch is never lost to a failing torch.save.
ckpt_dir="E:/OD_models/FAST_RCNN/"
os.makedirs(ckpt_dir, exist_ok=True)

train_log_idx=0
valid_log_idx=0
for epoch in range(epochs):
    print(f"epochs: {epoch}")

    fastRCNN.train()
    # Running accuracy counters, reset at the start of every epoch.
    correct=0
    total=0
    for batch_idx,(img,gt_box,rois,label,bbox_idx) in enumerate(train_dl):
        optimizer.zero_grad()

        # The collate function yields per-field tuples; stack each into one tensor.
        img=torch.stack(list(img)).to(device)
        gt_box=torch.stack(list(gt_box))
        rois=torch.stack(list(rois))
        rois_label=torch.stack(list(label)).reshape(-1)
        bbox_idx=torch.stack(list(bbox_idx))

        # The model gets a device copy of the ROIs; the targets below are
        # computed from the CPU tensor.
        clf_score,offsets=fastRCNN(img,rois.to(device))

        rois=rois.reshape(-1,4)
        tgt_box=[]

        # lbl selects each ROI's labelled-class offsets; mask zeroes background ROIs.
        lbl = rois_label.view(-1, 1, 1).expand(rois_label.size(0), 1, 4)
        mask = (rois_label != 0).float().view(-1, 1).expand(rois_label.size(0), 4)
        for img_idx,bbox_idx_image in enumerate(bbox_idx):
            tgt_box.append(torch.stack([cvtScale(gt_box[img_idx][box_i]) for box_i in bbox_idx_image]))
        tgt_box=torch.cat(tgt_box,dim=0).detach()
        # NOTE(review): the ROI rows appear to be corner coordinates, so the
        # divisors rois[:,2] / rois[:,3] would be x2 / y2 rather than a
        # width / height -- confirm against cvtScale (defined elsewhere).
        t_x=(tgt_box[:,0]-rois[:,0])/rois[:,2]
        t_y=(tgt_box[:,1]-rois[:,1])/rois[:,3]
        t_w=torch.log(tgt_box[:,2]/rois[:,2])
        t_h=torch.log(tgt_box[:,3]/rois[:,3])
        tgt_offset=torch.stack([t_x,t_y,t_w,t_h],dim=1)
        tgt_offset=tgt_offset*mask.detach()

        tgt_offset=tgt_offset.to(device)
        lbl=lbl.to(device)
        mask=mask.to(device)
        offsets=offsets.gather(1, lbl).squeeze(1)* mask

        rois_label=rois_label.to(device)
        clf_loss=clf_crit(clf_score,rois_label)
        reg_loss=reg_crit(offsets,tgt_offset)

        _, predicted = torch.max(clf_score.data, 1)
        total+=rois_label.size()[0]
        correct += (predicted == rois_label).sum().item()

        total_loss=clf_loss+reg_loss
        total_loss.backward()
        optimizer.step()

        if batch_idx % 50==0:
            print(f'batch_idx: {batch_idx}/{len(train_dl)}')
            print(f'CLFLOSS: {clf_loss.item()}')
            print(f'REGLOSS: {reg_loss.item()}')
            print(f'ACCURACY: {correct/(total)}')
            writer.add_events("clf_loss",clf_loss.item(),train_log_idx)
            writer.add_events("reg_loss",reg_loss.item(),train_log_idx)
            writer.add_events("Accuracy",correct/(total),train_log_idx)
            train_log_idx+=1
    # One checkpoint per epoch.
    torch.save(fastRCNN.state_dict(), ckpt_dir+f"epoch_{epoch}.pth")


    

epochs: 0
batch_idx: 0/832
CLFLOSS: 1.544580340385437
REGLOSS: 0.2258477779233986
ACCURACY: 0.1796875
batch_idx: 50/832
CLFLOSS: 0.3645169734954834
REGLOSS: 0.0866283785217406
ACCURACY: 0.7582720588235294
batch_idx: 100/832
CLFLOSS: 0.29322201013565063
REGLOSS: 0.07342008447256615
ACCURACY: 0.7782332920792079
batch_idx: 150/832
CLFLOSS: 0.28911107778549194
REGLOSS: 0.052382240492322193
ACCURACY: 0.7918046357615894
batch_idx: 200/832
CLFLOSS: 0.2384262979030609
REGLOSS: 0.07218564511153515
ACCURACY: 0.8049207089552238
batch_idx: 250/832
CLFLOSS: 0.39045438170433044
REGLOSS: 0.09859483337727426
ACCURACY: 0.8109437250996016
batch_idx: 300/832
CLFLOSS: 0.47391268610954285
REGLOSS: 0.10210372847762
ACCURACY: 0.8154588870431894
batch_idx: 350/832
CLFLOSS: 1.1319419145584106
REGLOSS: 0.11562253203005955
ACCURACY: 0.8181089743589743
batch_idx: 400/832
CLFLOSS: 0.2033587247133255
REGLOSS: 0.02430272985225502
ACCURACY: 0.824501246882793
batch_idx: 450/832
CLFLOSS: 0.511923611164093
REGLOSS: 0.05

In [23]:
# Running accuracy of the most recent epoch (the counters are reset per epoch).
correct/(total)

0.9891104179194228

In [25]:
# Check which device the classification loss tensor lives on.
clf_loss.device

device(type='cuda', index=0)

In [132]:
# NOTE(review): scratch cell; the execution count shows it was run out of order.
# It repeats the backward pass of the last training step -- presumably left over
# from debugging; consider removing it.
total_loss.backward()

In [99]:
# Scratch cell: lbl is expanded to one index per ROI and offset, so it holds
# more than one element and .item() raises the ValueError recorded below.
lbl.item()

ValueError: only one element tensors can be converted to Python scalars