In [3]:
import pandas as pd
from torch_snippets import *
import selectivesearch
from torchvision import transforms, models, datasets
from torch_snippets import Report
from torchvision.ops import nms
# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: the previous backslash-separated
# literals contained invalid escape sequences such as '\d', '\o' and '\i'
# (flagged by recent Python versions) and only resolved on Windows, whereas
# forward slashes are accepted on Windows and POSIX systems alike.
image_root = '../data/open-images-bus-trucks/images/images'
df_data = pd.read_csv('../data/open-images-bus-trucks/df.csv')
print(df_data.head())

In [None]:
class OpenImages(Dataset):
    """Dataset yielding one image together with all of its annotated boxes.

    Each item is ``(image, boxes, classes, image_path)``:

    * ``image``      - the image as an RGB array of shape (height, width, 3).
    * ``boxes``      - list of ``[x_min, y_min, x_max, y_max]`` in pixels.
    * ``classes``    - list with the ``LabelName`` of every box.
    * ``image_path`` - path of the file the image was read from.
    """

    def __init__(self, df, image_folder=image_root):
        """Store the annotation table and index the distinct image ids.

        Args:
            df: annotation table with the columns ``ImageID``, ``XMin``,
                ``YMin``, ``XMax``, ``YMax`` and ``LabelName``.
            image_folder: directory that contains ``<ImageID>.jpg`` files.
        """
        self.root = image_folder
        self.df = df
        self.unique_images = df['ImageID'].unique()

    def __len__(self):
        """Return the number of distinct images (not annotation rows)."""
        return len(self.unique_images)

    def __getitem__(self, ix):
        """Load image number ``ix`` and its boxes scaled to pixel coordinates.

        Raises:
            IndexError: if ``ix`` is out of range (this is also what ends
                plain ``for item in dataset`` iteration).
            FileNotFoundError: if the image file cannot be read.
        """
        image_id = self.unique_images[ix]
        image_path = f'{self.root}/{image_id}.jpg'
        image = cv2.imread(image_path, 1)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV loads BGR; reverse only the last (channel) axis to get RGB so
        # that plotting and the other loaders in this notebook agree on the
        # channel order.
        image = image[..., ::-1]
        h, w, _ = image.shape
        # Boolean indexing already returns a new frame, so the full table does
        # not have to be copied for every item.
        rows = self.df[self.df['ImageID'] == image_id]
        # The stored coordinates are multiplied by the image width/height to
        # obtain pixels (so they are presumably normalised to [0, 1]).
        boxes = rows['XMin,YMin,XMax,YMax'.split(',')].values
        boxes = (boxes * np.array([w, h, w, h])).astype(np.uint16).tolist()
        classes = rows['LabelName'].values.tolist()
        return image, boxes, classes, image_path
# Build the dataset and, as a quick sanity check, display the item at index 5
# together with its annotated boxes and class labels.
ds = OpenImages(df=df_data)
im, bbs, clss, _ = ds[5]
show(im, bbs=bbs, texts=clss, sz=10)

In [None]:
# Scratch experiments with NumPy slicing/indexing semantics (kept commented out)
# im1 = im[:][:][0] 
# print(im1)
# im2 = im[:,:,0]
# print(im2)
# im3 = im[0] 
# print(im3)
# im4 = im[:][0] 
# print(im4)
# im5 = im[:][:][:][0] 
# im6 = im[0][:]
# print(im6)  
# print(id(im5) == id(im6))
# im7 = im6
# print(id(im7) == id(im6))

In [None]:
def extract_candidates(img, scale=200, min_size=100):
    """Propose candidate boxes ("box candidates") for ``img`` via selective search.

    Regions covering less than 5% of the image area, or more than the whole
    image, are discarded, and every rectangle is returned at most once.

    Args:
        img: image array whose first two dimensions are height and width.
        scale: forwarded to ``selectivesearch.selective_search``.
        min_size: forwarded to ``selectivesearch.selective_search``.

    Returns:
        A list of ``[x, y, w, h]`` lists, where the last two numbers are the
        width and height of the rectangle.
    """
    _, regions = selectivesearch.selective_search(img, scale=scale, min_size=min_size)
    img_area = np.prod(img.shape[:2])
    candidates = []
    # Rectangles already accepted. They are tracked as tuples in a set because
    # comparing a tuple against the stored lists never matches, which would
    # let duplicates through.
    seen = set()
    for region in regions:
        rect = tuple(region['rect'])
        if rect in seen:
            continue
        if not (0.05 * img_area <= region['size'] <= img_area):
            continue
        seen.add(rect)
        candidates.append(list(rect))
    return candidates

def extract_iou(boxA, boxB, epsilon=1e-5):
    """Intersection-over-union of two ``(x_min, y_min, x_max, y_max)`` boxes.

    Args:
        boxA: first box, indexable as ``[x_min, y_min, x_max, y_max]``.
        boxB: second box, same layout.
        epsilon: added to the union area so the division never hits zero.

    Returns:
        ``0.0`` when the boxes are separated along either axis, otherwise
        ``intersection / (union + epsilon)``.
    """
    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    # Extent of the overlap rectangle; negative means the boxes are apart.
    overlap_w = min(boxA[2], boxB[2]) - max(boxA[0], boxB[0])
    overlap_h = min(boxA[3], boxB[3]) - max(boxA[1], boxB[1])
    if overlap_w < 0 or overlap_h < 0:
        return 0.0

    intersection = overlap_w * overlap_h
    union = _area(boxA) + _area(boxB) - intersection
    return intersection / (union + epsilon)

In [None]:
# Per-image training targets collected from the first N images of `ds`:
#   FPATHS - path of each image file
#   GTBBS  - ground-truth bounding boxes, as returned by the dataset
#   CLSS   - class of every region proposal: the label of its best-overlapping
#            ground-truth box, or 'background' when that best IoU is not
#            above 0.3
#   ROIS   - region proposals as (x_min, y_min, x_max, y_max) divided by the
#            image width/height
#   IOUS   - IoU of every region proposal with every ground-truth box
#   DELTAS - offset from each proposal to its best-matching ground-truth box,
#            divided by the image width/height

FPATHS, GTBBS, CLSS, DELTAS, ROIS, IOUS = [], [], [], [], [], []
N = 500  # cap on the number of images processed (presumably to bound run time)
# Index explicitly instead of `enumerate(ds)` + `break`, which would load
# (and decode) one image more than is actually used.
for ix in range(min(N, len(ds))):
    im, bbs, labels, fpath = ds[ix]
    H, W, _ = im.shape
    scale = np.array([W, H, W, H])
    candidates = extract_candidates(im)
    # (x, y, w, h) -> (x_min, y_min, x_max, y_max)
    candidates = np.array([(x, y, x + w, y + h) for x, y, w, h in candidates])
    rois, clss, deltas = [], [], []
    # Rows index candidates, columns index ground-truth boxes.
    ious = np.array([[extract_iou(candidate, _bb_) for candidate in candidates] for _bb_ in bbs]).T
    for jx, candidate in enumerate(candidates):
        cx, cy, cX, cY = candidate
        candidate_ious = ious[jx]
        best_iou_at = np.argmax(candidate_ious)
        best_iou = candidate_ious[best_iou_at]
        _x, _y, _X, _Y = bbs[best_iou_at]
        if best_iou > 0.3:
            clss.append(labels[best_iou_at])
        else:
            clss.append('background')
        delta = np.array([_x - cx, _y - cy, _X - cX, _Y - cY]) / scale
        deltas.append(delta)
        rois.append(candidate / scale)
    FPATHS.append(fpath)
    IOUS.append(ious)
    ROIS.append(rois)
    CLSS.append(clss)
    DELTAS.append(deltas)
    GTBBS.append(bbs)
# Rebuild every path from the image folder and the file stem. The folder
# variable is `image_root`; the previously referenced `IMAGE_ROOT` is not
# defined anywhere in this notebook and raised a NameError.
FPATHS = [f'{image_root}/{stem(f)}.jpg' for f in FPATHS]

In [None]:
# Map every label string that occurs in CLSS (including 'background') to an
# integer id and back. Ids are assigned in the order in which the labels first
# appear in the flattened CLSS list, so 'background' is not necessarily 0.
targets = pd.DataFrame(flatten(CLSS), columns=['label'])
label2target = {l:t for t,l in enumerate(targets['label'].unique())}
target2label = {t:l for l,t in label2target.items()}
# Integer id of the 'background' class (KeyError if no proposal was labelled so).
background_class = label2target['background']

In [None]:
# Dataset of region proposals - pay attention to what the features and labels are.
class RCNNDataset(Dataset):
    """Per-image region proposals with their class and offset targets.

    Each item is ``(image, crops, bbs, labels, deltas, gtbbs, fpath)``;
    ``collate_fn`` reduces a batch of such items to the three tensors that
    the model is trained on.
    """

    def __init__(self, fpaths, rois, labels, deltas, gtbbs):
        """Store the parallel per-image lists.

        Args:
            fpaths: image file paths.
            rois: per image, proposals ``(x_min, y_min, x_max, y_max)`` as
                fractions of the image width/height.
            labels: per image, one class label per proposal.
            deltas: per image, one 4-value offset target per proposal.
            gtbbs: per image, the ground-truth boxes.
        """
        self.fpaths = fpaths
        self.gtbbs = gtbbs
        self.rois = rois
        self.labels = labels
        self.deltas = deltas

    def __len__(self):
        """Return the number of images."""
        return len(self.fpaths)

    def __getitem__(self, ix):
        """Load image ``ix`` and cut out one crop per region proposal.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        fpath = str(self.fpaths[ix])
        image = cv2.imread(fpath, 1)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {fpath}')
        image = image[..., ::-1]  # BGR -> RGB
        H, W, _ = image.shape
        sh = np.array([W, H, W, H])
        gtbbs = self.gtbbs[ix]
        rois = self.rois[ix]
        # Fractional proposals back to absolute pixel coordinates.
        bbs = (np.array(rois) * sh).astype(np.uint16)
        labels = self.labels[ix]  # class label of every proposal
        deltas = self.deltas[ix]  # offset target of every proposal
        crops = [image[y:Y, x:X] for (x, y, X, Y) in bbs]
        return image, crops, bbs, labels, deltas, gtbbs, fpath

    def collate_fn(self, batch):
        """Flatten a batch of items into the three tensors used for training.

        Returns:
            ``(input, labels, deltas)`` on ``device``:

            1. ``input``  - the features: every proposal crop, resized to
               224x224 and passed through ``preprocess_image``.
            2. ``labels`` - label 1: the integer class id of every crop.
            3. ``deltas`` - label 2: the offset target of every crop.
        """
        crops_batch, labels, deltas = [], [], []
        for _, crops, _, image_labels, image_deltas, _, _ in batch:
            for crop in crops:
                crop = cv2.resize(crop, (224, 224))
                crops_batch.append(preprocess_image(crop / 255.)[None])
            labels.extend(label2target[c] for c in image_labels)
            deltas.extend(image_deltas)
        crops_batch = torch.cat(crops_batch).to(device)
        labels = torch.tensor(labels, dtype=torch.long).to(device)
        # Stack into a single ndarray first: building a tensor straight from a
        # list of ndarrays is very slow.
        deltas = torch.tensor(np.array(deltas), dtype=torch.float).to(device)
        return crops_batch, labels, deltas  # one feature tensor, two label tensors

In [None]:
# The model has two heads: (1) class prediction and (2) regression of the
# offset between a proposal and its ground-truth box.
class RCNN(nn.Module):
    """Shared backbone followed by a classification head and a box-offset head."""

    def __init__(self):
        super().__init__()
        # 25088 == 512 * 7 * 7; presumably the flattened output size of
        # `vgg_backbone` - confirm against the backbone definition.
        feature_dim = 25088
        self.backbone = vgg_backbone
        # Plain linear classifier over all classes (including background).
        self.cls_score = nn.Linear(feature_dim, len(label2target))
        # Regressor producing 4 offsets. The final Tanh bounds them to
        # (-1, 1), the range of pixel offsets divided by image width/height.
        self.bbox = nn.Sequential(
              nn.Linear(feature_dim, 512),
              nn.ReLU(),
              nn.Linear(512, 4),
              nn.Tanh(),
            )
        self.cel = nn.CrossEntropyLoss()  # classification loss
        self.sl1 = nn.L1Loss()  # regression loss (plain L1, despite the name)
        # Weight of the regression term in the combined loss. Set once here so
        # it can be tuned on the instance instead of being reset on every call.
        self.lmb = 10.0

    def forward(self, input):
        """Return ``(cls_score, bbox)`` for a batch of crops.

        ``cls_score`` has one column per entry of ``label2target`` and
        ``bbox`` has 4 columns.
        """
        feat = self.backbone(input)
        cls_score = self.cls_score(feat)
        bbox = self.bbox(feat)
        return cls_score, bbox

    def calc_loss(self, probs, _deltas, labels, deltas):
        """Combined classification + box-regression loss.

        Args:
            probs: predicted class scores (logits).
            _deltas: predicted box offsets.
            labels: target class ids.
            deltas: target box offsets.

        Returns:
            ``(total, detection_loss, regression_loss)`` where ``total`` is
            ``detection_loss + self.lmb * regression_loss`` and carries the
            graph; the other two are detached. ``regression_loss`` is the
            plain number ``0`` when the batch contains only background.
        """
        detection_loss = self.cel(probs, labels)
        # Only non-background proposals have a meaningful offset target. The
        # background id comes from the label map: ids are assigned by order of
        # first appearance, so background is not guaranteed to be 0.
        ixs, = torch.where(labels != background_class)
        if len(ixs) == 0:
            regression_loss = 0
            return detection_loss + self.lmb * regression_loss, detection_loss.detach(), regression_loss
        regression_loss = self.sl1(_deltas[ixs], deltas[ixs])
        return detection_loss + self.lmb * regression_loss, detection_loss.detach(), regression_loss.detach()

In [None]:
def train_batch(inputs, model, optimizer, criterion):
    """Run one optimisation step on a single batch.

    Args:
        inputs: ``(crops, class_targets, delta_targets)`` for the batch.
        model: callable returning ``(class_scores, predicted_deltas)``.
        optimizer: optimizer that owns the model parameters.
        criterion: called as ``criterion(class_scores, predicted_deltas,
            class_targets, delta_targets)`` and returning
            ``(total_loss, loc_loss, regr_loss)``.

    Returns:
        ``(total_loss.detach(), loc_loss, regr_loss, accs)`` where ``accs`` is
        a NumPy boolean array marking the crops whose decoded prediction
        equals the target class.
    """
    crops, true_labels, true_deltas = inputs

    model.train()
    optimizer.zero_grad()

    pred_scores, pred_deltas = model(crops)
    # The criterion is passed in as a callable; in this notebook it is the
    # model's own calc_loss method.
    losses = criterion(pred_scores, pred_deltas, true_labels, true_deltas)
    total_loss, loc_loss, regr_loss = losses
    is_correct = true_labels == decode(pred_scores)

    total_loss.backward()
    optimizer.step()

    return total_loss.detach(), loc_loss, regr_loss, is_correct.cpu().numpy()

In [None]:
# Instantiate the model on the selected device and set up training.
rcnn = RCNN().to(device)
criterion = rcnn.calc_loss # the loss is the model's own calc_loss method, which uses the loss modules created in RCNN.__init__
optimizer = optim.SGD(rcnn.parameters(), lr=1e-3)
n_epochs = 5
log = Report(n_epochs)

In [None]:
# Train for n_epochs, running one pass over train_loader and one pass over
# test_loader per epoch and recording the metrics of every batch in `log`.
# NOTE(review): train_loader, test_loader and validate_batch are not defined
# in the cells visible here - confirm they are created before this cell runs.
for epoch in range(n_epochs):

    # Training pass.
    _n = len(train_loader)
    for ix, inputs in enumerate(train_loader):
        loss, loc_loss, regr_loss, accs = train_batch(inputs, rcnn, 
                                                      optimizer, criterion) # the loss method defined on RCNN, passed around as a function object
        # Fractional epoch position of this batch, e.g. 0.5 = halfway through epoch 0.
        pos = (epoch + (ix+1)/_n)
        log.record(pos, trn_loss=loss.item(), trn_loc_loss=loc_loss, 
                   trn_regr_loss=regr_loss, 
                   trn_acc=accs.mean(), end='\r')
        
    # Validation pass.
    _n = len(test_loader)
    for ix,inputs in enumerate(test_loader):
        _clss, _deltas, loss, \
        loc_loss, regr_loss, accs = validate_batch(inputs, 
                                                rcnn, criterion)
        pos = (epoch + (ix+1)/_n)
        log.record(pos, val_loss=loss.item(), val_loc_loss=loc_loss, 
                val_regr_loss=regr_loss, 
                val_acc=accs.mean(), end='\r')

# Plotting training and validation metrics
log.plot_epochs('trn_loss,val_loss'.split(','))

In [None]:
def _detect_objects(img):
    """Propose, classify, refine and de-duplicate boxes for one RGB image.

    Returns:
        ``(confs, clss, bbs)`` NumPy arrays describing the non-background
        detections kept by non-maximum suppression: confidence, integer class
        id and ``(x_min, y_min, x_max, y_max)`` pixel box. All three are
        empty when nothing is detected.
    """
    nothing = (np.empty(0, dtype=np.float32),
               np.empty(0, dtype=np.int64),
               np.empty((0, 4), dtype=np.uint16))
    H, W = img.shape[:2]
    # Candidate boxes, converted from (x, y, w, h) to corner coordinates.
    candidates = np.array([(x, y, x + w, y + h) for x, y, w, h in extract_candidates(img)])
    if len(candidates) == 0:
        return nothing  # torch.cat would fail on an empty list of crops

    # Resize every crop to the network input size and scale it to [0, 1].
    crops = [preprocess_image(cv2.resize(img[y:Y, x:X], (224, 224)) / 255.)[None]
             for x, y, X, Y in candidates]
    batch = torch.cat(crops).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        rcnn.eval()
        probs, deltas = rcnn(batch)
        probs = torch.nn.functional.softmax(probs, -1)
        confs, clss = torch.max(probs, -1)
    confs, clss, deltas = [t.detach().cpu().numpy() for t in (confs, clss, deltas)]

    keep = clss != background_class
    if not keep.any():
        return nothing
    confs, clss, deltas, candidates = [a[keep] for a in (confs, clss, deltas, candidates)]

    # Training targets are pixel offsets divided by the image width/height,
    # so the predictions are scaled back to pixels before being added to the
    # candidates. Clipping keeps negative values from wrapping around in the
    # uint16 cast.
    scale = np.array([W, H, W, H])
    bbs = np.clip(candidates + deltas * scale, 0, scale).astype(np.uint16)

    # Index with a NumPy array rather than the torch tensor: a one-element
    # tensor would be treated as a scalar index and drop a dimension.
    ixs = nms(torch.tensor(bbs.astype(np.float32)), torch.tensor(confs), 0.05).numpy()
    return confs[ixs], clss[ixs], bbs[ixs]


def test_predictions(filename, show_output=True):
    """Detect objects in one image file with the trained ``rcnn`` model.

    Args:
        filename: path of the image to read.
        show_output: when true, plot the original image next to the image
            with the predicted boxes; when false, nothing is plotted.

    Returns:
        ``((x_min, y_min, x_max, y_max), label, confidence)`` of the most
        confident detection, or ``((0, 0, 224, 224), 'background', 0)`` when
        only background is found.

    Raises:
        FileNotFoundError: if the image file cannot be read.
    """
    img = cv2.imread(filename, 1)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'Could not read image: {filename}')
    img = np.array(img[..., ::-1])  # BGR -> RGB

    confs, clss, bbs = _detect_objects(img)
    found = len(confs) > 0

    if show_output:
        _, ax = plt.subplots(1, 2, figsize=(20, 10))
        show(img, ax=ax[0])
        ax[0].grid(False)
        ax[0].set_title('Original image')
        if found:
            ax[1].set_title(target2label[clss[np.argmax(confs)]])
            show(img, bbs=bbs.tolist(), texts=[target2label[c] for c in clss.tolist()],
                 ax=ax[1], title='predicted bounding box and class')
        else:
            ax[1].imshow(img)
            ax[1].set_title('No objects')
        plt.show()

    if not found:
        return (0, 0, 224, 224), 'background', 0
    best_pred = np.argmax(confs)
    best_conf = np.max(confs)
    x, y, X, Y = bbs[best_pred]
    return (x, y, X, Y), target2label[clss[best_pred]], best_conf