## 数据处理

In [1]:
import pandas as pd
import numpy as np
from path import Path

import matplotlib.pyplot as plt
import cv2

from sklearn import preprocessing 
import xml.etree.ElementTree as ET

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
# Dataset root directory (left empty here; set it to the dataset location).
root = Path(r'')
# Annotation XML files are looked up under <root>/Annotations.
xmldir = root / 'Annotations'
# Image files are looked up under <root>/JPEGImages.
imagedir = root / 'JPEGImages'

class XmlParser(object):
    """Read one Pascal-VOC style annotation XML file.

    Attributes set by the constructor:
        xml_file: the path that was handed to the constructor.
        img_path: module-level ``imagedir`` joined with the ``<filename>``
            text found in the XML.
        image_id: stem of ``img_path``.
        names: NumPy array with the ``<name>`` text of every ``<object>``.
        boxes: float32 array of shape ``(n_objects, 4)`` holding
            ``xmin, ymin, xmax, ymax`` for every ``<object>``.
    """

    # Order in which the <bndbox> children are read into each box row.
    _BOX_TAGS = ("xmin", "ymin", "xmax", "ymax")

    def __init__(self, xml_file):
        self.xml_file = xml_file
        self._root = ET.parse(self.xml_file).getroot()
        self._objects = self._root.findall("object")
        # Path to the image file as described in the xml file.
        self.img_path = imagedir/self._root.find('filename').text
        # Image id (file name without extension).
        self.image_id = self.img_path.stem
        # Names of the classes contained in the xml file.
        self.names = self._get_names()
        # Coordinates of the bounding boxes.
        self.boxes = self._get_bndbox()

    def parse_xml(self):
        """Parse the xml file again and return the root of the new tree."""
        tree = ET.parse(self.xml_file)
        return tree.getroot()

    def _get_names(self):
        """Return the ``<name>`` text of every object as a NumPy array."""
        names = [obj.find("name").text for obj in self._objects]
        return np.array(names)

    def _get_bndbox(self):
        """Return the object boxes as a float32 array of shape ``(n, 4)``.

        The shape stays ``(0, 4)`` when the file contains no objects, so
        callers can always index the coordinate columns.
        """
        boxes = []
        for obj in self._objects:
            bndbox = obj.find("bndbox")
            boxes.append(
                [np.float32(bndbox.find(tag).text) for tag in self._BOX_TAGS]
            )

        # reshape(-1, 4) keeps the array two-dimensional even when empty.
        return np.array(boxes, dtype=np.float32).reshape(-1, 4)


def xml_files_to_df(xml_files):
    """Return a pandas DataFrame with one row per annotated object.

    Args:
        xml_files: iterable of annotation XML paths; each one is parsed
            with ``XmlParser``.

    Returns:
        DataFrame with the columns ``image_id``, ``names``, ``boxes``
        (one 4-element array per row), ``xml_path``, ``img_path``,
        ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``labels``. ``labels``
        is the label-encoded class name shifted by one so that it starts
        at 1.
    """
    names = []
    boxes = []
    image_id = []
    xml_path = []
    img_path = []
    for file in xml_files:
        xml = XmlParser(file)
        n_objects = len(xml.names)
        names.extend(xml.names)
        boxes.extend(xml.boxes)
        # Per-file values are repeated once for every object in the file.
        image_id.extend([xml.image_id] * n_objects)
        xml_path.extend([xml.xml_file] * n_objects)
        img_path.extend([xml.img_path] * n_objects)

    df = pd.DataFrame({"image_id": image_id,
                       "names": names,
                       "boxes": boxes,
                       "xml_path": xml_path,
                       "img_path": img_path})

    # np.stack needs a real sequence (not a generator) and cannot stack an
    # empty list, so the no-object case gets an explicit (0, 4) array.
    coords = np.stack(boxes) if boxes else np.empty((0, 4), dtype=np.float32)
    df[['xmin', 'ymin', 'xmax', 'ymax']] = coords

    if df.empty:
        df['labels'] = np.array([], dtype=np.int64)
    else:
        enc = preprocessing.LabelEncoder()
        # Shift the encoded ids by one so that labels start at 1.
        df['labels'] = enc.fit_transform(df['names']) + 1
    return df

# Build the annotation table from every file in the annotation directory.
df = xml_files_to_df(xmldir.files())
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

In [None]:
# Count how many objects exist for every (class name, label) pair.
classes = df[['names', 'labels']].value_counts()
print(classes)
# Turn the counts table into a {label: class name} lookup.
classes = {
    label: name
    for name, label in classes.reset_index()[['names', 'labels']].itertuples(index=False, name=None)
}
print(classes)

# Split by image (not by row): the first 80% of the unique image ids are
# used for training and the remainder for validation.
image_ids = df['image_id'].unique()
train_len = int(len(image_ids)*0.8)
train_ids, valid_ids = image_ids[:train_len], image_ids[train_len:]

train_df = df[df['image_id'].isin(train_ids)]
valid_df = df[df['image_id'].isin(valid_ids)]
print(train_df.shape, valid_df.shape)

In [4]:
class VOCDataset(Dataset):
    """Detection dataset backed by an annotation DataFrame.

    Each item corresponds to one unique ``image_id`` of the dataframe; all
    rows sharing that id provide the boxes and labels of the image.

    Args:
        dataframe: table with at least the columns ``image_id``, ``boxes``
            (one ``xmin, ymin, xmax, ymax`` array per row) and ``labels``.
        image_dir: directory that contains the ``<image_id>.jpg`` files.
        transforms: optional callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` that returns
            a mapping with the same keys (e.g. an albumentations
            ``Compose`` configured with ``label_fields=['labels']``).
    """

    def __init__(self, dataframe, image_dir, transforms=None):
        super().__init__()

        self.image_ids = dataframe['image_id'].unique()
        self.df = dataframe
        self.image_dir = image_dir
        self.transforms = transforms

    def _parse(self, index: int):
        """Load the image and build the raw (untransformed) target.

        Returns:
            ``(image, target)`` where ``image`` is a float32 RGB array
            scaled to ``[0, 1]`` and ``target`` holds ``boxes`` (float32
            NumPy array), ``labels``, ``image_id``, ``area`` and
            ``iscrowd`` (tensors).

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        image_id = self.image_ids[index]
        records = self.df[self.df['image_id'] == image_id]

        image_path = f'{self.image_dir}/{image_id}.jpg'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        boxes = np.stack(records['boxes'].values).astype(np.float32)

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        area = torch.as_tensor(area, dtype=torch.float32)

        label = records['labels'].values
        labels = torch.as_tensor(label, dtype=torch.int64)

        # Suppose all instances are not crowd.
        iscrowd = torch.zeros((records.shape[0],), dtype=torch.int64)

        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id': torch.tensor([index]),
            'area': area,
            'iscrowd': iscrowd
        }
        return image, target

    def __getitem__(self, index: int):
        """Return ``(image, target)`` with every target entry as a tensor.

        When transforms are configured they are applied to image, boxes and
        labels together, and the target is rebuilt from the transformed
        boxes so that ``boxes``, ``labels``, ``area`` and ``iscrowd`` stay
        consistent even if the transform removes boxes.
        """
        image, target = self._parse(index)
        boxes = target['boxes']

        if self.transforms:
            sample = self.transforms(
                image=image,
                bboxes=boxes,
                # Plain ints round-trip through the transform unchanged.
                labels=target['labels'].tolist(),
            )
            image = sample['image']
            boxes = sample['bboxes']
            target['labels'] = torch.as_tensor(
                np.asarray(sample['labels'], dtype=np.int64))

        # reshape(-1, 4) keeps an (0, 4) shape when no box is left.
        boxes = torch.as_tensor(
            np.asarray(boxes, dtype=np.float32).reshape(-1, 4))
        target['boxes'] = boxes
        target['area'] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target['iscrowd'] = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        return image, target

    def coco_index(self, index):
        """Return ``((height, width), target)`` without applying transforms.

        Intended for collecting label statistics with pycocotools. Note
        that the image is still loaded through ``_parse`` in order to
        obtain its height and width.
        """
        image, target = self._parse(index)
        data_height, data_width = image.shape[:2]
        return (data_height, data_width), target

    def __len__(self) -> int:
        """Return the number of unique images."""
        return self.image_ids.shape[0]

In [10]:
def get_transform_train():
    """Build the training pipeline: random flip/brightness, normalize, to tensor.

    Boxes are handled in ``pascal_voc`` format and the ``labels`` field is
    transformed together with them.
    """
    augmentations = [
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.Normalize(max_pixel_value=1),
        ToTensorV2(p=1.0),
    ]
    bbox_settings = dict(format='pascal_voc', label_fields=['labels'])
    return A.Compose(augmentations, bbox_params=bbox_settings)

def get_transform_valid():
    """Build the validation pipeline: normalize and convert to tensor only.

    Boxes are handled in ``pascal_voc`` format and the ``labels`` field is
    transformed together with them.
    """
    steps = [
        A.Normalize(max_pixel_value=1),
        ToTensorV2(p=1.0),
    ]
    bbox_settings = dict(format='pascal_voc', label_fields=['labels'])
    return A.Compose(steps, bbox_params=bbox_settings)


def collate_fn(batch):
    """Regroup a batch of samples into one tuple per sample field.

    A batch ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Datasets: training uses the augmenting pipeline, validation the plain one.
train_dataset = VOCDataset(
    dataframe=train_df,
    image_dir=imagedir,
    transforms=get_transform_train(),
)
valid_dataset = VOCDataset(
    dataframe=valid_df,
    image_dir=imagedir,
    transforms=get_transform_valid(),
)

# Training batches are shuffled and loaded in the main process.
train_data_loader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_fn,
    num_workers=0,
    shuffle=True,
    batch_size=4,
)

# Validation batches keep their order and are loaded by 4 worker processes.
valid_data_loader = DataLoader(
    dataset=valid_dataset,
    collate_fn=collate_fn,
    num_workers=4,
    shuffle=False,
    batch_size=4,
)

In [15]:
def denorm(x, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Undo channel-wise normalization and return a uint8 HWC image.

    Args:
        x: normalized image tensor of shape ``(C, H, W)``; may live on any
            device.
        mean: per-channel mean that was subtracted during normalization.
        std: per-channel standard deviation that was divided by.

    Returns:
        C-contiguous ``uint8`` NumPy array of shape ``(H, W, C)`` with
        values in ``[0, 255]``.
    """
    # Create the statistics on the input's device so GPU tensors work, and
    # shape them (C, 1, 1) so they broadcast over height and width.
    mean = torch.as_tensor(mean, dtype=torch.float32, device=x.device).view(-1, 1, 1)
    std = torch.as_tensor(std, dtype=torch.float32, device=x.device).view(-1, 1, 1)
    out = x.mul(std).add(mean).clamp(0, 1)  # inverse of (x - mean) / std
    out = out.mul(255).add(0.5).clamp(0, 255)  # back to the [0, 255] range
    # Reorder to HWC and cast to integers; .contiguous() gives the array a
    # regular memory layout so OpenCV drawing calls can write into it.
    return out.permute(1, 2, 0).to("cpu", torch.uint8).contiguous().numpy()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
images, targets = next(iter(train_data_loader))
images = [image.to(device) for image in images]
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

# Draw one training batch with its boxes and class names, two images per row.
n_cols = 2
n_rows = (len(images) + n_cols - 1) // n_cols
plt.figure(figsize=(20, 20))
for plot_idx, (image, target) in enumerate(zip(images, targets)):
    plt.subplot(n_rows, n_cols, plot_idx + 1)
    boxes = target['boxes'].cpu().numpy().astype(np.int32)
    label_ids = target['labels'].cpu().numpy().astype(np.int64)
    # Denormalize on the CPU and force a contiguous buffer so the OpenCV
    # drawing calls below can modify the array in place.
    sample = np.ascontiguousarray(denorm(image.cpu()))
    for box, label_id in zip(boxes, label_ids):
        # OpenCV expects plain Python ints for point coordinates.
        x_min, y_min, x_max, y_max = (int(v) for v in box)
        cv2.rectangle(sample,
                      (x_min, y_min),
                      (x_max, y_max),
                      (0, 0, 220), 2)
        cv2.putText(sample, classes[label_id], (x_min, y_min + 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 220, 0), 1, cv2.LINE_AA)

    plt.axis('off')
    plt.imshow(sample)

## FasterRCNN

In [None]:
import torchvision
import torch
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

import os

# One output per object category plus one for the background class
# (labels in the annotation table start at 1).
num_classes = len(classes) + 1

# Load the pretrained detector with its own head size. Passing a custom
# num_classes together with pretrained=True conflicts with the checkpoint,
# so the head is swapped afterwards instead.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, weight_decay=0.0001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

from train_eval_utils import train_one_epoch,evaluate
num_epochs = 100

# Create the checkpoint directory up front so that a missing folder fails
# immediately rather than at the first save after many epochs.
save_dir = '/project/train/models'
save_every = 25
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    print(epoch)
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, valid_data_loader, device=device)
    if (epoch+1) % save_every == 0:
        # Collect the state only on epochs that are actually saved.
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, f'{save_dir}/faster_rcnn_state_{epoch+1}.pth')
