In [1]:
import os

import numpy as np
import pandas
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

from distort import distort

In [2]:
# Directory holding train.csv. Set the WHEAT_DATA_DIR environment variable to
# point somewhere else; the fallback is the path this notebook was written with.
input_dir = os.environ.get('WHEAT_DATA_DIR', '/home/huys/wheat_detection')
ori_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
# Empty frame with the export schema; `new` is reassigned by the export loop
# further down in this notebook.
new = pd.DataFrame(columns = ['image_id', 'width', 'height', 'bbox', 'source'])

In [3]:
class BoxDataset(Dataset):
    """Per-image view over a bounding-box table, with `distort` applied on access.

    Each item corresponds to one unique ``image_id`` of the wrapped frame and is
    the tuple ``(img_id, boxes, source)``, where ``boxes`` is whatever ``distort``
    returns for that image's boxes given in ``[x1, y1, x2, y2]`` form.

    The frame must provide the columns ``image_id``, ``source``, ``x``, ``y``,
    ``w`` and ``h``. Image ids, sources and the row positions of every image are
    captured at construction time, so the frame should not be modified while the
    dataset is in use.
    """

    def __init__(self, dataframe, noise_level=0):
        """Wrap ``dataframe``.

        Args:
            dataframe: box table with one row per box.
            noise_level: forwarded unchanged to ``distort`` as ``noise_level``.
        """
        super().__init__()

        self.df = dataframe
        self.img_ids = dataframe['image_id'].unique()
        self.source = self._initsource()
        self.noise_level = noise_level
        # Positional row indices of each image, computed once so that item access
        # does not rescan the whole frame with a boolean mask.
        self._rows_by_id = dataframe.groupby('image_id', sort=False).indices

    def __getitem__(self, index: int):
        """Return ``(img_id, boxes, source)`` for the ``index``-th unique image.

        Raises:
            IndexError: if ``index`` is out of range (this is also what ends
                plain ``for`` iteration over the dataset).
        """
        img_id = self.img_ids[index]
        source = self.source[index]

        records = self.df.iloc[self._rows_by_id[img_id]]
        boxes = self._boxread(records)

        # NOTE(review): 1024 is presumably the image side length expected by
        # `distort` -- confirm against the distort module.
        boxes = distort(boxes, 1024, noise_level=self.noise_level)

        return img_id, boxes, source

    def __len__(self) -> int:
        """Number of unique image ids."""
        return self.img_ids.shape[0]

    def _boxread(self, records):
        """Return the boxes of ``records`` as a float array of ``[x1, y1, x2, y2]`` rows."""
        boxes = records[['x', 'y', 'w', 'h']].to_numpy(dtype=float, copy=True)

        # transform to xyxy: the far corner is the origin plus the extent
        boxes[:, 2:4] += boxes[:, 0:2]

        return boxes

    def _initsource(self):
        """Return the ``source`` of each unique image, aligned with ``self.img_ids``.

        ``drop_duplicates`` keeps the first row of every image in order of first
        appearance, which is the same order ``unique()`` uses for ``img_ids``, so
        a single pass replaces one full-frame filter per image.
        """
        first_rows = self.df.drop_duplicates('image_id')
        return first_rows['source'].tolist()
        
def get_xywh(train_df):
    """Expand the ``bbox`` string column into numeric ``x``, ``y``, ``w``, ``h`` columns.

    Each ``bbox`` value is a string whose first and last characters are stripped
    and whose remainder is parsed as comma-separated numbers, e.g.
    ``'[834.0, 222.0, 56.0, 36.0]'``.

    The frame is modified in place (the four columns are added and ``bbox`` is
    dropped) and the same object is returned.

    Calling the function again on a frame that was already expanded is a no-op,
    so re-running the notebook cell does not fail. An empty frame gets the four
    (empty) columns instead of raising from ``np.stack``.

    Raises:
        KeyError: if ``bbox`` is missing and the frame is not already expanded.
    """
    box_columns = ['x', 'y', 'w', 'h']

    already_expanded = 'bbox' not in train_df.columns and all(
        column in train_df.columns for column in box_columns
    )
    if already_expanded:
        return train_df

    if len(train_df) == 0:
        # np.stack refuses an empty sequence; build the empty (0, 4) result directly
        bboxs = np.empty((0, 4))
    else:
        bboxs = np.stack(train_df['bbox'].apply(lambda x: np.fromstring(x[1:-1], sep=',')))

    for i, column in enumerate(box_columns):
        train_df[column] = bboxs[:, i]
    train_df.drop(columns=['bbox'], inplace=True)
    return train_df

def filter_bbox(train_df, n_largest=10):
    """Drop the ``n_largest`` boxes with the biggest ``w * h`` area, in place.

    The frame is modified in place, its index is reset to ``0..len-1`` and the
    same object is returned. No temporary column is added to the frame.

    Rows are selected by position, so the result does not depend on the index
    the frame arrives with. Boxes with equal area are dropped in order of
    appearance instead of the same row being looked up twice. When the frame
    has fewer than ``n_largest`` rows, all rows are dropped.

    Note that every call removes another ``n_largest`` rows; calling it twice on
    the same frame is not idempotent.

    Args:
        train_df: frame with numeric ``w`` and ``h`` columns.
        n_largest: how many of the largest boxes to remove (default 10).
    """
    areas = (train_df['w'] * train_df['h']).to_numpy()
    n_drop = max(0, min(n_largest, len(areas)))

    # Stable sort on the negated areas: descending by area, ties keep row order.
    largest_pos = np.argsort(-areas, kind='stable')[:n_drop]

    # Make labels equal to positions so the positional selection can be dropped by label.
    train_df.index = range(len(train_df))
    train_df.drop(index=largest_pos, inplace=True)
    train_df.index = range(len(train_df))
    return train_df

In [4]:
# Expand the bbox strings into x/y/w/h and then remove the largest boxes.
# Both helpers modify ori_df in place and return that same frame, so the
# chained call leaves ori_df updated and displays it as the cell output.
filter_bbox(get_xywh(ori_df))

Unnamed: 0,image_id,width,height,source,x,y,w,h
0,b6ab77fd7,1024,1024,usask_1,834.0,222.0,56.0,36.0
1,b6ab77fd7,1024,1024,usask_1,226.0,548.0,130.0,58.0
2,b6ab77fd7,1024,1024,usask_1,377.0,504.0,74.0,160.0
3,b6ab77fd7,1024,1024,usask_1,834.0,95.0,109.0,107.0
4,b6ab77fd7,1024,1024,usask_1,26.0,144.0,124.0,117.0
...,...,...,...,...,...,...,...,...
147778,5e0747034,1024,1024,arvalis_2,64.0,619.0,84.0,95.0
147779,5e0747034,1024,1024,arvalis_2,292.0,549.0,107.0,82.0
147780,5e0747034,1024,1024,arvalis_2,134.0,228.0,141.0,71.0
147781,5e0747034,1024,1024,arvalis_2,430.0,13.0,184.0,79.0


In [9]:
# Write one CSV per noise level, with the boxes returned by BoxDataset converted
# back from [x1, y1, x2, y2] to [x, y, w, h].
for noise_level in [1, 2, 3]:
    # Named `dataset` so the builtin `set` stays usable in later cells.
    dataset = BoxDataset(ori_df, noise_level)
    result = []
    for img_id, boxes, source in tqdm(dataset, total=len(dataset)):
        for box in boxes:
            # far corner minus origin gives width and height, updated in place on the row
            box[2] = box[2] - box[0]
            box[3] = box[3] - box[1]
            # width and height are written as the constant 1024, not read from ori_df
            result.append([img_id, 1024, 1024, box.tolist(), source])
    new = pd.DataFrame(result, columns = ['image_id', 'width', 'height', 'bbox', 'source'])
    new.to_csv(f'noiselevel{noise_level}.csv', index=False)

100%|██████████| 3373/3373 [01:49<00:00, 30.79it/s]
100%|██████████| 3373/3373 [01:48<00:00, 31.05it/s]
100%|██████████| 3373/3373 [01:48<00:00, 31.05it/s]


In [25]:
# Scratch cell: a single draw from np.random.rand() scaled by 0.2; the value is
# only displayed and is not used by any other cell shown in this notebook.
np.random.rand() * 0.2

0.04553440015722887