In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import random
import os 
from os.path import dirname
import re
import json 
from torch.utils.data import Dataset
%matplotlib nbagg

# Dataset creator

### 1 - Create a csv containing the file names

In [None]:
def create_csv(dataset_name,
               datas_list=['stereo_images', 'mask', 'depth_map', 'annotations']):
    """
    Creates a dataframe containing all files names for the dataset and
    saves it as '<dataset_name>.csv' in the current working directory.

    The files are read from '<parent of cwd>/data/<dataset_name>/<data_type>/'.
    Every file is placed on the row given by the first integer found in its
    name, so files sharing an id end up on the same row. Entries whose name
    holds no digit (sub-folders such as 'left' / 'right', hidden files...)
    are ignored. Ids missing for a given column are left as NaN.

    Input:
       - datas_list: list datas from the dataset to include in the
       dataframe. 'mask' and 'stereo_images' are split into a '_left' and
       a '_right' column depending on the file name.
       - dataset_name: name of the dataset
    Output:
        - dataframe object, indexed by file id (sorted)
    """
    dataset_path = os.path.join(dirname(os.getcwd()), 'data', str(dataset_name))
    regex = re.compile(r'\d+')

    def index_by_id(file_names):
        """Map the first integer of each file name to that file name."""
        indexed = {}
        for file_name in file_names:
            match = regex.search(file_name)
            if match is None:
                # No id in the name: not a data file, skip it.
                continue
            indexed[int(match.group())] = file_name
        return indexed

    # Getting the files: one {id: file name} mapping per output column
    datas = {}
    for data_type in datas_list:
        # Sorted so that the result does not depend on the listing order
        files = sorted(os.listdir(os.path.join(dataset_path, data_type)))
        if data_type in ['mask', 'stereo_images']:
            for side in ('left', 'right'):
                datas[data_type + '_' + side] = index_by_id(
                    f for f in files if side in f)
        else:
            datas[data_type] = index_by_id(files)

    # Build the frame in one go: rows are aligned on the union of the ids,
    # which avoids both dict-view indexing and chained assignment.
    dataset = pd.DataFrame(datas, columns=list(datas)).sort_index()
    dataset.to_csv(str(dataset_name) + '.csv')
    return dataset

In [None]:
# Name of the dataset folder to index; create_csv looks for it under
# '<parent of the current working directory>/data/'.
dataset_name = 'blender_v3'

In [None]:
# Build the table of file names for the dataset. As a side effect,
# create_csv also writes it to '<dataset_name>.csv' in the working directory.
datas = create_csv(dataset_name)

In [None]:
# Fraction of the rows assigned to the train set; the remaining rows form
# the test set.
train_split = 0.8

In [None]:
def split_train_test(df, train_split, seed=None):
    """
    Split a dataframe randomly into train & test set
    Input:
       - df: dataframe to split
       - train_split: fraction of the rows (between 0 & 1) put in the
       train set; int(train_split * len(df)) rows are drawn
       - seed: optional seed making the split reproducible. When None the
       global `random` state is used.
    Output:
        - (train_df, test_df), both re-indexed from 0
    Raises:
        - ValueError if train_split is not between 0 & 1
    """
    if not 0 <= train_split <= 1:
        raise ValueError(
            'train_split must be between 0 and 1, got %r' % (train_split,))

    rng = random if seed is None else random.Random(seed)
    num_rows = len(df)
    train_ix = rng.sample(range(num_rows), int(train_split * num_rows))

    # Both splits are expressed as row *positions* (they are consumed by
    # iloc), so the function does not depend on the labels of df.index.
    chosen = set(train_ix)
    test_ix = [ix for ix in range(num_rows) if ix not in chosen]

    train_df = df.iloc[train_ix, :].reset_index(drop=True)
    test_df = df.iloc[test_ix, :].reset_index(drop=True)

    return train_df, test_df

In [None]:
# Randomly split the table of file names into a train and a test dataframe.
train, test = split_train_test(datas, train_split)

In [None]:
# Preview the first rows of the train split.
train.head()

In [None]:
# Number of rows in the train split: int(train_split * len(datas)).
len(train)

In [None]:
# Preview the first rows of the test split.
test.head()

In [None]:
# Number of rows in the test split (the rows not drawn for the train split).
len(test)

### 2 - Wrap everything into a class

In [None]:
class VolumeDataset(Dataset):
    """
    Load a dataset for Volume estimation.

    The dataset is backed by a csv of file names (one row per example, one
    column per kind of data). Files are read from
    '../data/<dataset_name>/<folder>/', where <folder> is the column name
    without its '_left' / '_right' suffix.

    Args:
       - csv_file: path of the csv of file names ('.csv' is appended when
       the extension is missing)
       - dataset_name: name of the dataset folder under '../data/'
       - datas_list: datas to keep in the dataset (columns of the csv)
       - size: float between 0 & 1 (ex 0.7), fraction of the rows kept
       - target: key read in each json annotation file to get the label
       - seed: optional random state for the subsampling; None keeps the
       sampling non-deterministic
    """
    def __init__(self, csv_file, dataset_name,
                 datas_list, size, target='volume', seed=None):
        # Make sure the csv path carries its extension. The result has to be
        # assigned back, otherwise the extension-less path is what gets read.
        csv_file = os.fspath(csv_file)
        if not csv_file.endswith('.csv'):
            csv_file = csv_file + '.csv'
        self.csv_file = csv_file
        self.dataset_name = dataset_name
        self.datas_list = datas_list
        self.size = size
        self.target = target
        self.seed = seed

        self.dataset = pd.read_csv(csv_file, index_col=0)
        self.dataset = self.dataset[datas_list]
        self.subsample_dataset()

    def subsample_dataset(self):
        """
        Subsample dataset to a given size ratio of the whole dataset.
        Rows are drawn without replacement and re-indexed from 0.
        """
        num_examples = int(self.size * len(self.dataset))
        sampled = self.dataset.sample(num_examples, random_state=self.seed)
        self.dataset = sampled.reset_index(drop=True)

    def __len__(self):
        """Number of examples left after subsampling."""
        return len(self.dataset)

    def __getitem__(self, index):
        """
        Load the example stored at position `index`.

        Returns a tuple (inputs, label):
           - inputs: dict mapping each npy / png column name to the loaded
           array ('depth_map' arrays are transposed)
           - label: value of the `target` key of the json file, or None
           when no json column was kept in the dataset

        Raises IndexError when `index` is out of range and IOError when an
        image cannot be read.
        """
        inputs = {}
        label = None
        dataset_dir = os.path.join('..', 'data', self.dataset_name)
        # Positional access: the frame is re-indexed from 0, and iloc raises
        # IndexError past the end, which is what sequence iteration expects.
        row = self.dataset.iloc[index]
        for data, file_name in row.items():
            # Left / right columns share the folder named without the suffix
            folder = data
            for suffix in ('_right', '_left'):
                if suffix in data:
                    folder = data.replace(suffix, '')
                    break
            file_path = os.path.join(dataset_dir, folder, file_name)
            if 'npy' in file_name:
                array = np.load(file_path)
                inputs[data] = array.T if data == 'depth_map' else array
            elif 'png' in file_name:
                image = cv2.imread(file_path)
                if image is None:
                    # cv2.imread signals a failure by returning None
                    raise IOError('Could not read image: ' + file_path)
                inputs[data] = image
            elif 'json' in file_name:
                with open(file_path) as f:
                    label = json.load(f)[self.target]

        return inputs, label

In [None]:
# csv of file names to load (the name create_csv writes for 'blender_v3').
csv_file = 'blender_v3.csv'
# Dataset folder read under '../data/'.
dataset_name = 'blender_v3'
# Columns of the csv kept in the dataset.
datas_list = ['depth_map', 'mask_right', 'annotations']
# Fraction of the csv rows kept by the random subsampling.
size = 0.9

In [None]:
# Instantiate the dataset; `target` is left to its default ('volume').
train_dataset = VolumeDataset(csv_file=csv_file,
                              dataset_name=dataset_name, 
                              datas_list=datas_list, 
                              size=size)

In [None]:
# Load the first example: a dict of loaded inputs keyed by column name, and
# the label read from the json annotation file.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook.
input, label = train_dataset[0]