In [1]:
import pickle
import gzip
import os.path as pth

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [4]:
# Input locations, given as paths relative to the current working directory.
ROOT_DIR = '../data/original'
TRAIN_IMG_DIR = pth.join(ROOT_DIR, 'train_images')

# Mask dimensions: convert_to_mask allocates HEIGHT * WIDTH elements and
# returns an array of shape (HEIGHT, WIDTH).
HEIGHT, WIDTH = 256, 1600

In [5]:
# Read the label table. The first line of the file is skipped (presumably its
# own header row) and the two columns are given the names used below.
train = pd.read_csv(
    pth.join(ROOT_DIR, 'train.csv'),
    skiprows=1,
    names=['imageid_classid', 'encoded_pixels'],
)
train.head()

Unnamed: 0,imageid_classid,encoded_pixels
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0002cc93b.jpg_2,
2,0002cc93b.jpg_3,
3,0002cc93b.jpg_4,
4,00031f466.jpg_1,


In [6]:
# Split the combined "<image>.jpg_<class>" key into its two parts.
# pandas interprets a split pattern longer than one character as a regular
# expression, so the dot is escaped to match only a literal '.'.
im_cl_df = train.imageid_classid.str.split(r'\.jpg_', expand=True)
train['imageid'] = im_cl_df[0].astype('category')
train['classid'] = im_cl_df[1].astype('int8')

# A row has a mask exactly when its encoded_pixels value is not missing.
train['mask_present'] = train.encoded_pixels.notna()
train.head()

Unnamed: 0,imageid_classid,encoded_pixels,imageid,classid,mask_present
0,0002cc93b.jpg_1,29102 12 29346 24 29602 24 29858 24 30114 24 3...,0002cc93b,1,True
1,0002cc93b.jpg_2,,0002cc93b,2,False
2,0002cc93b.jpg_3,,0002cc93b,3,False
3,0002cc93b.jpg_4,,0002cc93b,4,False
4,00031f466.jpg_1,,00031f466,1,False


In [7]:
def convert_to_mask(encoded_pixels, height=None, width=None):
    """Decode a run-length-encoded string into a 2-D binary mask.

    The string holds whitespace-separated integer pairs ``start length``.
    ``start`` is a 1-based index into the flattened mask and ``length`` is the
    number of consecutive elements to set. The flat buffer is interpreted
    column by column (it is reshaped to ``(width, height)`` and transposed).

    Parameters
    ----------
    encoded_pixels : str
        Run-length string, e.g. ``"29102 12 29346 24"``. An empty or
        whitespace-only string yields an all-zero mask.
    height, width : int, optional
        Mask dimensions. When omitted, the module-level ``HEIGHT`` and
        ``WIDTH`` constants are used.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(height, width)`` holding 1 inside the
        encoded runs and 0 elsewhere.

    Raises
    ------
    ValueError
        If a token is not an integer, or if the number of tokens is odd
        (a start without a matching length).

    Notes
    -----
    Runs that extend past the end of the buffer are clipped by NumPy slice
    semantics rather than reported.
    """
    if height is None:
        height = HEIGHT
    if width is None:
        width = WIDTH

    # split() without an argument tolerates repeated, leading and trailing
    # whitespace; split(' ') would produce empty tokens that int() rejects.
    values = [int(token) for token in encoded_pixels.split()]
    if len(values) % 2:
        raise ValueError(
            'run-length encoding must contain start/length pairs, '
            f'got {len(values)} values'
        )

    rle = np.array(values, dtype='int32').reshape(len(values) // 2, 2)
    rle[:, 0] -= 1  # starts are 1-based in the encoding

    mask = np.zeros(height * width, dtype='uint8')
    for start, length in rle:
        mask[start:start + length] = 1

    # The transpose is a view; copy() materialises a new array with the
    # final (height, width) layout.
    return mask.reshape(width, height).T.copy()

In [8]:
# Decode every available run-length string into a mask, keyed by
# "<imageid>_<classid>".
masks = {}

# itertuples() is a generator with no length, so the row count is passed
# explicitly; without it the progress bar cannot show completion or an ETA.
for t in tqdm(train.itertuples(), total=len(train)):
    if pd.isna(t.encoded_pixels):
        # No encoded pixels for this image/class pair: nothing to decode.
        continue

    mask = convert_to_mask(t.encoded_pixels)
    masks[f'{t.imageid}_{t.classid}'] = mask

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# Persist the decoded masks as a gzip-compressed pickle, written with the
# highest pickle protocol available to the running interpreter.
with gzip.open('../data/train_masks.pickle.gz', mode='wb') as fout:
    pickle.dump(masks, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Remove the raw key and run-length columns before saving the label table.
# Rebinding `train` with errors='ignore' keeps this cell re-runnable: an
# in-place drop would raise KeyError on a second execution because the
# columns are already gone.
train = train.drop(columns=['imageid_classid', 'encoded_pixels'], errors='ignore')
train.to_feather('../data/train.feather')