In [None]:
# # setup for the colab
# !pip install imageio
# !pip install torch
# !pip install kaggle
# # upload kaggle.json for data downloading
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !mv kaggle.json ~/.kaggle
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c tgs-salt-identification-challenge
# !mkdir -p test
# !mkdir -p train
# !unzip test.zip -d test
# !unzip train.zip -d train

In [None]:
import os
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import imageio
from torch.utils import data

In [None]:
HEIGHT, WIDTH = 101, 101

In [None]:
class TrainDataset(data.Dataset):
    def __init__(self, file_list, root_path):
        self.file_list = file_list
        self.root_path = root_path
        
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, index):
        file_id = self.file_list[index]
        train_path = os.path.join(self.root_path, "train")
        image_path = os.path.join(train_path, "images", file_id + '.png')
        image = np.array(imageio.imread(image_path), dtype=np.uint8)
        
        mask_path = os.path.join(train_path, "masks", file_id + '.png')
        mask = np.array(imageio.imread(mask_path), dtype=np.uint8)
        return image, mask

In [None]:
train_values = pd.read_csv('train.csv')
file_list = list(train_values['id'])

dataset = TrainDataset(file_list, './')

In [None]:
def rle_to_mask(rle_string):
    if isinstance(rle_string, float) and np.isnan(rle_string):
        return np.zeros((HEIGHT, WIDTH)) 
    rle_numbers = [int(num) for num in rle_string.split()] 
    rle_pairs = np.array(rle_numbers).reshape((-1, 2))
    
    mask = np.zeros(HEIGHT*WIDTH)
    for start, length in rle_pairs:
        mask[start-1: start-1+length] = 255
    
    mask = mask.reshape((HEIGHT, WIDTH)).T
        
    return mask


In [None]:
# check if masks correctly oriented
for _ in range(15):
    i = random.randint(0, len(dataset)-1) 
    image, mask = dataset[i]
    f, axarr = plt.subplots(1, 3)
    axarr[0].imshow(image)
    axarr[1].imshow(mask, cmap='gray')
    axarr[2].imshow(rle_to_mask(train_values['rle_mask'][i]), cmap='gray')
    print(i, 'is correct: ', (mask==rle_to_mask(train_values['rle_mask'][i])).all())
    

In [None]:
# what's next?
# make some statistics:
# how salty on average?
# add depths
# what is correlation between depths
# what is usual depths
# salt-concetration correlation?
pixels = float(HEIGHT*WIDTH)
depths = pd.read_csv("depths.csv")

train_values['depths'] = depths['z']

plt.figure(figsize=(6, 6))
plt.hist(train_values['depths'], bins=50)


In [None]:
norm = HEIGHT * WIDTH * 255.0
def salt_concentration(mask):
    return np.sum(mask)/norm

train_values['salt_concentration'] = [salt_concentration(mask) for _, mask in dataset]

In [None]:
depths[:5]
train_values[:5]
train_all_values = train_values.merge(depths, how='left')

In [215]:
print(train_all_values[:5])
print(file_list[:5dd])

           id                                           rle_mask  depths  \
0  575d24d81d                                                NaN     306   
1  a266a2a9df                                          5051 5151     157   
2  75efad62c1  9 93 109 94 210 94 310 95 411 95 511 96 612 96...     305   
3  34e51dba6a  48 54 149 54 251 53 353 52 455 51 557 50 659 4...     503   
4  4875705fb0  1111 1 1212 1 1313 1 1414 1 1514 2 1615 2 1716...     783   

   salt_concentration    z  
0            0.000000  843  
1            0.504950  794  
2            0.993334  468  
3            0.149201  727  
4            0.042839  797  
['575d24d81d', 'a266a2a9df', '75efad62c1', '34e51dba6a', '4875705fb0']
