# Preprocessing

This notebook crops the large 15 km by 15 km GFL images into tiles of 1024 x 1024 pixels (= 1024 x 1024 meters).

In [10]:
from PIL import Image
from itertools import product
import glob
import os
Image.MAX_IMAGE_PIXELS = None
import numpy as np
import h5py
import torch
import matplotlib.image as pltim

In [2]:
def tile(filename, dir_out_train, dir_out_test, d, *, test_every=28):
    """Cut an image into d x d tiles and save each tile as a JPEG.

    Tiles are numbered in row-major order (left to right, then top to
    bottom), starting at 0. A tile whose index is a multiple of
    ``test_every`` is written to ``dir_out_test``; every other tile is
    written to ``dir_out_train``. Partial tiles along the right and bottom
    edges are dropped. Files are named ``<input stem>_index_<idx>.jpeg``.

    Args:
        filename: Path of the image to tile.
        dir_out_train: Directory that receives the training tiles.
        dir_out_test: Directory that receives the test tiles.
        d: Tile edge length in pixels.
        test_every: Hold out every ``test_every``-th tile for the test set.
            Must be non-zero. Use the same value for images and their masks
            so that tiles with the same index land in the same split.
    """
    name = os.path.splitext(os.path.basename(filename))[0]

    # Create missing output directories up front instead of failing on the
    # first save.
    for out_dir in (dir_out_train, dir_out_test):
        os.makedirs(out_dir, exist_ok=True)

    # The context manager releases the underlying file handle even if a crop
    # or save raises part-way through.
    with Image.open(filename) as img:
        w, h = img.size
        # Row offsets (i) vary slowest, column offsets (j) fastest.
        grid = product(range(0, h - h % d, d), range(0, w - w % d, d))
        for idx, (i, j) in enumerate(grid):
            out_dir = dir_out_test if idx % test_every == 0 else dir_out_train
            out_path = os.path.join(out_dir, f'{name}_index_{idx}.jpeg')
            # PIL crop boxes are (left, upper, right, lower).
            img.crop((j, i, j + d, i + d)).save(out_path, format='jpeg')

In [3]:
def tile_mask(filename, dir_out_train, dir_out_test, d, *, test_every=28):
    """Cut a label mask into d x d tiles, binarize them and save as JPEG.

    The mask is read from the ``'L'`` dataset of an HDF5 file opened with
    h5py and transposed before tiling. Tiles are numbered in row-major order
    starting at 0. A tile whose index is a multiple of ``test_every`` is
    written to ``dir_out_test``; every other tile goes to ``dir_out_train``.
    Partial tiles along the right and bottom edges are dropped. Files are
    named ``<input stem>_index_<idx>.jpeg``.

    Args:
        filename: Path of the HDF5 file holding the mask.
        dir_out_train: Directory that receives the training tiles.
        dir_out_test: Directory that receives the test tiles.
        d: Tile edge length in pixels.
        test_every: Hold out every ``test_every``-th tile for the test set.
            Must be non-zero. Use the same value as for the matching input
            images so that image and mask tiles land in the same split.
    """
    name = os.path.splitext(os.path.basename(filename))[0]

    # Create missing output directories up front instead of failing on the
    # first save.
    for out_dir in (dir_out_train, dir_out_test):
        os.makedirs(out_dir, exist_ok=True)

    # Copy the dataset into memory inside the `with` so the file is closed
    # even if the read fails.
    with h5py.File(filename, 'r') as f:
        data = np.array(f['L']).T

    img = Image.fromarray(data)
    w, h = img.size

    # Row offsets (i) vary slowest, column offsets (j) fastest.
    grid = product(range(0, h - h % d, d), range(0, w - w % d, d))
    for idx, (i, j) in enumerate(grid):
        # PIL crop boxes are (left, upper, right, lower).
        # NOTE(review): dividing by 255 assumes the mask values are on a
        # 0-255 scale -- TODO confirm against the source files.
        crop = np.array(img.crop((j, i, j + d, i + d))) / 255
        # Binarize: anything above the midpoint becomes foreground.
        crop[crop <= 0.5] = 0
        crop[crop > 0.5] = 1
        crop = Image.fromarray(np.uint8(crop * 255))

        out_dir = dir_out_test if idx % test_every == 0 else dir_out_train
        out_path = os.path.join(out_dir, f'{name}_index_{idx}.jpeg')
        # NOTE(review): JPEG is lossy, so pixels read back from these files
        # are not guaranteed to be exactly 0 or 255; re-threshold on load.
        crop.save(out_path, format='jpeg')

## Input images

Oops: I accidentally deleted `x_train` from Sherlock, so it has to be regenerated.

In [40]:
# Directory holding the full-size input images.
dir_in_x = '/scratch/groups/earlew/arlenlex/dl-ice-floes/x/gt/'
# Output directories for the JPEG image tiles (train / test split).
dir_out_x_train = '/scratch/groups/earlew/arlenlex/cs230-dataset/train/x_train'
dir_out_x_test = '/scratch/groups/earlew/arlenlex/cs230-dataset/test/x_test'

In [41]:
# Tile every .tif in dir_in_x into 1024-pixel squares; sorted() makes the
# processing order deterministic.
for filename in sorted(glob.glob(os.path.join(dir_in_x, '*.tif'))):
    tile(filename, dir_out_x_train, dir_out_x_test, 1024)

## Masks

In [4]:
# Directory holding the labeled mask files.
dir_in_y = '/scratch/groups/earlew/arlenlex/dl-ice-floes/y/labeled-matlab-files/'
# Output directories for the JPEG mask tiles (train / test split).
dir_out_y_train = '/scratch/groups/earlew/arlenlex/cs230-dataset/train/y_train'
dir_out_y_test = '/scratch/groups/earlew/arlenlex/cs230-dataset/test/y_test'

In [5]:
# Tile every .mat mask in dir_in_y into 1024-pixel squares; sorted() makes
# the processing order deterministic.
for filename in sorted(glob.glob(os.path.join(dir_in_y, '*.mat'))):
    tile_mask(filename, dir_out_y_train, dir_out_y_test, 1024)

# Test

In [18]:
# Sanity check: open one input image and print its pixel dimensions.
filename = '/scratch/groups/earlew/arlenlex/dl-ice-floes/x/gt/beaufo_19990728_2.tif'
# name/ext are computed the same way as inside the tiling functions; they
# are not used further in this cell.
name, ext = os.path.splitext(filename)
name = os.path.basename(name)
img = Image.open(filename)
w, h = img.size
print("w = ", w, " h =  ", h)

w =  30202  h =   11030


In [17]:
# Sanity check: load one mask, then print its dtype and pixel dimensions.
filename = '/scratch/groups/earlew/arlenlex/dl-ice-floes/y/labeled-matlab-files/beaufo_19990728_2.mat'
# name/ext are computed the same way as inside the tiling functions; they
# are not used further in this cell.
name, ext = os.path.splitext(filename)
name = os.path.basename(name)
f = h5py.File(filename,'r')
data = f.get('L')
# Copy the dataset into memory and transpose it.
# NOTE(review): the transpose presumably undoes MATLAB's column-major
# storage order -- confirm.
data = np.array(data).T 
f.close()
img = Image.fromarray(data)
print(data.dtype)
w, h = img.size
print("w = ", w, " h =  ", h)

float64
w =  30202  h =   11030


# Try as numpy

In [11]:
def tile_npy(filename, dir_out_train, dir_out_test, d, *, test_every=28,
             max_size=(512, 512)):
    """Cut an image into d x d tiles, downscale them and save as ``.npy``.

    Tiles are numbered in row-major order (left to right, then top to
    bottom), starting at 0. A tile whose index is a multiple of
    ``test_every`` is written to ``dir_out_test``; every other tile goes to
    ``dir_out_train``. Partial tiles along the right and bottom edges are
    dropped. Files are named ``<input stem>_index_<idx>.npy`` (``np.save``
    appends the ``.npy`` suffix).

    Args:
        filename: Path of the image to tile.
        dir_out_train: Directory that receives the training tiles.
        dir_out_test: Directory that receives the test tiles.
        d: Tile edge length in pixels, before downscaling.
        test_every: Hold out every ``test_every``-th tile for the test set.
            Must be non-zero. Use the same value for images and their masks
            so that tiles with the same index land in the same split.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``,
            which shrinks each tile in place to fit within it.
    """
    name = os.path.splitext(os.path.basename(filename))[0]

    # Create missing output directories up front instead of failing on the
    # first save.
    for out_dir in (dir_out_train, dir_out_test):
        os.makedirs(out_dir, exist_ok=True)

    # The context manager releases the underlying file handle even if a crop
    # or save raises part-way through.
    with Image.open(filename) as img:
        w, h = img.size
        # Row offsets (i) vary slowest, column offsets (j) fastest.
        grid = product(range(0, h - h % d, d), range(0, w - w % d, d))
        for idx, (i, j) in enumerate(grid):
            # PIL crop boxes are (left, upper, right, lower).
            crop = img.crop((j, i, j + d, i + d))
            crop.thumbnail(max_size)

            out_dir = dir_out_test if idx % test_every == 0 else dir_out_train
            np.save(os.path.join(out_dir, f'{name}_index_{idx}'),
                    np.array(crop))

In [12]:
def tile_mask_npy(filename, dir_out_train, dir_out_test, d, *, test_every=28,
                  max_size=(512, 512)):
    """Cut a label mask into d x d tiles, downscale, binarize, save ``.npy``.

    The mask is read from the ``'L'`` dataset of an HDF5 file opened with
    h5py and transposed before tiling. Tiles are numbered in row-major order
    starting at 0. A tile whose index is a multiple of ``test_every`` is
    written to ``dir_out_test``; every other tile goes to ``dir_out_train``.
    Partial tiles along the right and bottom edges are dropped. Files are
    named ``<input stem>_index_<idx>.npy`` (``np.save`` appends the ``.npy``
    suffix) and hold arrays of 0s and 1s.

    Args:
        filename: Path of the HDF5 file holding the mask.
        dir_out_train: Directory that receives the training tiles.
        dir_out_test: Directory that receives the test tiles.
        d: Tile edge length in pixels, before downscaling.
        test_every: Hold out every ``test_every``-th tile for the test set.
            Must be non-zero. Use the same value as for the matching input
            images so that image and mask tiles land in the same split.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``,
            which shrinks each tile in place to fit within it.
    """
    name = os.path.splitext(os.path.basename(filename))[0]

    # Create missing output directories up front instead of failing on the
    # first save.
    for out_dir in (dir_out_train, dir_out_test):
        os.makedirs(out_dir, exist_ok=True)

    # Copy the dataset into memory inside the `with` so the file is closed
    # even if the read fails.
    with h5py.File(filename, 'r') as f:
        data = np.array(f['L']).T

    img = Image.fromarray(data)
    w, h = img.size

    # Row offsets (i) vary slowest, column offsets (j) fastest.
    grid = product(range(0, h - h % d, d), range(0, w - w % d, d))
    for idx, (i, j) in enumerate(grid):
        # PIL crop boxes are (left, upper, right, lower).
        crop = img.crop((j, i, j + d, i + d))
        crop.thumbnail(max_size)

        # NOTE(review): dividing by 255 assumes the mask values are on a
        # 0-255 scale -- TODO confirm against the source files.
        crop = np.array(crop) / 255
        # Binarize after resizing, so resampled in-between values are
        # snapped back to 0 or 1.
        crop[crop <= 0.5] = 0
        crop[crop > 0.5] = 1

        out_dir = dir_out_test if idx % test_every == 0 else dir_out_train
        np.save(os.path.join(out_dir, f'{name}_index_{idx}'), crop)

# Input images

In [13]:
# Directory holding the full-size input images.
dir_in_x = '/scratch/groups/earlew/arlenlex/dl-ice-floes/x/gt/'
# Output directories for the .npy image tiles (train / test split).
dir_out_x_train = '/scratch/groups/earlew/arlenlex/cs230-dataset/train/x_train_npy'
dir_out_x_test = '/scratch/groups/earlew/arlenlex/cs230-dataset/test/x_test_npy'
# Tile every .tif with 1024-pixel squares; sorted() makes the processing
# order deterministic.
for filename in sorted(glob.glob(os.path.join(dir_in_x, '*.tif'))):
    tile_npy(filename, dir_out_x_train, dir_out_x_test, 1024)

# Masks

In [14]:
# Directory holding the labeled mask files.
dir_in_y = '/scratch/groups/earlew/arlenlex/dl-ice-floes/y/labeled-matlab-files/'
# Output directories for the .npy mask tiles (train / test split).
dir_out_y_train = '/scratch/groups/earlew/arlenlex/cs230-dataset/train/y_train_npy'
dir_out_y_test = '/scratch/groups/earlew/arlenlex/cs230-dataset/test/y_test_npy'
# Tile every .mat mask with 1024-pixel squares; sorted() makes the
# processing order deterministic.
for filename in sorted(glob.glob(os.path.join(dir_in_y, '*.mat'))):
    tile_mask_npy(filename, dir_out_y_train, dir_out_y_test, 1024)