In [85]:
import json
import os
import numpy as np
# Locations of the slice metadata and of the three dataset splits.
metadata_file = 'dataset/slices.geojson'
train_folder = 'dataset/splits/train/'
dev_folder = 'dataset/splits/dev/'
test_folder = 'dataset/splits/test/'


def _flatten_feature(feature):
    """Turn one GeoJSON feature into a flat record.

    The record holds every entry of ``feature['properties']`` plus two extra
    keys, ``'lng'`` and ``'lat'``: the mean of the first two coordinates over
    the vertices of the first ring, excluding the last vertex.
    """
    ring = feature['geometry']['coordinates'][0]
    # The last vertex is dropped before averaging.
    # NOTE(review): presumably because a closed GeoJSON ring repeats its first
    # vertex at the end — confirm against the file.
    # The mean is computed once and reused for both keys.
    centroid = np.mean(ring[:-1], axis=0)
    # NOTE(review): the displayed values (e.g. 333795.96 / 3572176.06) look like
    # projected coordinates rather than degrees — confirm the CRS.
    return {**feature['properties'], 'lng': centroid[0], 'lat': centroid[1]}


# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(metadata_file, encoding='utf-8') as f:
    d = json.load(f)

patches = [_flatten_feature(feature) for feature in d['features']]
patches[:2]

[{'img_source': '/datadrive/glaciers/unique_tiles/LE07_149037_20041024.tif',
  'mask_source': '/datadrive/glaciers/processed/masks/mask_00.npy',
  'img_slice': '/datadrive/glaciers/processed/slices/slice_0_img_000.npy',
  'mask_slice': '/datadrive/glaciers/processed/slices/slice_0_mask_000.npy',
  'mask_mean_0': 0.0,
  'mask_mean_1': 0.0,
  'mask_mean_2': 0.0,
  'img_mean': 77.38898468017578,
  'lng': 333795.9649122807,
  'lat': 3572176.06391926},
 {'img_source': '/datadrive/glaciers/unique_tiles/LE07_149037_20041024.tif',
  'mask_source': '/datadrive/glaciers/processed/masks/mask_00.npy',
  'img_slice': '/datadrive/glaciers/processed/slices/slice_0_img_001.npy',
  'mask_slice': '/datadrive/glaciers/processed/slices/slice_0_mask_001.npy',
  'mask_mean_0': 0.0,
  'mask_mean_1': 0.0,
  'mask_mean_2': 0.0,
  'img_mean': 133.6044158935547,
  'lng': 348977.8947368421,
  'lat': 3572176.06391926}]

In [86]:
def list_files(folder):
    """Return the names of all files found under ``folder``, recursively.

    Only bare file names are returned (no directory component), in the order
    ``os.walk`` yields them. A folder that does not exist or holds no files
    gives an empty list.
    """
    return [
        name
        for _root, _dirs, names in os.walk(folder)
        for name in names
    ]

# Gather the file names found in each split folder, in train / dev / test order.
train_splits, dev_splits, test_splits = (
    list_files(split_dir)
    for split_dir in (train_folder, dev_folder, test_folder)
)

test_splits[:3]

['slice_10_mask_174.npy', 'slice_10_mask_175.npy', 'slice_10_mask_187.npy']

In [87]:
import pandas as pd

files_columns = ['img_source','mask_source','img_slice','mask_slice']

# One row per patch, built from the flattened GeoJSON records.
metadata = pd.DataFrame(patches)

# Reduce every path column to its bare file name. Vectorised string ops are
# used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
metadata[files_columns] = metadata[files_columns].apply(
    lambda col: col.str.rsplit('/', n=1).str[-1]
)

# Slice file names look like 'slice_<src>_img_<num>.npy'. Strip the extension
# first, then split on '_', so <num> is taken whole whatever its digit count
# (slicing the first three characters would truncate e.g. '1000' to '100').
name_parts = metadata.img_slice.str.rsplit('.', n=1).str[0].str.split('_')

metadata = (
    metadata
    .assign(
        src=name_parts.str[1],
        slice_num=name_parts.str[-1],
        # Combined mean of mask channels 0 and 1.
        mask_mean=metadata.mask_mean_0 + metadata.mask_mean_1,
    )
    .set_index(['src', 'slice_num'])
    .rename(columns={'mask_mean_2': 'china'})
    .drop(columns=['img_source', 'mask_source', 'mask_mean_0', 'mask_mean_1'])
)

metadata.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,img_slice,mask_slice,china,img_mean,lng,lat,mask_mean
src,slice_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,slice_0_img_000.npy,slice_0_mask_000.npy,0.0,77.388985,333795.964912,3572176.0,0.0
0,1,slice_0_img_001.npy,slice_0_mask_001.npy,0.0,133.604416,348977.894737,3572176.0,0.0
0,2,slice_0_img_002.npy,slice_0_mask_002.npy,0.0,94.324028,364159.824561,3572176.0,0.0
0,3,slice_0_img_003.npy,slice_0_mask_003.npy,0.0,189.766983,379341.754386,3572176.0,0.0
0,4,slice_0_img_004.npy,slice_0_mask_004.npy,0.0,198.229507,394523.684211,3572176.0,0.0


In [89]:
def partition_metadata(metadata, splits, folder=''):
    """Select the metadata rows belonging to one split and prefix their paths.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table with at least the string columns ``img_slice`` and ``mask_slice``.
    splits : list-like of str
        File names belonging to the split; a row is kept when its
        ``img_slice`` value is one of them.
    folder : str, optional
        Directory prepended to ``img_slice`` and ``mask_slice``. A trailing
        separator is optional. The default ``''`` leaves the names unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows; ``metadata`` itself is not modified.
    """
    # Keep only the rows whose image slice is listed in the split.
    res = metadata[metadata.img_slice.isin(splits)].copy()
    # Prepend the split folder to both path columns. os.path.join inserts the
    # separator only when it is missing, so 'dir' and 'dir/' give the same path
    # (plain concatenation would produce 'dirslice_...' for the former).
    for column in ('img_slice', 'mask_slice'):
        res[column] = res[column].map(
            lambda name: os.path.join(folder, name), na_action='ignore'
        )
    return res

# Build one metadata table per split, with slice paths pointing into that
# split's folder.
train_data = partition_metadata(metadata, train_splits, train_folder)
dev_data = partition_metadata(metadata, dev_splits, dev_folder)
test_data = partition_metadata(metadata, test_splits, test_folder)

test_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,img_slice,mask_slice,china,img_mean,lng,lat,mask_mean
src,slice_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,151,dataset/splits/test/slice_1_img_151.npy,dataset/splits/test/slice_1_mask_151.npy,0.0,379.826019,460217.903596,4200787.0,0.521347
2,93,dataset/splits/test/slice_2_img_093.npy,dataset/splits/test/slice_2_mask_093.npy,0.00153,525.524292,211011.702409,3505949.0,0.316036
2,109,dataset/splits/test/slice_2_img_109.npy,dataset/splits/test/slice_2_mask_109.npy,0.020706,510.207916,226193.617384,3521131.0,0.35247
2,157,dataset/splits/test/slice_2_img_157.npy,dataset/splits/test/slice_2_mask_157.npy,0.031021,537.625977,271739.362306,3566677.0,0.411125
3,4,dataset/splits/test/slice_3_img_004.npy,dataset/splits/test/slice_3_mask_004.npy,0.039127,352.693176,654383.836999,3417196.0,0.201778
