In [1]:
# rasterio 
from shapely.geometry.polygon import Polygon
import shapely.wkt
from rasterio.mask import mask
import rasterio
import rasterio.features
import rasterio.warp

# other python libraries 
from PIL import Image
from pathlib import Path
import os
import matplotlib.pyplot as plt
import numpy as np
from math import floor
from tqdm.notebook import tqdm
import pandas as pd
import json
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

# Data Preparation Pipeline
### 1. Polygons in images - find where coordinates map to image pixels (GEO-tagged images)
### 2. Extract cropped images from the polygon data (extract the ROI)
### 3. Filter for data size
### 4. Record the image path and the label (damaged / not damaged) for each crop
### 5. Save imgs as png

In [2]:
# Root directory of the disaster dataset; every other path in the notebook hangs off it.
PATH = Path('/data', 'disaster')

In [3]:
# Annotation files found directly under labeled/labels, in sorted order.
p1 = PATH / 'labeled' / 'labels'
annotation_paths = sorted(entry for entry in p1.iterdir() if entry.is_file())

In [4]:
# Image files found directly under labeled/images, in sorted order so they
# can be paired positionally with annotation_paths.
p2 = PATH / 'labeled' / 'images'
img_paths = sorted(entry for entry in p2.iterdir() if entry.is_file())

In [5]:
# sanity check: print the first three (annotation, image) pairs so the
# positional pairing of the two sorted listings can be eyeballed
preview = list(zip(annotation_paths[:3], img_paths[:3]))
for an_path, im_path in preview:
    # annotation path, image path, then a blank separator line
    print(an_path, im_path, '', sep='\n')
num = len(preview)

/data/disaster/labeled/labels/guatemala-volcano_00000003_post_disaster.json
/data/disaster/labeled/images/guatemala-volcano_00000003_post_disaster.tif

/data/disaster/labeled/labels/guatemala-volcano_00000003_pre_disaster.json
/data/disaster/labeled/images/guatemala-volcano_00000003_pre_disaster.tif

/data/disaster/labeled/labels/guatemala-volcano_00000004_post_disaster.json
/data/disaster/labeled/images/guatemala-volcano_00000004_post_disaster.tif



In [6]:
# Crop every labelled building out of its GeoTIFF, save each crop as its own
# PNG, and write (relative path, integer damage label) tables for the full
# set and for a reproducible train/test split.

LABELED_DIR = PATH/'labeled'
CROP_DIR = LABELED_DIR/'cropped_imgs'
# exist_ok makes the cell safely re-runnable without a separate exists() check
CROP_DIR.mkdir(parents=True, exist_ok=True)

# subtype string -> integer class id; buildings with any other subtype
# (missing, 'un-classified', ...) are skipped
LABEL_TO_ID = {'no-damage': 0, 'minor-damage': 1, 'major-damage': 2, 'destroyed': 3}
MIN_CROP_SIZE = 25  # a crop is kept only if height AND width are strictly larger than this
SPLIT_SEED = 42     # fixed seed so the train/test split is identical on every run

# zip() would silently truncate to the shorter listing
if len(annotation_paths) != len(img_paths):
    raise ValueError(
        f'{len(annotation_paths)} annotation files but {len(img_paths)} image files'
    )

img_names = []
labels = []

for an_path, im_path in zip(tqdm(annotation_paths, desc='current img being cropped', colour='green'), img_paths):
    # The pairing is purely positional; fail loudly rather than attach the
    # labels of one scene to the pixels of another.
    if an_path.stem != im_path.stem:
        raise ValueError(f'annotation/image mismatch: {an_path.name} vs {im_path.name}')

    with open(an_path) as f:
        data = json.load(f)

    # Keep (index, feature, label) only for buildings with a usable damage
    # label, so no raster work is done for buildings that would be discarded.
    buildings = [
        (i, feature, feature['properties'].get('subtype'))
        for i, feature in enumerate(data['features']['lng_lat'])
    ]
    buildings = [entry for entry in buildings if entry[2] in LABEL_TO_ID]
    if not buildings:
        continue

    # Open the raster once per image (not once per building); the context
    # manager closes it even if a building raises.
    with rasterio.open(im_path) as src:
        for i, feature, label in buildings:
            polygon = shapely.wkt.loads(str(feature['wkt']))
            # pixels outside the polygon are filled with 0
            masked_image, _transform = mask(src, [polygon], nodata=0)

            # a pixel belongs to the building if any band is non-zero
            rows, cols = np.nonzero(masked_image.any(axis=0))
            if rows.size == 0:
                continue

            # +1 because slice ends are exclusive: without it the last row
            # and column of the building are cut off
            cropped_image = masked_image[:, rows.min():rows.max() + 1, cols.min():cols.max() + 1]
            if cropped_image.shape[1] <= MIN_CROP_SIZE or cropped_image.shape[2] <= MIN_CROP_SIZE:
                continue

            # The building index keeps crops from the same image from
            # overwriting each other; the name is relative to LABELED_DIR so
            # the CSV path and the file on disk agree.
            img_name = f'cropped_imgs/{im_path.stem}_{i}.png'

            # (bands, rows, cols) -> (rows, cols, bands) as PIL expects
            cropped_image = np.uint8(cropped_image).transpose((1, 2, 0))
            Image.fromarray(cropped_image, 'RGB').save(LABELED_DIR/img_name)

            # record the crop only after it has actually been written
            img_names.append(img_name)
            labels.append(LABEL_TO_ID[label])

# save labels in pandas df
df = pd.DataFrame({'path': img_names, 'label': labels})

df_train, df_test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

df.to_csv(LABELED_DIR/'data.csv', index=False)
df_train.to_csv(LABELED_DIR/'train.csv', index=False)
df_test.to_csv(LABELED_DIR/'test.csv', index=False)

current img being cropped:   0%|          | 0/3732 [00:00<?, ?it/s]

In [19]:
# Reload the label table written by the cropping cell.
df = pd.read_csv(PATH / 'labeled' / 'data.csv')

In [20]:
# Show the first five rows of the label table.
df.head(n=5)

Unnamed: 0,path,label
0,cropped_imgs/guatemala-volcano_00000003_post_d...,1
1,cropped_imgs/guatemala-volcano_00000003_post_d...,1
2,cropped_imgs/guatemala-volcano_00000004_post_d...,2
3,cropped_imgs/guatemala-volcano_00000004_post_d...,2
4,cropped_imgs/guatemala-volcano_00000004_post_d...,2


In [21]:
# Distinct label values present in the table, in order of first appearance.
pd.unique(df['label'])

array([1, 2, 3, 0])