In [5]:
cd ..

/Users/jwen/Projects/road_building_extraction


In [6]:
cd src

/Users/jwen/Projects/road_building_extraction/src


In [7]:
import sys
import warnings
warnings.simplefilter("ignore", (UserWarning, FutureWarning))

from matplotlib import pyplot as plt
from torch.utils.data import DataLoader
from skimage import io
from PIL import Image
from tqdm import tqdm
from torchvision import transforms

import pandas as pd
import tensorflow as tf
import glob
import numpy as np
import random
import torch

from utils import data_utils
from utils import augmentation as aug

%matplotlib inline

# Data directory structure
We have pulled the data that we need, but its directory structure is awkward to work with during training and testing. For each dataset, create a .csv that lists the file path of every image together with the subdirectories it sits in (the train/valid/test split and the sat/map folder).

In [107]:
# one csv for each of the main datasets
# Root folder that holds the three dataset directories.
DATA_DIR = '/Users/jwen/Projects/road_building_extraction/data'

# Recursively collect every .tiff below each dataset folder. glob returns its
# matches in arbitrary order, so sort them to keep the row order (and any csv
# written from these frames) identical from run to run.
mass_buildings = sorted(glob.glob(DATA_DIR + '/mass_buildings/**/*.tiff', recursive=True))
mass_roads = sorted(glob.glob(DATA_DIR + '/mass_roads/**/*.tiff', recursive=True))
mass_roads_crop = sorted(glob.glob(DATA_DIR + '/mass_roads_crop/**/*.tiff', recursive=True))

# Name the column at construction time so 'file_path' exists even when a glob
# matched nothing (renaming column 0 of an empty frame is a silent no-op).
mass_buildings_df = pd.DataFrame({'file_path': mass_buildings})
mass_roads_df = pd.DataFrame({'file_path': mass_roads})
mass_roads_crop_df = pd.DataFrame({'file_path': mass_roads_crop})

In [108]:
# Derive the descriptive columns from each file path. All three datasets share
# the layout <dataset>/<train|valid|test>/<sat|map|...>/<file name>.
def _path_part(position):
    """Return a function that extracts one '/'-separated component of a path."""
    return lambda path: path.split('/')[position]


for paths_df in (mass_buildings_df, mass_roads_df, mass_roads_crop_df):
    file_paths = paths_df['file_path']
    file_names = file_paths.apply(_path_part(-1))

    paths_df['sat_img_path'] = file_names
    # drop the last character of the file name (e.g. 'x.tiff' -> 'x.tif')
    paths_df['map_img_path'] = file_names.apply(lambda name: name[:-1])
    paths_df['sat_map'] = file_paths.apply(_path_part(-2))
    paths_df['train_valid_test'] = file_paths.apply(_path_part(-3))

In [112]:
# create one csv file per dataset with the file paths to the data
# (uncomment to regenerate; later cells read these csv files from disk)
# mass_buildings_df.to_csv('/Users/jwen/Projects/road_building_extraction/data/mass_buildings/mass_buildings.csv', index=False)
# mass_roads_df.to_csv('/Users/jwen/Projects/road_building_extraction/data/mass_roads/mass_roads.csv', index=False)
# mass_roads_crop_df.to_csv('/Users/jwen/Projects/road_building_extraction/data/mass_roads_crop/mass_roads_crop.csv', index=False)

In [104]:
# number of rows (image files) indexed for the new cropped dataset
mass_roads_crop_df.shape[0]

13861

# Testing Data Loading

In [7]:
# Load the data with transformations.
#
# Alternative setup on the uncropped mass_roads dataset, kept for reference:
# mass_dataset_train = MassRoadBuildingDataset('/Users/jwen/Projects/road_building_extraction/data/mass_roads/mass_roads.csv','mass_roads','train',
#                                        transform=transforms.Compose([RescaleTarget(268), RandomCropTarget(238), ToTensorTarget()]))
# mass_dataset_val = MassRoadBuildingDataset('/Users/jwen/Projects/road_building_extraction/data/mass_roads/mass_roads.csv','mass_roads','valid',
#                                        transform=transforms.Compose([RescaleTarget(268), ToTensorTarget()]))
#
# Further transforms that can be added to the Compose list:
# RandomRotationTarget(45,resize=True), NormalizeTarget(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

data_set = 'mass_roads_crop'
data_path = '/Users/jwen/Projects/road_building_extraction/data/mass_roads_crop/mass_roads_crop.csv'

train_transform = transforms.Compose([aug.ToTensorTarget()])
mass_dataset_train = data_utils.MassRoadBuildingDataset(
    data_path, data_set, 'train', transform=train_transform)

# NOTE(review): unlike the training split, no transform is passed here --
# confirm that the validation samples are meant to stay unconverted.
mass_dataset_val = data_utils.MassRoadBuildingDataset(data_path, data_set, 'valid')

In [8]:
# Smoke test of the loading pipeline: build one loader per split and pull a
# single batch from each.
dataloader = DataLoader(dataset=mass_dataset_train, batch_size=3, num_workers=4)
dataloader_valid = DataLoader(dataset=mass_dataset_val, batch_size=6, num_workers=4)

data_batch = next(iter(dataloader))
data_batch_valid = next(iter(dataloader_valid))

# Crop Images

In [62]:
def img_crop_coordinates(img, output_size):
    """Pick the top-left corner of a random square crop that fits inside ``img``.

    Parameters
    ----------
    img : object exposing ``.shape``
        Image indexed rows-first: ``shape[0]`` is treated as the height and
        ``shape[1]`` as the width. Any further axes (e.g. channels) are
        ignored, so 2-D single-channel images are accepted as well.
    output_size : int
        Side length, in pixels, of the square crop.

    Returns
    -------
    tuple of int
        ``(i, j)``: the row offset and the column offset, chosen so that
        ``img[i:i + output_size, j:j + output_size]`` lies entirely inside
        the image.

    Raises
    ------
    ValueError
        If the image is smaller than ``output_size`` along either axis.
    """
    # The first axis is the row count (height), the second the column count
    # (width); only these two matter for placing the crop.
    h, w = img.shape[:2]
    th, tw = (output_size, output_size)

    if h < th or w < tw:
        raise ValueError(
            "cannot take a {}x{} crop from a {}x{} image".format(th, tw, h, w))
    if w == tw and h == th:
        return 0, 0

    # randint is inclusive on both ends, so the largest offset still leaves
    # room for a full crop.
    i = random.randint(0, h - th)
    j = random.randint(0, w - tw)

    return i, j

def img_crop(csv_df, output_size, num_crops, min_road_fraction=0.03,
             road_crop_share=0.6, max_tries=250):
    """Write ``num_crops`` random square crops of every satellite/map image pair.

    For each row of ``csv_df`` whose ``sat_map`` value is not ``'missing'``,
    the satellite image at ``file_path`` and the map image with the name
    ``map_img_path`` in the sibling ``map`` folder are read, cropped at the
    same random positions and saved as::

        <data root>/mass_roads_crop/<train_valid_test>/sat/<n>_<sat_img_path>
        <data root>/mass_roads_crop/<train_valid_test>/map/<n>_<map_img_path>

    where ``<data root>`` is the directory four levels above ``file_path`` and
    ``<n>`` is the crop index. The output folders are not created here.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Needs the columns ``file_path``, ``sat_img_path``, ``map_img_path``,
        ``sat_map`` and ``train_valid_test``. Columns are read by name, so
        their order in the csv does not matter.
    output_size : int
        Side length, in pixels, of the square crops.
    num_crops : int
        Number of crops written per image pair.
    min_road_fraction : float, optional
        Minimum road share a road-rich crop should reach.
    road_crop_share : float, optional
        Share of the crops of each image (the first ones) that are re-drawn
        until they reach ``min_road_fraction`` (deals with class imbalance).
    max_tries : int, optional
        Maximum number of re-draws for one crop; the last draw is kept when
        the budget is used up.
    """
    # filter out the images with missing data
    filtered_csv_df = csv_df[csv_df['sat_map'] != 'missing']

    # The first `num_road_crops` crops of every image must be road-rich.
    num_road_crops = int(road_crop_share * num_crops)
    # Pixel sum of a crop consisting only of road pixels.
    # NOTE(review): assumes a single-channel map with road pixels == 255 -- confirm.
    full_road_sum = output_size**2 * 255

    for row in tqdm(filtered_csv_df.itertuples(), total=len(filtered_csv_df)):
        path_parts = row.file_path.split('/')

        sat_img = io.imread(row.file_path)
        map_img = io.imread('/'.join(path_parts[:-2]) + '/map/' + row.map_img_path)

        # Named fields instead of positions: with the columns in the order
        # file_path, sat_img_path, map_img_path, sat_map, train_valid_test the
        # fourth data field is `sat_map`, not the split name.
        crop_root = '/'.join(path_parts[:-4]) + '/mass_roads_crop/' + row.train_valid_test

        for num_crop in range(num_crops):
            i, j = img_crop_coordinates(sat_img, output_size)
            cropped_map_img = map_img[i:i+output_size, j:j+output_size]

            # have 60% of the crops have more than 3% roads in the image
            # (deal with class imbalance)
            if num_crop < num_road_crops:
                # The retry budget is per crop, and every retry draws a new
                # position; re-slicing the same window could never succeed.
                tries = 0
                while (np.sum(cropped_map_img) / full_road_sum < min_road_fraction
                       and tries < max_tries):
                    i, j = img_crop_coordinates(sat_img, output_size)
                    cropped_map_img = map_img[i:i+output_size, j:j+output_size]
                    tries += 1

            # Cut the satellite image at the position finally chosen for the map.
            cropped_sat_img = sat_img[i:i+output_size, j:j+output_size]

            final_sat_img = Image.fromarray(cropped_sat_img)
            final_map_img = Image.fromarray(cropped_map_img)

            final_sat_img.save(crop_root + '/sat/' + "{}_".format(str(num_crop)) + row.sat_img_path)
            final_map_img.save(crop_root + '/map/' + "{}_".format(str(num_crop)) + row.map_img_path)

In [27]:
# csv index of the uncropped mass_roads images; it is the input of the cropping step
img_csv_df_path = '/Users/jwen/Projects/road_building_extraction/data/mass_roads/mass_roads.csv'

In [63]:
# read the image index that drives the cropping
img_df = pd.read_csv(filepath_or_buffer=img_csv_df_path)

In [None]:
# The crop positions are random: seed the generator so that re-running this
# cell writes the same crops again.
random.seed(42)

# 15 crops of 256x256 px for every source image
img_crop(img_df, output_size=256, num_crops=15)