# Download Kaggle API credentials
#### **Note**: This is a one-time step and you don’t need to generate the credentials every time you download the dataset.
- Navigate to your Kaggle profile
- Click the "Account" tab
- Scroll down to the "API" section
- Click "Create New API Token"; a file named `kaggle.json`, which contains your username and API key, will be downloaded
- Move the `kaggle.json` file to the same directory as this notebook


In [1]:
# %pip installs into the environment of the running kernel.
%pip install -q kaggle
# -p keeps this cell re-runnable when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the API key readable and writable by the current user only.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download balraj98/deepglobe-road-extraction-dataset

Downloading deepglobe-road-extraction-dataset.zip to /Users/Hannah/Code/Metis/projects/street_network_deep_learning
100%|██████████████████████████████████████| 3.79G/3.79G [01:33<00:00, 45.3MB/s]
100%|██████████████████████████████████████| 3.79G/3.79G [01:33<00:00, 43.5MB/s]


In [2]:
# Extract the downloaded archive quietly (-q) into the deep_globe/ folder.
!unzip -q deepglobe-road-extraction-dataset.zip -d deep_globe

In [3]:
# Delete the archive; the previous cell extracted it into deep_globe/.
!rm -rf deepglobe-road-extraction-dataset.zip

# Import packages and libraries

In [4]:
import os
import glob
import random
from tqdm import tqdm
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import cv2


# Pickle training/validation data

In [5]:
def load_data(load_dict=None, sat_img_path=None, mask_path=None, shape=128):
    """Load satellite images and their masks, resized to a square.

    Sample ids are taken from the part of each file name in ``sat_img_path``
    that precedes the first underscore. For every id the files
    ``<id>_sat.jpg`` (in ``sat_img_path``) and ``<id>_mask.png`` (in
    ``mask_path``) are read; ids for which either read fails are skipped.

    Args:
        load_dict: Dict with list-valued ``'img'`` and ``'mask'`` keys that is
            appended to in place. A fresh dict is created when ``None``.
        sat_img_path: Directory listed to discover sample ids and holding the
            ``*_sat.jpg`` files.
        mask_path: Directory holding the ``*_mask.png`` files.
        shape: Side length, in pixels, that images and masks are resized to.

    Returns:
        ``load_dict`` with one resized image and the first channel of its
        resized mask appended per successfully read pair.
    """
    if load_dict is None:
        load_dict = {'img': [], 'mask': []}

    # dict.fromkeys removes duplicate ids in O(n) while keeping listing order.
    sample_ids = list(dict.fromkeys(
        name.split('_')[0] for name in os.listdir(sat_img_path)
    ))

    # Iterate over every unique id rather than len(listing) // 2, which only
    # matched the sample count when images and masks shared one directory.
    for sample_id in tqdm(sample_ids):
        try:
            img = plt.imread(os.path.join(sat_img_path, sample_id + '_sat.jpg'))
            mask = plt.imread(os.path.join(mask_path, sample_id + '_mask.png'))
        except Exception:
            # Best effort: skip ids whose image or mask cannot be read.
            # Unlike a bare ``except`` this still lets Ctrl-C stop the loop.
            continue

        img = cv2.resize(img, (shape, shape))
        mask = cv2.resize(mask, (shape, shape))

        load_dict['img'].append(img)
        # Keep a single channel; tolerate masks that are already 2-D.
        load_dict['mask'].append(mask[:, :, 0] if mask.ndim == 3 else mask)

    return load_dict


In [6]:
# Side length (pixels) passed as `shape` to load_data / load_test.
image_size = 256


In [7]:
# Satellite images and their masks sit side by side in the train folder,
# so the same directory is used for both inputs and targets.
input_img_paths = 'deep_globe/train'
target_img_paths = 'deep_globe/train'

train_dict = load_data(
    load_dict={'img': [], 'mask': []},
    sat_img_path=input_img_paths,
    mask_path=target_img_paths,
    shape=image_size,
)


100%|██████████| 6226/6226 [08:49<00:00, 11.76it/s]


In [8]:
# Split the loaded dict into model inputs and targets.
X_train = train_dict['img']
y_train = train_dict['mask']


In [9]:
# Stack each list into an array and display the resulting shapes.
tuple(np.array(samples).shape for samples in (X_train, y_train))

((6226, 256, 256, 3), (6226, 256, 256))

In [10]:
# Write the training images and masks to separate pickle files.
for pkl_name, payload in (
    ('deep-globe-sat-img-train.pkl', X_train),
    ('deep-globe-mask-train.pkl', y_train),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    

# Pickle test data

In [16]:
def load_test(load_dict=None, sat_img_path=None, shape=128):
    """Load every readable image in a directory, resized to a square.

    Args:
        load_dict: Dict with a list-valued ``'img'`` key that is appended to
            in place. A fresh dict is created when ``None``.
        sat_img_path: Directory whose entries are read as images.
        shape: Side length, in pixels, that each image is resized to.

    Returns:
        ``load_dict`` with one resized image appended per entry that could be
        read; entries that fail to load are skipped.
    """
    if load_dict is None:
        load_dict = {'img': []}

    for img_name in tqdm(os.listdir(sat_img_path)):
        try:
            img = plt.imread(os.path.join(sat_img_path, img_name))
        except Exception:
            # Best effort: skip entries that cannot be read as an image.
            # Unlike a bare ``except`` this still lets Ctrl-C stop the loop.
            continue

        load_dict['img'].append(cv2.resize(img, (shape, shape)))

    return load_dict


In [17]:
# The test and valid folders are both loaded into a single image list.
test_dict = {'img': []}

for input_img_paths in ('deep_globe/test', 'deep_globe/valid'):
    test_dict = load_test(test_dict, input_img_paths, image_size)


100%|██████████| 1101/1101 [00:59<00:00, 18.61it/s]
100%|██████████| 1243/1243 [01:07<00:00, 18.51it/s]


In [18]:
# Resized images gathered from deep_globe/test and deep_globe/valid.
X_test = test_dict['img']


In [19]:
# Stack each list into an array and display the resulting shapes.
tuple(np.array(samples).shape for samples in (X_train, X_test))

((6226, 256, 256, 3), (2344, 256, 256, 3))

In [20]:
# Write the combined test/valid images to a pickle file.
with open('deep-globe-sat-img-test.pkl', 'wb') as test_file:
    pickle.dump(X_test, test_file, protocol=pickle.HIGHEST_PROTOCOL)
    