# Download Kaggle API credentials
#### **Note**: This is a one-time step and you don’t need to generate the credentials every time you download the dataset.
- Navigate to your Kaggle profile
- Click the "Account" tab
- Scroll down to the "API" section
- Click "Create New API Token"; a file named `kaggle.json`, which contains your username and API key, will be downloaded
- Move the `kaggle.json` file to the same directory as this notebook


In [None]:
# Install the Kaggle CLI; %pip (rather than !pip) targets the environment of the running kernel.
%pip install -q kaggle
# -p makes this cell safe to re-run: no error if ~/.kaggle already exists.
!mkdir -p ~/.kaggle
# Expects kaggle.json to sit next to this notebook.
!cp kaggle.json ~/.kaggle/
# Keep the API token readable/writable by the current user only.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download balraj98/deepglobe-road-extraction-dataset

In [None]:
# -o overwrites existing files without prompting, so re-running this cell
# does not stall on unzip's interactive "replace?" question.
!unzip -q -o deepglobe-road-extraction-dataset.zip -d deep_globe

In [None]:
# Remove the downloaded archive; its contents were extracted into deep_globe/.
!rm -rf deepglobe-road-extraction-dataset.zip

# Import packages and libraries

In [1]:
import os
import glob
import random
from tqdm import tqdm
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import cv2


# Pickle training/validation data

In [2]:
def load_data(load_dict=None, sat_img_path=None, mask_path=None, shape=128):
    """Load satellite image / mask pairs and resize them to a square.

    Tile ids are derived from the file names found in ``sat_img_path`` (the
    text before the first underscore). For each id, ``<id>_sat.jpg`` is read
    from ``sat_img_path`` and ``<id>_mask.png`` from ``mask_path``. A pair is
    skipped when either file cannot be read; a single warning summarising the
    skipped pairs is issued at the end.

    Parameters
    ----------
    load_dict : dict, optional
        Dictionary with list values under the keys ``'img'`` and ``'mask'``.
        It is extended in place. A fresh dictionary is created when omitted.
    sat_img_path : str
        Directory containing the ``*_sat.jpg`` files.
    mask_path : str, optional
        Directory containing the ``*_mask.png`` files. Defaults to
        ``sat_img_path`` when omitted.
    shape : int
        Side length passed to ``cv2.resize`` for both image and mask.

    Returns
    -------
    dict
        ``load_dict`` with one resized image appended to ``'img'`` and one
        single-channel mask appended to ``'mask'`` per successfully read pair.
    """
    import warnings

    if load_dict is None:
        load_dict = {'img': [], 'mask': []}
    if mask_path is None:
        mask_path = sat_img_path

    # Unique tile ids, sorted so the output order does not depend on the
    # arbitrary order returned by os.listdir.
    tile_ids = sorted({name.split('_')[0] for name in os.listdir(sat_img_path)})

    skipped = []
    # Iterate over the ids themselves instead of assuming exactly two files
    # per tile; that assumption breaks for separate folders or stray files.
    for tile_id in tqdm(tile_ids):
        try:
            img = plt.imread(os.path.join(sat_img_path, tile_id + '_sat.jpg'))
            mask = plt.imread(os.path.join(mask_path, tile_id + '_mask.png'))
        except Exception:
            # Best-effort loading: skip unreadable pairs, but let
            # KeyboardInterrupt / SystemExit propagate.
            skipped.append(tile_id)
            continue

        img = cv2.resize(img, (shape, shape))
        mask = cv2.resize(mask, (shape, shape))

        load_dict['img'].append(img)
        # Keep only the first channel of a multi-channel mask; a mask that is
        # already 2-D is stored as is.
        load_dict['mask'].append(mask[:, :, 0] if mask.ndim == 3 else mask)

    if skipped:
        warnings.warn(
            'load_data: skipped {} unreadable image/mask pair(s), e.g. tile id {!r}'
            .format(len(skipped), skipped[0])
        )

    return load_dict


In [3]:
# Side length handed to the loaders as `shape`; every image is resized to image_size x image_size.
image_size = 256


In [4]:
# Satellite tiles and their masks are read from the same folder.
input_img_paths = target_img_paths = 'deep_globe/train'

# load_data fills the dictionary passed to it and returns that same dictionary.
train_dict = load_data({'img' : [], 'mask' : []}, input_img_paths, target_img_paths, image_size)


100%|██████████| 6226/6226 [05:31<00:00, 18.79it/s]


In [5]:
# Inputs (satellite images) and targets (masks) collected by load_data.
X_train = train_dict['img']
y_train = train_dict['mask']


In [6]:
# Sanity check: shapes of the stacked images and masks (displayed as the cell output).
tuple(np.asarray(part).shape for part in (X_train, y_train))

((6226, 256, 256, 3), (6226, 256, 256))

In [7]:
# Serialize the training images and masks to two separate pickle files in the
# working directory, using the highest pickle protocol available.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open('deep-globe-sat-img-train.pkl', 'wb') as f:
    pickle.dump(X_train, f, pickle.HIGHEST_PROTOCOL)

with open('deep-globe-mask-train.pkl', 'wb') as f:
    pickle.dump(y_train, f, pickle.HIGHEST_PROTOCOL)
    

In [8]:
# Drop the names bound to the training data now that it has been written to disk.
del train_dict, X_train, y_train


# Pickle test data (images from the `test` and `valid` folders)

In [9]:
def load_test(load_dict=None, sat_img_path=None, shape=128):
    """Load every readable image in a directory and resize it to a square.

    Files are visited in sorted name order. A file that cannot be read is
    skipped; a single warning summarising the skipped files is issued at the
    end.

    Parameters
    ----------
    load_dict : dict, optional
        Dictionary with a list under the key ``'img'``. It is extended in
        place, so repeated calls can accumulate several directories. A fresh
        dictionary is created when omitted.
    sat_img_path : str
        Directory whose files are read with ``plt.imread``.
    shape : int
        Side length passed to ``cv2.resize``.

    Returns
    -------
    dict
        ``load_dict`` with one resized image appended to ``'img'`` per
        successfully read file.
    """
    import warnings

    if load_dict is None:
        load_dict = {'img': []}

    skipped = []
    # Sorted so the output order does not depend on the arbitrary order
    # returned by os.listdir.
    for file_name in tqdm(sorted(os.listdir(sat_img_path))):
        try:
            img = plt.imread(os.path.join(sat_img_path, file_name))
        except Exception:
            # Best-effort loading: skip unreadable files, but let
            # KeyboardInterrupt / SystemExit propagate.
            skipped.append(file_name)
            continue

        load_dict['img'].append(cv2.resize(img, (shape, shape)))

    if skipped:
        warnings.warn(
            'load_test: skipped {} unreadable file(s), e.g. {!r}'
            .format(len(skipped), skipped[0])
        )

    return load_dict


In [10]:
test_dict = {'img' : []}

# Images from both folders are accumulated into the same list.
for input_img_paths in ('deep_globe/test', 'deep_globe/valid'):
    test_dict = load_test(test_dict, input_img_paths, image_size)


100%|██████████| 1101/1101 [00:23<00:00, 46.55it/s]
100%|██████████| 1243/1243 [00:27<00:00, 45.95it/s]


In [11]:
# List of resized images collected by load_test.
X_test = test_dict['img']


In [12]:
# Sanity check: shape of the stacked test images (displayed as the cell output).
np.asarray(X_test).shape

(2344, 256, 256, 3)

In [13]:
# Serialize the test images to a pickle file in the working directory, using
# the highest pickle protocol available.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open('deep-globe-sat-img-test.pkl', 'wb') as f:
    pickle.dump(X_test, f, pickle.HIGHEST_PROTOCOL)
    

In [14]:
# Drop the names bound to the test data now that it has been written to disk.
del test_dict, X_test
