This code fetches the image cutouts from the DES and HSC archives. It is based heavily on File_Creation.ipynb in the DeepShadows repository: 
https://github.com/dtanoglidis/DeepShadows

In [1]:
# imports
import numpy as np 
import pandas as pd
from urllib.request import urlretrieve
from PIL import Image
from sklearn.utils import shuffle
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

**DES Data for training and testing**

In [None]:
# Reads the master lists of RA and DEC coordinates of the DES LSBGs and artifacts, draws a
# disjoint random subset of each for every dataset, then downloads the corresponding cutouts
# from the DES layer of the Legacy Survey viewer and saves them as NumPy arrays.
# all coordinate files have been copied from DeepShadows/Datasets in the DeepShadows repository

np.random.seed(3991773) # set a seed to pick out consistent subsets of the full dataset when re-running

# names of each dataset
datasets = ['train', 'val', 'test']

# Other variables
# desired number of examples in each dataset: [training, validation, test]. MUST BE EVEN NUMBERS
# note the "master" lists do not quite have 20,000 examples in them, so we'll use slightly smaller
# validation and test sets. 
num_images = [30000, 4990, 4990] 
zoom = 15 # for downloading images
fig_name = "placeholder.jpg" # placeholder name to save images under while constructing the datasets

# read in lists of LSBGs and artifacts
LSBGs = pd.read_csv('Data/random_LSBGs_all.csv')
artifacts = pd.read_csv('Data/random_negative_all_2.csv')

# and extract the ra and dec columns as plain NumPy arrays
LSBG_ra = LSBGs['ra'].values
LSBG_dec = LSBGs['dec'].values
art_ra = artifacts['ra'].values
art_dec = artifacts['dec'].values

# Sanity checks, done up front so that we fail before spending hours downloading.
# Each dataset takes N/2 examples from EACH master list (without replacement, and without
# reuse across datasets), so the limit applies per class rather than to the combined total.
assert all(N % 2 == 0 for N in num_images), "every entry of num_images must be even"
n_per_class = sum(N // 2 for N in num_images) # examples needed from each master list overall
assert n_per_class <= min(len(LSBG_ra), len(art_ra)), \
    "asking for more images per class than exist in the master lists"

# now, we'll download and store the images for each dataset
for i, dataset in enumerate(datasets):
    
    N = num_images[i] # total number of images to download
    n = N // 2 # number from each label to download
    
    # pick out N/2 random LSBGs and artifacts from each master list
    LSBG_idx = np.random.choice(np.arange(len(LSBG_ra)), size=n, replace=False) # indices for LSBGs
    art_idx = np.random.choice(np.arange(len(art_ra)), size=n, replace=False) # indices for artifacts
    
    # concatenate these into ra and dec arrays (LSBGs first, then artifacts)
    ra_list = np.append(LSBG_ra[LSBG_idx], art_ra[art_idx])
    dec_list = np.append(LSBG_dec[LSBG_idx], art_dec[art_idx])
    
    # remove the just-used coordinates from the master list so 
    # we don't repeat images across datasets
    LSBG_ra = np.delete(LSBG_ra, LSBG_idx)
    LSBG_dec = np.delete(LSBG_dec, LSBG_idx)
    art_ra = np.delete(art_ra, art_idx)
    art_dec = np.delete(art_dec, art_idx)
    
    # initialize arrays
    # NOTE: float64 by default, i.e. N*64*64*3*8 bytes (~2.9 GB for N = 30000)
    example_arr = np.zeros([N,64,64,3]) # for examples
    label_arr = np.append(np.ones(n), np.zeros(n)) # for labels, note the first N/2 are LSBGs and the second N/2 are artifacts
    
    # download and store each cutout
    for j in range(N):
        
        # coordinates of the current image
        ra = ra_list[j]
        dec = dec_list[j]
        
        # format the url from which to get the cutout based on the ra and dec
        database_url = "http://legacysurvey.org//viewer/jpeg-cutout?ra={0}&dec={1}&zoom={2}&layer=des-dr1".format(ra, dec, zoom)
        # download the image. Sometimes this times out because my internet isn't the best,
        # so the loop keeps retrying the download until it works so that we don't lose
        # progress if it throws an error.
        # Only Exception subclasses are caught: a bare `except:` would also swallow
        # KeyboardInterrupt and make this loop impossible to stop from the notebook.
        flag = True
        while flag:
            try:
                urlretrieve(database_url, fig_name) # downloads and saves the image under the placeholder name
                flag = False # if it worked, move on
            except Exception as err:
                print('Download failed ({!r}), trying it again'.format(err))
                
        # process image
        # the context manager closes the placeholder file before the next download overwrites it;
        # convert('RGB') guarantees 3 channels so the result always fits the (64, 64, 3) slot
        with Image.open(fig_name) as image:
            small_image = image.convert('RGB').resize((64, 64))
        im_array = np.asarray(small_image)/255. # convert to an RGB array and normalize it between 0 and 1
    
        example_arr[j] = im_array # save the example to the array
        
        # print progress
        clear_output(wait=True)
        print("Fetched image {}/{} of {} set".format(j+1,N,dataset))
    
    # the arrays still have all the LSBGs first, so we'd like to shuffle them.
    # np.random.shuffle works in place and returns None, and indexing with None only adds
    # a new axis, so we draw an explicit permutation of the row indices instead and apply
    # the same one to both the examples and the labels.
    randomize = np.random.permutation(N)
    shuffled_examples = example_arr[randomize]
    shuffled_labels = label_arr[randomize]
    
    # save arrays
    np.save('Data/X_'+dataset, shuffled_examples)
    np.save('Data/y_'+dataset, shuffled_labels)

clear_output(wait=True)
print('Done!')

**HSC SSP Data for transfer learning and testing**

In [2]:
# Reads the master lists of RA and DEC coordinates of the HSC LSBGs and artifacts, draws a
# disjoint random subset of each for every dataset, then downloads the corresponding cutouts
# from the HSC layer of the Legacy Survey viewer and saves them as NumPy arrays.
# all coordinate files have been copied from DeepShadows/Datasets in the DeepShadows repository

np.random.seed(3991773) # set a seed to pick out consistent subsets of the full dataset when re-running

# names of each dataset
datasets = ['train', 'test']

# Other variables
# desired number of examples in each dataset: [training, test]. MUST BE EVEN NUMBERS
# These are equal to what was used in the paper. 
num_images = [320, 960] 
zoom = 15 # for downloading images
fig_name = "placeholder.jpg" # placeholder name to save images under while constructing the datasets

# read in list of artifacts
HSC_artifacts = pd.read_csv('Data/hsc_artifacts.csv')

# and extract the ra and dec columns as plain NumPy arrays
art_ra = HSC_artifacts['ra'].values
art_dec = HSC_artifacts['dec'].values

# for some reason, the lists of LSBG coordinates are given in a different format
# (whitespace-separated text; presumably columns 1 and 2 hold RA and DEC -- verify against the file)
LSBG_ra, LSBG_dec = np.genfromtxt('Data/hsc_LSBGs.dat', usecols=(1,2), unpack=True)

# Sanity checks, done up front so that we fail before starting the downloads.
# Each dataset takes N/2 examples from EACH master list (without replacement, and without
# reuse across datasets), so the limit applies per class rather than to the combined total.
assert all(N % 2 == 0 for N in num_images), "every entry of num_images must be even"
n_per_class = sum(N // 2 for N in num_images) # examples needed from each master list overall
assert n_per_class <= min(len(LSBG_ra), len(art_ra)), \
    "asking for more images per class than exist in the master lists"

# now, we'll download and store the images for each dataset
for i, dataset in enumerate(datasets):
    
    N = num_images[i] # total number of images to download
    n = N // 2 # number from each label to download
    
    # pick out N/2 random LSBGs and artifacts from each master list
    LSBG_idx = np.random.choice(np.arange(len(LSBG_ra)), size=n, replace=False) # indices for LSBGs
    art_idx = np.random.choice(np.arange(len(art_ra)), size=n, replace=False) # indices for artifacts
    
    # concatenate these into ra and dec arrays (LSBGs first, then artifacts)
    ra_list = np.append(LSBG_ra[LSBG_idx], art_ra[art_idx])
    dec_list = np.append(LSBG_dec[LSBG_idx], art_dec[art_idx])
    
    # remove the just-used coordinates from the master list so 
    # we don't repeat images across datasets
    LSBG_ra = np.delete(LSBG_ra, LSBG_idx)
    LSBG_dec = np.delete(LSBG_dec, LSBG_idx)
    art_ra = np.delete(art_ra, art_idx)
    art_dec = np.delete(art_dec, art_idx)
    
    # initialize arrays
    example_arr = np.zeros([N,64,64,3]) # for examples
    label_arr = np.append(np.ones(n), np.zeros(n)) # for labels, note the first N/2 are LSBGs and the second N/2 are artifacts
    
    # download and store each cutout
    for j in range(N):
        
        # coordinates of the current image
        ra = ra_list[j]
        dec = dec_list[j]
        
        # format the url from which to get the cutout based on the ra and dec
        database_url = "https://www.legacysurvey.org//viewer/jpeg-cutout?ra={0}&dec={1}&layer=hsc2&zoom={2}".format(ra, dec, zoom)
        # download the image. Sometimes this times out because my internet isn't the best,
        # so the loop keeps retrying the download until it works so that we don't lose
        # progress if it throws an error.
        # Only Exception subclasses are caught: a bare `except:` would also swallow
        # KeyboardInterrupt and make this loop impossible to stop from the notebook.
        flag = True
        while flag:
            try:
                urlretrieve(database_url, fig_name) # downloads and saves the image under the placeholder name
                flag = False # if it worked, move on
            except Exception as err:
                print('Download failed ({!r}), trying it again'.format(err))
                
        # process image
        # the context manager closes the placeholder file before the next download overwrites it;
        # convert('RGB') guarantees 3 channels so the result always fits the (64, 64, 3) slot
        with Image.open(fig_name) as image:
            small_image = image.convert('RGB').resize((64, 64))
        im_array = np.asarray(small_image)/255. # convert to an RGB array and normalize it between 0 and 1
    
        example_arr[j] = im_array # save the example to the array
        
        # print progress
        clear_output(wait=True)
        print("Fetched image {}/{} of {} set".format(j+1,N,dataset))
    
    # the arrays still have all the LSBGs first, so we'd like to shuffle them.
    # np.random.shuffle works in place and returns None, and indexing with None only adds
    # a new axis, so we draw an explicit permutation of the row indices instead and apply
    # the same one to both the examples and the labels.
    randomize = np.random.permutation(N)
    shuffled_examples = example_arr[randomize]
    shuffled_labels = label_arr[randomize]
    
    # save arrays
    np.save('Data/X_HSC_'+dataset, shuffled_examples)
    np.save('Data/y_HSC_'+dataset, shuffled_labels)

clear_output(wait=True)
print('Done!')

Done!
