planet_download_script.py --- Last updated 12/24/21
Contact: James Sayre, sayrejay@gmail.com

Written to download images from the Planet API.

Inputs:
- Within each subfolder "STATECODE/MUNCODE/CYCLE_NAME_YEAR/", looks for a shapefile called "search_results_CYCLE_NAME_YEAR.shp", which lists the relevant satellite images for a given municipality and crop cycle.

- "data/muncodes/shp/MUNICIPIOS.shp" -- INEGI provided shapefile of municipality shapes, municipality codes + names


Outputs:
Downloads of each PlanetScope asset, saved within "STATECODE/Raw/ASSET_TYPE/". For more information on asset types, see https://developers.planet.com/docs/data/psscene4band/

In [20]:
from planet import api
import os
import json
import requests as re
from requests.auth import HTTPBasicAuth
import time
import geopandas as gpd
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from packaging import version
from functools import partial
import multiprocessing
import wget
import itertools

### Directories
# os.path functions never expand "~" on their own, so resolve the home
# directory explicitly. Without this, every path below would point at a
# literal directory named "~" inside the current working directory.
# expanduser() leaves absolute paths untouched, so base_dir can still be
# replaced with any explicit location.
base_dir = os.path.expanduser("~/")
# The trailing "/" on these two is relied on by later string concatenation.
remote_sen_dir = os.path.join(base_dir, "Remote Sensing", "Input/")
image_dir = os.path.join(base_dir, "Remote Sensing", "Images/")

data_dir = os.path.join(base_dir, "data")
intermediate_dir = os.path.join(base_dir, "Intermediates")

### Inputs
mun_shp = os.path.join(data_dir, "muncodes", "shp", "MUNICIPIOS.shp")  ### Region level shapefile
search_region_satelite_imgs = os.path.join(intermediate_dir, "search_regions_img_corr.csv")

### Params
eng_or_esp = "eng"  ### "eng" -> month/day/year dates, anything else -> day/month/year

### Planet API key, read from the PL_API_KEY_James environment variable.
### Raises KeyError at import time if the variable is not set.
PLANET_API_KEY = os.environ['PL_API_KEY_James']


In [21]:
### Programs
def return_date_id(image_id):
    """Format the date stamp at the start of an image id.

    The text before the first underscore of ``image_id`` is read as
    ``YYYYMMDD``. The pieces are joined with "/" as month/day/year when
    the module-level ``eng_or_esp`` is "eng", and as day/month/year
    otherwise.
    """
    stamp = image_id.split('_')[0]
    year = stamp[:4]
    month = stamp[4:6]
    day = stamp[6:]
    if eng_or_esp == "eng":
        ordered = (month, day, year)
    else:
        ordered = (day, month, year)
    return '/'.join(ordered)

def re_get_check(url, api_key=PLANET_API_KEY, max_retries=None, wait_seconds=5):
    """GET ``url`` with HTTP basic auth, retrying through connection snags.

    Note that ``re`` in this file is the ``requests`` package (imported as
    ``re``), not the standard-library regex module.

    The request is repeated while the response status is 404 or 429, or
    while the request itself fails with a connection error or timeout.
    NOTE(review): 404 is retried as in the original code -- presumably the
    API can return transient 404s; confirm, because a permanently missing
    item retries forever unless ``max_retries`` is set.

    Parameters
    ----------
    url : str
        Address to request.
    api_key : str
        Used as the basic-auth user name, with an empty password.
    max_retries : int or None
        Number of retries allowed after the first attempt. ``None``
        (the default) retries without limit, as the original did.
    wait_seconds : float
        Pause between attempts.

    Returns
    -------
    The ``requests`` response once its status is neither 404 nor 429, or
    ``None`` if ``max_retries`` was exhausted first.
    """
    retries_used = 0
    while True:
        try:
            result = re.get(url, auth=HTTPBasicAuth(api_key, ''))
        except (re.exceptions.ConnectionError, re.exceptions.Timeout):
            # A dropped or timed-out connection is exactly the kind of snag
            # this helper exists to ride out, so treat it like a retryable
            # status instead of letting it propagate.
            result = None

        if result is not None and result.status_code not in (404, 429):
            return result

        retries_used += 1
        if max_retries is not None and retries_used > max_retries:
            return None
        time.sleep(wait_seconds)
    
def check_id_assets(image_id, item_type="PSScene4Band"):
    """List the asset type names the API reports for one image.

    Requests the item's ``/assets`` endpoint and returns the top-level keys
    of the JSON response as a list. This is a best-effort lookup: any
    ordinary failure (request error, missing or non-JSON body, unexpected
    JSON shape) yields an empty list.
    """
    id_url = 'https://api.planet.com/data/v1/item-types/{}/items/{}/assets'.format(item_type, image_id)
    try:
        asset = re_get_check(id_url)
        # Keys of the response are the asset types available for this image.
        return list(asset.json().keys())
    except Exception:
        # Deliberately broad, but no longer a bare ``except:`` -- that would
        # also swallow KeyboardInterrupt/SystemExit and make a long download
        # run impossible to interrupt.
        return []

def return_image_file_path(chosen_mun,ciclo,image_id,asset_type='analytic_sr',
                          create_path=True):
    """Build the local file path for one asset of one image.

    The path is ``image_dir + STATE/ciclo/asset_type/ + file name``, where
    STATE is ``chosen_mun`` (as an integer string) without its last three
    digits.

    File names by asset type:
      * ``analytic_sr``  -> ``<image_id>_sr.tif``
      * ``analytic``     -> ``<image_id>_a.tif``
      * ``analytic_xml`` -> ``<image_id>.xml``
      * anything else    -> ``<image_id>_<asset_type>.tif``

    Parameters
    ----------
    chosen_mun : int or str
        Municipality code; anything ``int()`` accepts.
    ciclo : str
        Crop-cycle folder name.
    image_id : str
        Image identifier used as the file stem.
    asset_type : str
        Asset type, also used as the innermost folder name.
    create_path : bool
        When true, create the target folder if it is missing.

    Returns
    -------
    str
        Full path of the output file (the file itself is not created).
    """
    # Slice off the trailing three digits. The previous str.replace() call
    # removed *every* occurrence of those digits, so a code such as 20202
    # produced "02" instead of "20".
    state_code = str(int(chosen_mun))[:-3]

    fl_dr = image_dir + state_code + "/" + ciclo + "/" + asset_type + "/"
    if create_path:
        # exist_ok avoids a FileExistsError when two pool workers try to
        # create the same folder at the same moment.
        os.makedirs(fl_dr, exist_ok=True)

    if asset_type == "analytic_xml":
        return fl_dr + image_id + ".xml"

    known_suffixes = {"analytic_sr": "_sr", "analytic": "_a"}
    asset_abbrev = known_suffixes.get(asset_type, "_" + asset_type)
    return fl_dr + image_id + asset_abbrev + ".tif"

def activate_image_id(image_id,chosen_mun, ciclo, item_type="PSScene4Band", asset_type='analytic_sr'):
    """Ask the API to activate one asset of an image, if it is not on disk yet.

    Nothing is requested when the output file for this asset already exists.
    Otherwise the item's asset metadata is fetched, its HTTP status code is
    printed, and the asset's activation link is requested unless the asset
    status is already "active".

    This is best-effort: any ordinary failure (missing asset type in the
    response, bad JSON, request error) is reported with a printed message
    and otherwise ignored. Returns ``None`` in all cases.

    Side effect: resolving the output path creates the destination folder.
    """
    id_url = 'https://api.planet.com/data/v1/item-types/{}/items/{}/assets'.format(item_type, image_id)
    output_fl = return_image_file_path(chosen_mun,ciclo,image_id,asset_type)

    if os.path.isfile(output_fl):
        return

    try:
        # JSON metadata for every asset of this image.
        asset = re_get_check(id_url)
        if asset is None:
            return

        print(asset.status_code)
        # Parse the body once and keep only the entry for the requested asset.
        asset_info = asset.json()[asset_type]

        if asset_info['status'] != "active":
            # Requesting the activation link starts activation.
            re_get_check(asset_info["_links"]["activate"])
    except Exception:
        # Broad on purpose, but not a bare ``except:`` so that Ctrl-C and
        # interpreter shutdown still propagate.
        print("Couldn't activate image id:", image_id)
            
def download_image_id(image_id,chosen_mun, ciclo, item_type="PSScene4Band", asset_type='analytic_sr'):
    """Download one asset of an image to its local path.

    Steps: skip if the file is already on disk; fetch the item's asset
    metadata; give up if the asset is not yet "active"; otherwise request
    the asset's ``_self`` link and download the ``location`` it reports.

    Returns
    -------
    bool
        ``True`` if the file was already present or was downloaded.
        ``False`` if the asset is not active yet, the metadata or download
        step failed, the file did not appear on disk, or the resulting file
        was smaller than 120 bytes (it is deleted in that case --
        presumably an error payload rather than imagery; confirm).

    Side effect: resolving the output path creates the destination folder.
    """
    id_url = 'https://api.planet.com/data/v1/item-types/{}/items/{}/assets'.format(item_type, image_id)
    output_fl = return_image_file_path(chosen_mun,ciclo,image_id,asset_type,create_path=True)

    if os.path.isfile(output_fl):
        print("Already downloaded "+asset_type+" asset")
        return True

    # JSON metadata for every asset of this image.
    asset = re_get_check(id_url, PLANET_API_KEY)
    if asset is None:
        print("Couldn't download:", image_id)
        return False

    try:
        asset_info = asset.json()[asset_type]
        if asset_info['status'] != "active":
            return False

        # The asset's own link reports where the activated file can be fetched.
        asset_detail = re_get_check(asset_info["_links"]["_self"], PLANET_API_KEY)
        wget.download(asset_detail.json()["location"], output_fl)
    except (AttributeError, KeyError, TypeError, ValueError, OSError):
        # Unexpected response shape, undecodable JSON, a missing response,
        # or a failed transfer (urllib errors derive from OSError). Report
        # failure so the caller can retry, rather than raising and aborting
        # every other download in the worker.
        print("Couldn't download:", image_id)
        return False

    if not os.path.isfile(output_fl):
        # Previously this path fell through and returned None, which the
        # caller's ``== False`` check did not count as a failure.
        return False

    if os.stat(output_fl).st_size < 120:
        os.remove(output_fl)
        return False

    print("Downloaded file successfully at ", output_fl)
    return True

# Secondary assets, in order of preference. Only the first one available for
# an image is activated/downloaded alongside the surface-reflectance image.
# NOTE(review): this mirrors the original if/elif chain, so 'analytic_xml' is
# only used when neither cloud map exists -- confirm that is intended.
_SECONDARY_ASSET_TYPES = ('udm2', 'udm', 'analytic_xml')


def _activate_assets_for(entry, position, show_id=False):
    """Request activation of the assets for one (image_id, ciclo, muncode) entry."""
    img_id, cic, mun = entry
    if os.path.isfile(return_image_file_path(mun, cic, img_id)):
        # The surface-reflectance file is already on disk; nothing to queue.
        return

    print("Now activating asset: ", position)
    if show_id:
        print(img_id)

    asset_types = check_id_assets(img_id)
    if 'analytic_sr' in asset_types:
        activate_image_id(img_id, chosen_mun=mun, ciclo=cic, asset_type='analytic_sr')
    for secondary in _SECONDARY_ASSET_TYPES:
        if secondary in asset_types:
            activate_image_id(img_id, chosen_mun=mun, ciclo=cic, asset_type=secondary)
            break


def download_all_in_list(to_download_list,activate=True,queue_ahead=50):
    """Download every image in a list, activating upcoming images ahead of time.

    Parameters
    ----------
    to_download_list : sequence
        Entries that unpack as ``(image_id, ciclo, muncode)``.
    activate : bool
        When true, activation is requested for the first ``queue_ahead``
        entries before the first download, and then for the entry
        ``queue_ahead`` positions ahead of each one being downloaded.
    queue_ahead : int
        How far ahead of the current download activation is requested.

    Returns
    -------
    list
        The entries for which at least one attempted download did not
        succeed. Each entry appears at most once.

    Notes
    -----
    An entry is skipped entirely when its surface-reflectance file is
    already on disk, and nothing is downloaded (or reported as failed) for
    an image whose asset list does not contain 'analytic_sr'.
    """
    undownloaded_parts = []
    total = len(to_download_list)

    for i, download_part in enumerate(to_download_list):
        ### Start by activating future images to download
        if activate:
            if i == 0:
                for j in range(min(queue_ahead, total)):
                    _activate_assets_for(to_download_list[j], j, show_id=True)
            if i + queue_ahead < total:
                _activate_assets_for(to_download_list[i + queue_ahead], i + queue_ahead)

        ### Now download this image
        image_id, ciclo, muncode = download_part
        if os.path.isfile(return_image_file_path(muncode, ciclo, image_id)):
            continue

        print("Now downloading asset: ", i)
        asset_types = check_id_assets(image_id)
        if 'analytic_sr' not in asset_types:
            continue

        statuses = [download_image_id(image_id, chosen_mun=muncode, ciclo=ciclo, asset_type='analytic_sr')]
        for secondary in _SECONDARY_ASSET_TYPES:
            if secondary in asset_types:
                statuses.append(
                    download_image_id(image_id, chosen_mun=muncode, ciclo=ciclo, asset_type=secondary))
                break

        # Record the entry once, however many of its assets failed. It used
        # to be appended once per failed asset, which duplicated retries and
        # inflated the final count. Any non-True status counts as a failure.
        if not all(statuses):
            undownloaded_parts.append(download_part)

    return undownloaded_parts



In [None]:
### Load the municipality shapefile and reproject it to lat/lon (EPSG:4326);
### another projection could be substituted here if needed
df = gpd.read_file(mun_shp).to_crs(epsg=4326)

### Integer municipality code: CVE_ENT and CVE_MUN concatenated, then cast
df['muncode'] = (df['CVE_ENT'] + df['CVE_MUN']).astype(int)

### Area is computed after the reprojection above, i.e. in EPSG:4326 units
df['area'] = df.geometry.area
df = df.sort_values('muncode')

In [23]:
target_muns =  [2001, 2002, 2003, 2004, 2005] ### Baja California
bid_images = ''
mun_selected_df = df[df['muncode'].isin(target_muns)]
img_fls_dled = []  ### ids of images whose "_sr.tif" file is already on disk
ciclos = ['summer_', 'winter_', 'spring_']

### Search results are collected here and concatenated once at the end.
### DataFrame.append (used previously) was removed in pandas 2.0 and copies
### the whole frame on every call.
search_frames = []

for muncode in target_muns:
    ### State code = municipality code without its last three digits.
    ### Slicing replaces str.replace(), which removed every occurrence of
    ### those digits (e.g. 20202 became "02" instead of "20").
    state_code = str(muncode)[:-3]+"/"

    ### Record every surface-reflectance image already downloaded for this state
    fl_dr = image_dir+state_code
    if os.path.exists(fl_dr):
        for r,d,f in os.walk(fl_dr):
            for file in f:
                if '_sr.tif' in file:
                    dled_id = file.split('_sr.tif')[0]
                    if dled_id not in img_fls_dled:
                        img_fls_dled.append(dled_id)

    for yr in map(str, range(16,22)):
        for ciclo in ciclos:
            input_fl_dir_srt = remote_sen_dir+state_code+str(muncode)+"/"+ciclo+'start_'+yr+"/"
            input_fl_dir_end = remote_sen_dir+state_code+str(muncode)+"/"+ciclo+'end_'+yr+"/"

            search_df_srt_fl = input_fl_dir_srt+'search_results_'+ciclo+'start_'+yr+'.shp'
            search_df_end_fl = input_fl_dir_end+'search_results_'+ciclo+'end_'+yr+'.shp'

            ### A cycle is only used when both its start and end results exist
            if not (os.path.isfile(search_df_srt_fl) and os.path.isfile(search_df_end_fl)):
                continue

            search_df_srt = gpd.read_file(search_df_srt_fl)
            search_df_srt['time'] = ciclo+"start_"+yr
            search_df_end = gpd.read_file(search_df_end_fl,SHAPE_RESTORE_SHX='YES')
            search_df_end['time'] = ciclo+"end_"+yr

            ### Box tables are loaded when present; they are not used further
            ### in this cell
            if os.path.isfile(input_fl_dir_srt+'mun_boxes_'+ciclo+'start_'+yr+'.csv'):
                box_df_srt = pd.read_csv(input_fl_dir_srt+'mun_boxes_'+ciclo+'start_'+yr+'.csv')
            if os.path.isfile(input_fl_dir_end+'mun_boxes_'+ciclo+'end_'+yr+'.csv'):
                box_df_end = pd.read_csv(input_fl_dir_end+'mun_boxes_'+ciclo+'end_'+yr+'.csv')

            search_frames.append(search_df_srt)
            search_frames.append(search_df_end)

if search_frames:
    search_df = pd.concat(search_frames, ignore_index=True)
else:
    search_df = gpd.GeoDataFrame()

search_df['date']=search_df['id'].apply(return_date_id)
### 'time' looks like "<cycle>_<start|end>_<yy>": last piece is the year, first the cycle
search_df['year'] = search_df['time'].apply(lambda x: str(x).split('_')[-1])
search_df['cic'] = search_df['time'].apply(lambda x: str(x).split('_')[0])
search_df['uid'] = 'MUN'+search_df['muncode'].astype(str)+"_YEAR"+search_df['year']+"_CIC"+search_df['cic']

In [None]:
### For areas with multiple epsg codes in one muncode-cycle, only download images from "largest" epsg
### If area has only one epsg, this also selects the only epsg
### Number of images per (uid, epsg) pair
epsg_df = search_df.groupby(['uid','epsg']).size().reset_index()
print(len(epsg_df))
epsg_df.columns = ['uid','epsg','num_epsg']

### Number of images per uid, used to turn the counts above into shares
count_df = search_df.groupby('uid').size().reset_index()
count_df.columns = ['uid','cicyr_tot']
epsg_df = epsg_df.merge(count_df,on='uid',how='left')
epsg_df['share'] = epsg_df['num_epsg']/epsg_df['cicyr_tot']

### Keep the EPSG with the largest share within each uid
### (codes that tie for the largest share are all kept)
max_epsg_df = epsg_df.groupby('uid')['share'].max().reset_index()
max_epsg_df.columns = ['uid','max_share']
epsg_df = epsg_df.merge(max_epsg_df,on='uid',how='left')
epsg_df = epsg_df[epsg_df['share'] >= epsg_df['max_share']]

### The inner merge drops images whose epsg was not selected for their uid
search_df = search_df.merge(epsg_df,on=['uid','epsg'],how='inner')
### The axis is named via columns=; the positional form drop('uid', 1) was
### removed in pandas 2.0
search_df = search_df.drop(columns='uid')

In [None]:
### Split the search results into images already on disk and images still to fetch
is_on_disk = search_df['id'].isin(img_fls_dled)
already_dl_df = search_df[is_on_disk]
download_df = search_df[~is_on_disk]

In [None]:
## Important: ALL IMAGES MUST SHARE THE SAME CRS (geographic projection).
## This check stops the run before images with different CRSes are downloaded.
## If it fails: go back to planet_api_querying and force imagery to be queried in one CRS,
## by calling return_relevant_shapes with the return_epsg arg set to the CRS we want.
## To requery imagery for a crop cycle, just delete the saved shapefile listing the relevant satellite images.
found_epsgs = download_df['epsg'].unique()
if len(found_epsgs) > 1:
    ## Raise a real exception with the offending codes; the previous
    ## "raise NOTTHESAMEEPSG" only stopped the run via a NameError.
    raise ValueError(
        "Images to download span more than one EPSG code: {}".format(list(found_epsgs)))

In [None]:
## Plot available imagery selected for each season
for yr in range(16,22):
    for ciclo in ciclos:
        for start_end in ['start','end']:
            seasons = [ciclo+start_end+'_'+str(yr)]
            ciclo_download_df = search_df[search_df['time'].isin(seasons)]
            ### Optional: subset down to only image ids that cover box id of interest
#             ciclo_download_df = ciclo_download_df[ciclo_download_df['id'].isin(bid_images)]
            if len(ciclo_download_df) == 0:
                continue

            on_disk = ciclo_download_df['id'].isin(img_fls_dled)
            ciclo_already_dl_df = ciclo_download_df[on_disk]
            ciclo_to_download_df = ciclo_download_df[~on_disk]

            ### Same thresholds as the download filter further down.
            ### The green layer previously ignored cloud cover, so cloudy
            ### images were drawn both red and green even though they are
            ### not downloaded.
            too_cloudy = ciclo_to_download_df['cloud_cov'] > 0.30 #needs to scale with size
            covers_enough = ciclo_to_download_df['indep_cvrg'] >= 0.0000001 # needs to scale with size
            ciclo_badcov_df = ciclo_to_download_df[too_cloudy]
            ciclo_goodcov_df = ciclo_to_download_df[
                covers_enough & (ciclo_to_download_df['cloud_cov'] <= 0.30)]

            ### Municipality outlines drawn underneath the chosen imagery outlines
            fig, ax = plt.subplots(figsize=(7, 7))
            mun_selected_df.plot(ax=ax,color='white', edgecolor='black', alpha=0.7)
            ### Blue are images we already have
            ciclo_already_dl_df.plot(ax=ax,color='Blue', edgecolor='white',alpha=0.4)
            ### Red are images we are choosing not to download because of poor cloud coverage
            ciclo_badcov_df.plot(ax=ax,color='Red', edgecolor='white',alpha=0.4)
            ### Green are images we will download
            ciclo_goodcov_df.plot(ax=ax,color='green', edgecolor='white',alpha=0.4)

            plt.title(ciclo+str(yr)+'_'+start_end)
            plt.show()

In [None]:
### Keep only images whose independent coverage (the part of the municipality
### they cover that the other queried images do not) exceeds a threshold, and
### whose cloud cover is at most 0.3.
### Tradeoff: border areas are occasionally missed, in exchange for a large
### reduction in the number of images needed.
has_coverage = download_df['indep_cvrg'] >= 0.0000001
is_clear = download_df['cloud_cov'] <= 0.3
download_df = download_df[has_coverage & is_clear]
### Highest independent coverage first, so the most important images download first
download_df = download_df.sort_values('indep_cvrg',ascending=False)
download_df = download_df.reset_index(drop=True)


In [None]:
### One (image id, cycle label, integer municipality code) tuple per row of download_df
to_download = [
    (image_id, ciclo, int(mc))
    for image_id, ciclo, mc in zip(
        download_df['id'], download_df['time'], download_df['muncode'])
]

In [None]:
### NOTE(review): mgr is not used anywhere in the visible code; kept in case
### another cell relies on it.
mgr = multiprocessing.Manager()
pool_size = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=pool_size)
download_these=partial(download_all_in_list, activate=True)

### Three passes: the full list first, then two retries over whatever is still
### missing (images are often not yet activated on the first attempt).
### Each worker returns its own list of failed entries, and the lists are
### joined end to end. The previous zip(*...) transposed them instead, which
### truncated to the shortest list and dropped every failure whenever any
### worker returned an empty list.
try:
    split_ids = np.array_split(to_download, pool_size)
    undled_images = list(itertools.chain.from_iterable(pool.map(download_these, split_ids)))

    undled_split_ids = np.array_split(undled_images, pool_size)
    undled_images_v2 = list(itertools.chain.from_iterable(pool.map(download_these, undled_split_ids)))

    undled_split_ids_v2 = np.array_split(undled_images_v2, pool_size)
    undled_images_v3 = list(itertools.chain.from_iterable(pool.map(download_these, undled_split_ids_v2)))
finally:
    ### Always release the worker processes, even if a pass raised
    pool.close()
    pool.join()
print("Number of undownloaded images:", len(undled_images_v3))
