In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# later cells can read files below that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import gdal

In [6]:
    # used in __init__
    def load_clean_yield_data(yield_data_filepath):
        """
        Read the yield CSV and keep only the rows that are complete.

        A row is discarded when any of 'Year', 'State ANSI', 'County ANSI' or
        'Value' is NaN. Missing values in other columns do not cause a row to
        be dropped.

        Parameters
        ----------
        yield_data_filepath:
            Anything pandas.read_csv accepts as the location of the csv file
        Returns:
            A pandas DataFrame with the surviving rows (original index kept)
        """
        required = ['Year', 'State ANSI', 'County ANSI', 'Value']
        raw_frame = pd.read_csv(yield_data_filepath)
        return raw_frame.dropna(subset=required, how='any')

    # used in export
    # def get_tif_files(image_path):
    #     """
    #     Get all the .tif files in the image folder.
    #     Parameters
    #     ----------
    #     image_path: pathlib Path
    #         Directory to search for tif files
    #     Returns:
    #         A list of .tif filenames
    #     """
    #     files=[]
    #     for dir_file in image_path.iterdir():
    #         if str(dir_file).endswith('tif'):
    #         # strip out the directory so its just the filename
    #             files.append(str(dir_file.parts[-1]))               
    #     print(files)
    #     return files
    def get_tif_files(image_path):
        """
        List the names of the .tif files that sit directly inside a folder.

        Parameters
        ----------
        image_path: pathlib Path
            Directory to search for tif files (sub-folders are not searched)
        Returns:
            A list of .tif filenames with the directory stripped, in the order
            in which iterdir() yields them
        """
        # Match on the '.tif' extension rather than on a bare 'tif' suffix, so a
        # name that merely ends in those letters (e.g. 'motif') is not reported.
        return [entry.name for entry in image_path.iterdir() if entry.name.endswith('.tif')]
    def get_tif_files_12(image_path):  # keeps only names shorter than 12 characters
        """
        List the short-named tif files that sit directly inside a folder.

        NOTE(review): a second function with this same name (taking four paths)
        is defined right after this one in the same cell, so once the cell has
        run this single-argument version is no longer reachable by name.

        Parameters
        ----------
        image_path: pathlib Path
            Directory to search for tif files
        Returns:
            A list of filenames (directory stripped) whose path ends in 'tif'
            and whose name is shorter than 12 characters
        """
        short_names = []
        for entry in image_path.iterdir():
            if not str(entry).endswith('tif'):
                continue
            filename = str(entry.parts[-1])
            # the longest expected name is ss_ccc.tif, i.e. 10 characters
            if len(filename) < 12:
                short_names.append(filename)
        return short_names
    def get_tif_files_12(mask_path,temperature_path,image_path,weather_path):
        """
        List the short-named tif files of the image folder that also exist, under
        the same name, in the mask, temperature and weather folders.

        Besides returning the list, the function prints the number of regular
        files in each of the four folders (mask, temperature, image, weather),
        followed by the full file listing of each folder in that same order.

        Parameters
        ----------
        mask_path, temperature_path, weather_path:
            Folders handed to os.scandir / os.listdir
        image_path: pathlib Path
            Folder whose tif files are the candidates; it is also walked with
            iterdir(), so it has to be a pathlib Path
        Returns:
            A list of filenames (directory stripped) from image_path whose path
            ends in 'tif', whose name is shorter than 12 characters and which
            are present in all three other folders
        """
        folders = (mask_path, temperature_path, image_path, weather_path)

        # how many regular files does each folder hold?
        for folder in folders:
            print(sum(1 for entry in list(os.scandir(folder)) if entry.is_file()))

        # list (and echo) the regular files of each folder, one folder at a time
        listings = []
        for folder in folders:
            names = [name for name in listdir(folder) if isfile(join(folder, name))]
            print("\n\n")
            print(names)
            listings.append(names)
        mask_files, temperature_files, image_files, weather_files = listings

        common = []
        for entry in image_path.iterdir():
            filename = str(entry.parts[-1])
            # the longest expected name is ss_ccc.tif, i.e. 10 characters
            if (str(entry).endswith('tif') and len(filename) < 12
                    and filename in temperature_files
                    and filename in weather_files
                    and filename in mask_files):
                common.append(filename)
        return common


In [None]:
#for exporting from ee to drive
import ee
import ssl
import time
from pathlib import Path
import numpy as np
np.set_printoptions(threshold=np.inf)

In [None]:
# State ANSI codes that ModisExporter.export keeps when major_states_only=True;
# counties whose 'State ANSI' value is not in this list are skipped.
MAJOR_STATES=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 20, 26, 27, 28, 29, 31, 38, 39, 46, 47, 55]
# Alternative, shorter selection (disabled):
# MAJOR_STATES=[1, 5, 17, 18, 19, 20, 27, 29, 31, 38, 39, 46]

In [None]:
class ModisExporter:
    """A class to export MODIS data from 
    Google Earth Engine to Google Drive

    Parameters"""

    def __init__(self, locations_filepath=Path('SOP_download/yield_data.csv'),collection_id='MODIS/006/MOD09A1'):#006 instead of 051
        self.locations=load_clean_yield_data(locations_filepath)
        self.collection_id=collection_id

        try:
            # https://developers.google.com/earth-engine/guides/python_install-colab
            # Trigger the authentication flow.
            # Google Earth Engine Python Authenticator
            ee.Authenticate()
            # Initialize the Earth Engine module.
            ee.Initialize()
            print('Earth Engine initialized successfully!!')
        except ee.EEException:
            print('Earth Engine failed to initialize. Use authenticate in command line.')
        
    def update_parameters(self, location_filepath=None,collection_id=None):
        """
        Update the locations file or the collection id
        """
        if location_filepath is not None:
            # self.locations=load(locations_filepath)
            # from utils import load_clean_yield_data as load
            self.locations=load_clean_yield_data(locations_filepath)
        if collection_id is not None:
            self.collection_id=collection_id

    @staticmethod
    def _export_one_image(img,folder, name, region, scale, crs):
        # export one image from Earth Engine to Google Drive
        # Author: Jiaxuan You, https://github.com/JiaxuanYou
        print(f'Exporting to {folder}/{name}')
        task_dict={
            'folder': folder,
            'fileNamePrefix': name,
            'scale': scale,
            'crs': crs
        }
        if region is not None:
            task_dict.update({
                'region': region
            })
        task=ee.batch.Export.image.toDrive(img,name, **task_dict)
        task.start()
        while task.status()['state']=='RUNNING':
            print('Running....')
            # perhaps task.cancel() at some point
            time.sleep(10)
        print(f'Done: {task.status()}')
    
    def export(self, folder_name, data_type, coordinate_system='EPSG:4326', scale=500,
               export_limit=None, min_img_val=None, max_img_val=None, major_states_only=True,
               check_if_done=False, download_folder=None):
        """Export an Image Collection from Earth Engine to Google Drive
        Parameters
        ----------
            folder_name: str
                The name of the folder to export the images to in
                Google Drive. If the folder is not there, this process
                creates it
            data_type: str {'image', 'mask', 'temperature'}
                The type of data we are collecting. This tells us which bands to collect.
            coordinate_system: str, default='EPSG:4326'
                The coordinate system in which to export the data
            scale: int, default=500
                The pixel resolution, as determined by the output.
                https://developers.google.com/earth-engine/scale
            export_limit: int or None, default=None
                If not none, limits the number of files exported to the value
                passed.
            min_img_val = int or None:
                A minimum value to clip the band values to
            max_img_val: int or None
                A maximum value to clip the band values to
            major_states_only: boolean, default=True
                Whether to only use the 11 states responsible for 75 % of national soybean
                production, as is done in the paper
            check_if_done: boolean, default=False
                If true, will check download_folder for any .tif files which have already been
                downloaded, and won't export them again. This effectively allows for
                checkpointing, and prevents all files from having to be downloaded at once.
            download_folder: None or pathlib Path, default=None
                Which folder to check for downloaded files, if check_if_done=True. If None, looks
                in data/folder_name
        """

        if check_if_done:
            if download_folder is None:
                # download_folder=Path('data')/folder_name    #DATA folder???
                download_folder=Path('')/folder_name
                # folder_names is passed as 'SOP- Time Series Analysis with Deep Learning/crop_yield-data_image', etc...
                already_downloaded= get_tif_files(download_folder)

        # imgcoll=ee.ImageCollection(self.collection_id)\
        #     .filterBounds(ee.Geometry(-106.5,50,-64,23))\
        #     .filterDate('2002-12-31','2016-8-4')
        imgcoll = ee.ImageCollection(self.collection_id) \
            .filterBounds(ee.Geometry.Rectangle(-106.5, 50, -64, 23)) \
            .filterDate('2002-12-31', '2012-12-31')

        datatype_to_func={
            'image': _append_im_band,
            'mask': _append_mask_band,
            'temperature': _append_temp_band,
            'weather': _append_weather_band
        }
        
            # iterate(algorithm, first)
            # Applies a user-supplied function to each element of a collection. 
            # The user-supplied function is given two arguments: the current element,and the value returned by the previous call to iterate()
            # or the first argument, for the first iteration. 
            # The result is the value returned by the final call to the user-supplied function.
            # Returns the result of the Collection.iterate() call.
            # Arguments:

            # this:collection (Collection):
            #     The Collection instance.
            # algorithm (Function):
            #     The function to apply to each element. Must take two arguments: an element of the collection and the value from the previous iteration.
            # first (Object, optional):
            #     The initial state.
            # Returns: ComputedObject

        img = imgcoll.iterate(datatype_to_func[data_type])#data_type-> whether it is image, mask or temperature
        #stacking images all over the years for particular state and county
        img=ee.Image(img)   #need to explicitly use ee.Image because .iterate function returns Object

        # 'clip' the values opf the band
        if min_img_val is not None:
            # passing an ee.Number creates a constant image
            img_min=ee.Image(ee.Number(min_img_val))
            img=img.min(img_min)
#             min(image2)
            # Selects the minimum of the first and second values for each matched pair of bands in image1 and image2. If either image1 or image2 has only 1 band, then it is used against all the bands in the other image. If the images have the same number of bands, but not the same names, they're used pairwise in the natural order. The output bands are named for the longer of the two inputs, or if they're equal in length, in image1's order. The type of the output pixels is the union of the input types.
            # Arguments:
            # this:image1 (Image):
            #     The image from which the left operand bands are taken.
            # image2 (Image):
            #     The image from which the right operand bands are taken.
            # Returns: Image

        if max_img_val is not None:
            # passing an ee.Number creates a constant image
            img_max=ee.Image(ee.Number(max_img_val))
            img=img.max(img_max)

        # ????????????????????????? 
        # note that the county regions are pulled from Google's Fusion tables. This calls a merge
        # of county geometry and census data:
        # https://fusiontables.google.com/data?docid=1S4EB6319wWW2sWQDPhDvmSBIVrD3iEmCLYB7nMM#rows:id=1
        #Fusion tables are now deprecated. Use Earth engine catalog instead.

        region = ee.FeatureCollection('TIGER/2018/Counties')

        # turn the strings into numbers, see
        # https://developers.google.com/earth-engine/datasets/catalog/TIGER_2018_Counties
        def state_to_int(feature):
            return feature.set('COUNTYFP', ee.Number.parse(feature.get('COUNTYFP')))#how we got to know about string 'COUNTYFP' what does this mean
        region=region.map(state_to_int)

        count=0
        for state_id, county_id in np.unique(self.locations[['State ANSI','County ANSI']].values,axis=0):
            if major_states_only:
                if (int)(state_id) not in MAJOR_STATES:
                    print(f'Skipping state id {int(state_id)}')
                    continue
            fname='{}_{}'.format(int(state_id),int(county_id))

            if check_if_done:
                if f'{fname}.tif' in already_downloaded:
                    print(f'{fname}.tif already downloaded! Skipping')
                    continue
            
            file_region=region.filterMetadata('COUNTYFP','equals', int(county_id))
            file_region=ee.Feature(file_region.first())

        # ee.Feature(geometry, properties)
        # Features can be constructed from one of the following arguments plus an optional dictionary of properties:
        # - An ee.Geometry.
        # - A GeoJSON Geometry.
        # - A GeoJSON Feature.
        # - A computed object: reinterpreted as a geometry if properties are specified, and as a feature if they aren't.
        # Arguments:
        # geometry (ComputedObject|Feature|Geometry|Object):
        #     A geometry or feature.
        # properties (Object, optional):
        #     A dictionary of metadata properties. If the first parameter is a Feature (instead of a geometry), this is unused.
        # Returns: Feature

        # filterMetadata(name, operator, value)
        # Shortcuts to filter a collection by metadata. This is equivalent to this.filter(ee.Filter.metadata(...)).
        # Returns the filtered collection.
        # Arguments:
        # this:collection (Collection):
        #     The Collection instance.
        # name (String):
        #     The name of a property to filter.
        # operator (String):
        #     The name of a comparison operator. Possible values are: "equals", "less_than", "greater_than",
        #     "not_equals", "not_less_than", "not_greater_than", "starts_with",
        #     "ends_with", "not_starts_with", "not_ends_with", "contains",
        #     "not_contains".
        # value (Object):
        #     - The value to compare against.
        # Returns: Collection
            processed_img=img.clip(file_region)

        # clip(geometry)
        
        # Clips an image to a Geometry or Feature.
        # The output bands correspond exactly the input bands, except data not covered by the geometry is masked. The output image retains the metadata of the input image.
        # Use clipToCollection to clip an image to a FeatureCollection.
        # Returns the clipped image.
        # Arguments:
        # this:image (Image):
        #     The Image instance.
        # geometry (Feature|Geometry|Object):
        #     The Geometry or Feature to clip to.
        # Returns: Image

            file_region=None
            while True:
                # try:
                self._export_one_image(processed_img,folder_name,fname, file_region, scale, coordinate_system)
                # except (ee.ee_exception.EEException, ssl.SSLEOFError):
                #     print(f'Retrying State {int(state_id)}, County  {int(county_id)}')
                #     # print(f'Unable to export State {int(state_id)}, County  {int(county_id)}, so skipping')
                #     # break
                    # time.sleep(10)
                    # continue
                break

            count+=1
            if export_limit:
                if count>=export_limit:
                    print('Reached export limit!! Stopping...')
                    break
        print(f'Finished Exporting {count} files!')


    def export_all(self,export_limit=None, major_states_only=True, check_if_done=True,
                   download_folder=None):
        """
        Export all the data.
        download_folder = list of 3 pathlib Paths, for each of the 3 downloads
        """
        if download_folder is None:
            download_folder=[None]*4
        assert len(download_folder)==4, "Must have 3 download folders for the 3 exports!"

        #first, make sure the class was initialized correctly
        # self.update_parameters(locations_filepath=load_clean_yield_data('SOP- Time Series Analysis with Deep Learning/yield_data.csv'),
                            #    collection_id='MODIS/MOD09A1')
        
        # # pull_MODIS_entire_county_clip.py
        # self.export(folder_name='SOP- Time Series Analysis with Deep Learning/crop_yield-data_image', data_type='image',
        #             min_img_val=16000, max_img_val=100,
        #             export_limit=export_limit, major_states_only=major_states_only,
        #             check_if_done=check_if_done, download_folder=download_folder[0])
        
        self.export(folder_name='crop_yield_image', data_type='image',
                    min_img_val=16000, max_img_val=100,
                    export_limit=export_limit, major_states_only=major_states_only,
                    check_if_done=check_if_done, download_folder=download_folder[0])

        # pull_MODIS_landcover_entire_county_clip.py
        self.update_parameters(collection_id='MODIS/006/MCD12Q1')#006 instead of 051 as 051 showed unavailable and suggested 006 to use
        # self.export(folder_name='SOP- Time Series Analysis with Deep Learning/crop_yield-data_mask', data_type='mask',
        #             export_limit=export_limit, major_states_only=major_states_only,
        #             check_if_done=check_if_done, download_folder=download_folder[1])
        self.export(folder_name='crop_yield_mask', data_type='mask',
                    export_limit=export_limit, major_states_only=major_states_only,
                    check_if_done=check_if_done, download_folder=download_folder[1])

        # pull_MODIS_temperature_entire_county_clip.py
        self.update_parameters(collection_id='MODIS/006/MOD11A2')
        # self.export(folder_name='SOP- Time Series Analysis with Deep Learning/crop_yield-data_temperature', data_type='temperature',
        #             export_limit=export_limit, major_states_only=major_states_only,
        #             check_if_done=check_if_done, download_folder=download_folder[2])
        self.export(folder_name='crop_yield_temp', data_type='temperature',
                    export_limit=export_limit, major_states_only=major_states_only,
                    check_if_done=check_if_done, download_folder=download_folder[2])
        
        self.update_parameters(collection_id="NASA/ORNL/DAYMET_V3")
        # self.export(folder_name='SOP- Time Series Analysis with Deep Learning/crop_yield-data_temperature', data_type='temperature',
        #             export_limit=export_limit, major_states_only=major_states_only,
        #             check_if_done=check_if_done, download_folder=download_folder[2])
        self.export(folder_name='crop_yield_daymet', data_type='weather',
                    export_limit=export_limit, major_states_only=major_states_only,
                    check_if_done=check_if_done, download_folder=download_folder[3])



        print('Done exporting! Download the folders from your Google Drive')



    # first call:  current = 1st image, previous = None -> returns the 1st image
    # second call: current = 2nd image, previous = the result of the first call
    # 1_1, 1_2, 1_3, 2_1, 2_2, 2_3
def _append_selected_bands(current, previous, band_indices):
    """
    Shared step of the iterate() callbacks below: append the chosen bands of
    one image to the image accumulated so far.

    Transforms an Image Collection into a single Image whose bands are the
    selected bands of every element.
    Author of the original routine: Jamie Vleeshouwer

    Parameters
    ----------
    current:
        The collection element of this iteration; select() is called on it
    previous:
        The value returned by the previous iteration
    band_indices: list of int
        Numeric band selectors handed to select()

    Returns
    -------
    The ee.Algorithms.If expression that yields the selected bands alone when
    `previous` compares equal to None, and `previous` with the selected bands
    added otherwise.
    """
    previous=ee.Image(previous)
    # keep only the requested bands of the current element
    current=current.select(band_indices)
    # Append it to the result (Note: only return current item on first element/iteration).
    # addBands copies all bands of `current`; without overwrite a clashing band name
    # gets a numerical suffix ('foo' -> 'foo_1', ...).
    return ee.Algorithms.If(ee.Algorithms.IsEqual(previous,None),current,previous.addBands(ee.Image(current)))


def _append_im_band(current,previous):
    """iterate() callback for data_type 'image': stacks bands 0-6 of every element."""
    return _append_selected_bands(current, previous, [0, 1, 2, 3, 4, 5, 6])


def _append_mask_band(current,previous):
    """iterate() callback for data_type 'mask': stacks band 0 of every element."""
    return _append_selected_bands(current, previous, [0])


def _append_temp_band(current,previous):
    """iterate() callback for data_type 'temperature': stacks bands 0 and 4 of every element.

    NOTE(review): presumably the day and night temperature bands of the
    collection - confirm against the dataset's band list.
    """
    return _append_selected_bands(current, previous, [0,4])


def _append_weather_band(current,previous):
    """iterate() callback for data_type 'weather': stacks bands 1 and 6 of every element.

    NOTE(review): which weather variables these indices refer to depends on the
    collection's band order - confirm against the dataset's band list.
    """
    return _append_selected_bands(current, previous, [1,6])




In [None]:
# Yield table taken from pycrop (with fewer entries); its state/county pairs
# decide which counties are exported.
yield_data_path = Path('SOP_download/yield_data.csv')
exporter = ModisExporter(locations_filepath=yield_data_path)
exporter.export_all()

In [7]:
# Read one exported mask GeoTIFF as uint16 and move the first axis of the
# array that ReadAsArray() returns to the end (axis order 1, 2, 0).
image=np.array(
    gdal.Open('/content/drive/MyDrive/data_Lakshya/crop_yield-data_mask_Lakshya/18_159.tif').ReadAsArray(),
    dtype='uint16',
).transpose(1, 2, 0)

In [10]:
# Number of array entries equal to 12, followed by the shape of the array.
print(np.count_nonzero(image==12), image.shape, sep='\n')

2
(89, 75, 10)
