In [8]:
import sys
import glob
import os
import pandas as pd
import ee
import numpy as np
import geopandas as gp

import matplotlib.pyplot as plt
import rsfuncs as rs

from scipy import stats
from scipy.signal import savgol_filter
from datetime import datetime, timedelta
from datetime import datetime as dt

%load_ext autoreload
%autoreload 2

plt.rcParams['figure.figsize'] = [18,16]
ee.Initialize()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# EE functions are in the rsfuncs module. Loacal functions are here: 

In [10]:
def read_sm_file(filename):
    with open(filename) as f:
        contents = f.readlines()

    data = []
    for line in contents:
        if line[0:1] == "#":
            continue
        else:
            data.append(line)

    headers = [x.replace("Soil Moisture Percent","smp").replace(" ","_") for x in data[0].split(",")]
    cols = [x.strip("\n").split(",") for x in data[1:]]

    df = pd.DataFrame(cols, columns = headers)
    
    return df

def imlist_2_timeseries(imlist,polarization, area):
    
    '''
    This essemtially combines `get_ims_by_date` and `array from latlon` functions below 
    '''
    num_ims = len(imlist.getInfo())

    ims = []
    dates = []
    
    for idx in range (0, num_ims):
        latlon = ee.Image.pixelLonLat().addBands(imlist.get(idx))
        res = latlon.reduceRegion(reducer=ee.Reducer.toList(),geometry=area,maxPixels=1e8,scale=10)
        
        try:
            lats = np.array((ee.Array(res.get("latitude")).getInfo()))
            lons = np.array((ee.Array(res.get("longitude")).getInfo()))
            data = np.array((ee.Array(res.get(polarization)).getInfo()))
        except:
            data = np.full_like(lats, np.nan,dtype=np.float64)

        im = make_np_array(data, lats, lons)
        ims.append(im)
        
        date =  latlon.get('system:time_start')
        info_dict = imlist.get(i).getInfo()
        date = info_dict['id']
        dates.append(date)
        
    return ims, dates

def get_ims_by_date(ims_list, var, res=10):
    imlist = []
    imdates = []
    num_images = len(ims_list.getInfo())

    for i in range (0, num_images):
        
        if i % 5 == 0:
            print(str((i / num_images)*100)[:5] + " % ")

        latlon = ee.Image.pixelLonLat().addBands(ims_list.get(i))
        imlist.append(array_from_latlon(latlon, var, res))
        date =  latlon.get('system:time_start')
        info_dict = ims_list.get(i).getInfo()
        date = info_dict['id']
        imdates.append(date)
  
    return imlist, imdates

def array_from_latlon(latlon_obj, var, res):
    res = latlon_obj.reduceRegion(reducer=ee.Reducer.toList(),geometry=area,maxPixels=1e8,scale=res)
    try:
        lats = np.array((ee.Array(res.get("latitude")).getInfo()))
        lons = np.array((ee.Array(res.get("longitude")).getInfo()))
        data = np.array((ee.Array(res.get(var)).getInfo()))
    except:
        data = np.full_like(lats, np.nan,dtype=np.float64)
    
    out = make_np_array(data, lats, lons)
    return out   

def make_np_array(data, lats, lons):
    # get data from df as arrays
    lons = np.array(lons)
    lats = np.array(lats)
    data = np.array(data) # Set var here 
                                              
    # get the unique coordinates
    uniqueLats = np.unique(lats)
    uniqueLons = np.unique(lons)

    # get number of columns and rows from coordinates
    ncols = len(uniqueLons)    
    nrows = len(uniqueLats)

    # determine pixelsizes
    ys = uniqueLats[1] - uniqueLats[0] 
    xs = uniqueLons[1] - uniqueLons[0]

    # create an array with dimensions of image
    arr = np.zeros([nrows, ncols], np.float32)

    # fill the array with values
    counter =0
    for y in range(0,len(arr),1):
        for x in range(0,len(arr[0]),1):
            if lats[counter] == uniqueLats[y] and lons[counter] == uniqueLons[x] and counter < len(lats)-1:
                counter+=1
                arr[len(uniqueLats)-1-y,x] = data[counter] # we start from lower left corner
    
    return arr

def filter_date(product,y,m,d):
    start = ee.Date.fromYMD(y,m,d).advance(-1, "day")
    end = ee.Date.fromYMD(y,m,d)
    prod = product.filterDate(start, end).sort('system:time_start', False).select("ppt")
    return prod

def format_dates(dates):
    for idx, x in enumerate(dates):
        timestamp = x.find("V_")+2
        timestr = x[timestamp:timestamp+13]
        dates[idx] = pd.to_datetime(timestr, format='%Y%m%d %H:%M')
    return dates

def get_2day_precip(latlon_obj, area):
    res = latlon_obj.reduceRegion(reducer=ee.Reducer.sum(),geometry=area,scale=10)
    data = np.array((ee.Array(res.get("ppt")).getInfo()))
    out = np.array(data)
    return out 

def get_ndvi(latlon_obj, area):
    res = latlon_obj.reduceRegion(reducer=ee.Reducer.mean(),geometry=area,scale=10)
    data = np.array((ee.Array(res.get("NDVI")).getInfo()))
    out = np.array(data)
    return out 

def round_up_to_odd(f):
    f = int(np.ceil(f))
    return f + 1 if f % 2 == 0 else f

In [11]:
site_file = gp.read_file("../shape/scan_sites.shp")
sites = site_file[~site_file['state'].isin(["AK", "HI", "PR", "VI"])]

In [12]:
data = rs.load_data()

In [13]:
data_dir = "../data"

In [14]:
out_dict = {}

In [37]:
# For each site id, find the sm file with the data (in data dir), 
# calculate psi as psi = A * SM ^b (krishna's paper )
# query the (1) landcover, (2) Sentinel backscatter (Prism P), MODIS / Landsat LAI for the whole timeseries 
   
for idx, row in sites[10:].iterrows():
    
    if row.id in out_dict.keys():
        print(row.id)
        continue
        
    print("Processings site no {}".format(row.id))
    
    # Make geom to submit to EE 
    x,y = row.geometry.buffer(0.0001).envelope.exterior.coords.xy
    coords = [list(zip(x,y))]
    area = ee.Geometry.Polygon(coords)
    
    # Get the corresponding SCAN data file from data folder
    site_id = row.id
    sm_file = [os.path.join(data_dir,x) for x in os.listdir(data_dir) if site_id in x][0]
    sm_dat = read_sm_file(sm_file)
    sm_dat['Date'] =  pd.to_datetime(sm_dat['Date'], format='%Y%m%d %H:%M')
    sm_dat.set_index('Date', inplace=True)
        
    # start and end date
    if sm_dat.empty:
        print("no valid soil moisture data for {}".format(row.id))
        continue
        
    startdate = sm_dat.index[0]
    enddate = sm_dat.index[-1]
    
    date = startdate.strftime("%Y-%m-%d")
        
    # Select the nlcd dataset
    dataset =  rs.load_data()['nlcd']
    ic = dataset[0]
    var = dataset[1]
    res = dataset[3]
    
    # find the nearest nlcd dataset
    col = ic.filterDate(ee.Date(date).advance(-1, 'years'), ee.Date(date).advance(2, 'years')).first()
    if not col.getInfo():
        col = ic.filterDate(ee.Date(date).advance(-3, 'years'), ee.Date(date).advance(2, 'years')).first()
    t = col.reduceRegion(ee.Reducer.frequencyHistogram(), area, res).get(var)
    meta = col.getInfo()
    lc_class = int(list(t.getInfo().keys())[0])
    
    # Get the landcover type
    lcidx = meta['properties']['landcover_class_values'].index(lc_class)
    lctype = meta['properties']['landcover_class_names'][lcidx]
    print(lctype)
    
    # Get Sentinel images and dates (descending orbits only, VV, HV polarization)
    s1 = rs.load_data()['s1']
    s1ic, s1var, s1res = s1[0], s1[1], s1[3]
    
    # Krishna used ascending pass... I think descending is the correct orbit for the AM 

    col = s1ic.filterBounds(area).filter(ee.Filter.eq('orbitProperties_pass', 'DESCENDING')).select(s1var).filterDate(startdate,enddate).sort('system:time_start')
    vv = col.filter(ee.Filter.listContains('transmitterReceiverPolarisation', 'VV'))
    hv = col.filter(ee.Filter.listContains('transmitterReceiverPolarisation', 'VH'))
    
    # Fetch the S1 data 
    try:
        vv_ims = vv.toList(vv.size())
        print("Processing {} VV sentinel overapasses".format(len(vv_ims.getInfo())))
        s1_vv, vv_dates = get_ims_by_date(vv_ims,"VV")
    except:
        print("no valid VV overpasses")
        s1_vv = []
        vv_dates = []
        pass
    
    try:
        hv_ims = hv.toList(hv.size())
        print("Processing {} HV sentinel overapasses".format(len(hv_ims.getInfo())))
        s1_hv, hv_dates = get_ims_by_date(hv_ims,"VH")
    except:
        print("no valid HV overpasses")
        s1_hv = []
        hv_dates = []
        pass 
    
    if len(s1_hv) == 0 and len(s1_vv) == 0:
        print("No valid sentinel data ")
        continue
    
    vvdates = format_dates(vv_dates)
    hvdates = format_dates(hv_dates)
    
    vvdf = pd.DataFrame(list(dict(zip(vv_dates, [item for sublist in s1_vv for item in sublist])).items()),
                      columns=['t','vv'])
    hvdf = pd.DataFrame(list(dict(zip(vv_dates, [item for sublist in s1_hv for item in sublist])).items()),
                      columns=['t','vv'])
    print(vvdf)
    print(hvdf)
        
    # Calc the S1 backscatter in each image
#     mean_sigmas = []
#     std_sigmas = []

#     for i in s1dat:
#         mean_sigmas.append(np.mean(i))
#         std_sigmas.append(np.std(i))
        
    # Convert the datestrings from S1 to pandas datetimes 
    

    
    print(vvdates)
        
    # Get PRISM data for all the S1 overpass dates to filter the rainy days
    print("processing PRISM")
    rainfall = []

    for i,x in enumerate(dates):
        if i % 5 == 0:
            print(str((i / len(dates))*100)[:5] + " % ")
        
        y,m,d = dates[i].year, dates[i].month, dates[i].day
        t = filter_date(rs.load_data()['prism_daily'][0], y, m, d).sum()
        precip_total = get_2day_precip(t, area)
        rainfall.append(precip_total)

                
    # Landsat - Note: some sites are in the overlap areas between passes.
    # these sites can have multiple obs / day or obs separated by 8days instead of 16. 
    
    print("Processing Landsat")
    landsat = rs.load_data()['l8_sr']
    lic, lvar, lsf = landsat[0],landsat[1], landsat[2]

    lstart = ee.Date.fromYMD(startdate.year,startdate.month,startdate.day).advance(-9,"day")
    lend = ee.Date.fromYMD(enddate.year,enddate.month,enddate.day).advance(8, "day")

    l8_col = lic.filterDate(lstart,lend).filterBounds(area).map(rs.mask_quality) # Mask clouds and shadows 
    lt = l8_col.sort('system:time_start')
    lims = lt.toList(lt.size())

    num_ims = len(lims.getInfo())

    ldfs = []

    for i in range (0, num_ims):
        if i % 5 == 0:
            print(str((i / num_ims)*100)[:5] + " % ")

        ls_latlon = ee.Image.pixelLonLat().addBands(lims.get(i))
        ltemp = ls_latlon.select(["B1","B2","B3","B4","B5","B6","B7"]).multiply(lsf)
        l8_res = ltemp.reduceRegion(reducer=ee.Reducer.mean(),geometry=area,bestEffort=True,scale=30)

        l8_info_dict = lims.get(i).getInfo()
        l8_date = l8_info_dict['id'][-8:]

        l8_out = l8_res.getInfo()

        ldf = pd.DataFrame.from_dict(l8_out.values()).T
        ldf.columns = l8_out.keys()
        ldf.index = pd.to_datetime([l8_date])
        ldfs.append(ldf)
        
                        
    # Filter the SCAN data for the S1 dates, use 3 am - 7 am mean 
    rzsm = []
    ssm = []
    
    for i in dates:
        starttime = i.replace(second=0, microsecond=0, minute=0, hour=3)
        endtime = starttime+timedelta(hours= 4)
        df = pd.DataFrame(sm_dat[starttime:endtime])
        
        rzsm.append(df[df.columns[-1]].values)
        ssm.append(df[df.columns[-3]].values)
    
    # In case there are nans or data gaps in the sm data
    rzsm = [list(filter(None, x)) for x in rzsm]
    ssm = [list(filter(None, x)) for x in ssm]
    
    # Calc the 5 hour mean for each sensor reading 
    for idx,x in enumerate(rzsm):
        rzsm[idx] = np.nanmean([np.float(i) for i in x])
    
    for idx,x in enumerate(ssm):
        ssm[idx] = np.nanmean([np.float(i) for i in x])

    # params to calculate psi 
    a = row.a
    b = row.b
    
    # RZ and Surface LWP = A * sm ^b
    psi1 = [a*(x/100)**b for x in rzsm]
    psi2 = [a*(x/100)**b for x in ssm]
    
    # Make a dataframe out of everything
    df = pd.DataFrame([dates, mean_sigmas, std_sigmas, rainfall, modis_lai, rzsm,psi1,ssm,psi2, [lcidx]*len(dates)])
    df = df.T
    df.columns = (['date', "sigma", "std_sigma", "precip","LAI", "rzsm","psi_rz", "ssm", "psi_s", "lc_type"])
    df = rs.col_to_dt(df) # set the date col as datetime index 
    dfout = df.copy()
    
    # Drop non rainy overpasses from df 
#     mask=(df['precip'] < 0.1)
#     dfout = df[mask]

    # Golay interpolation for landsat 
    
    ls = pd.concat(ldfs)
    win_len = round_up_to_odd(num_ims/7)
    
    for i in ls.columns:
        ls[i][ls[i] == 0] = np.nan
        ls[i] = ls[i].interpolate(method = "linear")
        ls[i+"_filt"] = savgol_filter(ls[i], window_length=win_len, polyorder=2)
    
    ls_df = ls[(ls.T != 0).any()]
    ls_df = ls_df.groupby(level = 0).mean()
    
    # Break the loop if there are no non- rainy sentinel overpasses 
    if dfout.empty:
        print("No non-rainy overpaasses ")
        continue 
        
    l8_dfs = []
    
    for i in dfout.index:
        l8_idx = ls_df.index.get_loc(i, method='nearest')
        l8_contemp = ls_df.iloc[l8_idx]
        t = pd.DataFrame(l8_contemp).T
        l8_dfs.append(t)
    
    fin_ls = pd.concat(l8_dfs)

    FIN = pd.concat( [dfout.reset_index(drop=True), fin_ls.reset_index(drop=True)], axis=1) 
    FIN.index = dfout.index

    print(FIN.head())
    
    out_dict[(row.id)] = FIN

Processings site no 2181
Cultivated Crops - Areas used for the production of annual crops, such as corn, soybeans, vegetables, tobacco, and cotton, and also perennial woody crops such as orchards and vineyards. Crop vegetation accounts for greater than 20 percent of total vegetation. This class also includes all land being actively tilled.
Processing 4 VV sentinel overapasses
0.0 % 
Processing 4 HV sentinel overapasses
0.0 % 
                    t                        vv
0 2016-09-02 11:49:00         [-11.077534, 0.0]
1 2017-09-15 11:48:00  [-11.077534, -12.701497]
2 2018-10-10 11:49:00        [-11.4798155, 0.0]
3 2018-10-16 11:48:00  [-10.479199, -10.700825]
                    t          vv
0 2016-09-02 11:49:00  [nan, 0.0]
1 2017-09-15 11:48:00  [nan, nan]
2 2018-10-10 11:49:00  [nan, 0.0]
3 2018-10-16 11:48:00  [nan, nan]
[Timestamp('2016-09-02 11:49:00'), Timestamp('2017-09-15 11:48:00'), Timestamp('2018-10-10 11:49:00'), Timestamp('2018-10-16 11:48:00')]
processing PRISM


NameError: name 'dates' is not defined

In [36]:
vvdf, hvdf

(                    t                                              vv
 0 2016-09-02 11:49:00   [[-11.077534, 0.0], [-11.077534, -12.701497]]
 1 2017-09-15 11:48:00  [[-11.4798155, 0.0], [-10.479199, -10.700825]]
 2 2018-10-10 11:49:00   [[-11.404377, 0.0], [-11.404377, -11.377384]]
 3 2018-10-16 11:48:00     [[-10.835797, 0.0], [-12.53007, -8.870243]],
                     t                        vv
 0 2016-09-02 11:49:00  [[nan, 0.0], [nan, nan]]
 1 2017-09-15 11:48:00  [[nan, 0.0], [nan, nan]]
 2 2018-10-10 11:49:00  [[nan, 0.0], [nan, nan]]
 3 2018-10-16 11:48:00  [[nan, 0.0], [nan, nan]])

In [33]:
vvdf

(vvdf.groupby('t')['vv']
            .apply(lambda x: pd.DataFrame(x.iloc[0], columns=['A', 'B', 'C', 'D']))
            .reset_index())

ValueError: Shape of passed values is (2, 2), indices imply (2, 4)

In [24]:
pd.DataFrame(list(dict(zip(vv_dates, s1_vv)).items()),
                      columns=['t','vv'])

Unnamed: 0,t,vv
0,2016-09-02 11:49:00,"[[-11.077534, 0.0], [-11.077534, -12.701497]]"
1,2017-09-15 11:48:00,"[[-11.4798155, 0.0], [-10.479199, -10.700825]]"
2,2018-10-10 11:49:00,"[[-11.404377, 0.0], [-11.404377, -11.377384]]"
3,2018-10-16 11:48:00,"[[-10.835797, 0.0], [-12.53007, -8.870243]]"


In [None]:
t1_ims = t1.toList(t1.size())
s1_vv, vv_dates = get_ims_by_date(t1_ims,s1var)

In [None]:
s1_vv

In [None]:
t = pd.concat(out_dict)

In [None]:
len(t)

In [None]:
t.to_csv("../data/all_dat_desc.csv")