In [1]:
import sys
import glob
import os
import math
import pandas as pd
import numpy as np
import geopandas as gp

import matplotlib.pyplot as plt
import rsfuncs as rs

from scipy import stats
from scipy.signal import savgol_filter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

from pandas.tseries.offsets import MonthEnd, SemiMonthEnd
from datetime import datetime, timedelta
from datetime import datetime as dt

from keras import regularizers, optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [13]:
# Read data
# Both shapefiles are filtered with the same list of excluded state codes.
excluded_states = ["AK", "HI", "PR", "VI"]

states_file = gp.read_file("../shape/states.shp")
keep_state = ~states_file['STATE_ABBR'].isin(excluded_states)
states = states_file[keep_state]

site_file = gp.read_file("../shape/scan_sites.shp")
keep_site = ~site_file['state'].isin(excluded_states)
sites = site_file[keep_site]

# Format data: the first two CSV columns are renamed to "site" and "date",
# +/-inf become NaN, and the parsed dates are used both as the index and as
# a regular "date" column.
df = pd.read_csv("../data/all_dat_f.csv")
df = df.rename(columns={df.columns[0]: "site", df.columns[1]: "date"})
df = df.replace([np.inf, -np.inf], np.nan)
df['date'] = pd.to_datetime(df['date'])
# Passing the Series (not the label) keeps "date" as a column as well.
df = df.set_index(df['date'])

In [14]:
# Filter the data

# Keep only rows whose precip value is below 1; rows with precip >= 1 (and
# rows with missing precip) are dropped.
# NOTE(review): the previous comment described this as a 2-day precip total
# in mm - confirm the units/window against the data source.
low_precip = df.precip < 1
df = df[low_precip]

# Remove land-cover classes 2 and 3 (labelled "Urban Areas" by the original
# author - TODO confirm against the land-cover legend).
df = df[~df.lc_type.isin([2, 3])]

# Remove sites with <10 datapoints. Counting once up front is equivalent to
# re-filtering per site, because dropping one site never changes the row
# count of another.
site_counts = df.site.value_counts()
sparse_sites = site_counts[site_counts < 10].index
df = df[~df.site.isin(sparse_sites)]

# Calculate spectral indices from the band columns B2, B4, B5 and B6.
b2, b4, b5, b6 = df.B2, df.B4, df.B5, df.B6
df = df.assign(
    ndvi=(b5 - b4) / (b5 + b4),
    ndmi=(b5 - b6) / (b5 + b6),
    evi=2.5 * (b5 - b4) / (b5 + 6 * b4 - 7.5 * b2 + 1),
)

# For the backscatter columns (df.vv, df.hv): each cell is expected to be a
# string such as "[0.1 0.2  0. ]". Parse it, discard zeros and NaNs, and
# replace the cell with the mean of what is left.
def _mean_backscatter(raw):
    """Return the mean of the non-zero, non-NaN numbers in a bracketed string.

    Parameters
    ----------
    raw : str or any
        Whitespace-separated numbers optionally wrapped in "[" / "]".
        Anything that is not a string (e.g. a float NaN for a missing cell)
        is treated as missing.

    Returns
    -------
    float
        The mean of the usable values, or NaN when the cell is missing,
        empty, or contains only zeros / NaNs.
    """
    if not isinstance(raw, str):
        return np.nan

    # str.split() with no argument collapses runs of whitespace and yields
    # an empty list for "" / "[]" instead of [''] (which float() rejects).
    tokens = raw.replace("[", "").replace("]", "").split()
    values = [float(tok) for tok in tokens]
    values = [val for val in values if val != 0.]

    # Nothing usable left: return NaN explicitly rather than relying on
    # np.nanmean's empty-slice warning path.
    if all(np.isnan(val) for val in values):
        return np.nan
    return np.nanmean(values)


# The same parser is applied to both polarisations so that missing cells are
# handled identically for vv and hv.
vv_eff = [_mean_backscatter(raw) for raw in df.vv]
hv_eff = [_mean_backscatter(raw) for raw in df.hv]

df['vv'] = vv_eff
df['hv'] = hv_eff

# Calculate the 12-day means for each site, then discard every (site, window)
# row that still contains a missing value.
per_site = df.groupby(['site'])
window_means = per_site.resample('12D').mean()
df = window_means.dropna()


In [15]:
# Keep only the columns whose name does not contain 'filt'.
# NOTE(review): the original comment called the dropped columns the
# "non interpolated landsat bands" - confirm what the 'filt' columns hold.
cols = list(filter(lambda name: 'filt' not in name, df.columns))
df = df[cols]

In [16]:
# One-hot encode the landcover types (the first category is dropped as the
# reference level).
# TODO (original author): "figure this out".
one_hot = pd.get_dummies(df['lc_type'], drop_first=True)
rdf = pd.concat([df, one_hot], axis=1).drop(columns=["lc_type"])
# The dummy columns are labelled with the raw lc_type values, so convert
# every column label to a string.
df = rdf.dropna().rename(columns=str)

In [17]:
# Inspect the column labels after one-hot encoding; the landcover dummy
# columns appear under their stringified lc_type values.
df.columns

Index(['site', 'precip', 'rzsm', 'psi_rz', 'ssm', 'psi_s', 'vv', 'hv', 'B1',
       'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'ndvi', 'ndmi', 'evi', '6.0', '7.0',
       '8.0', '11.0', '12.0', '16.0', '17.0'],
      dtype='object')

In [19]:
# Add the time-lagged predictors.
# For each lag k, every column is shifted k rows *within its own site* (the
# "site" index level), so values never leak across sites, and is joined back
# with a "_t<k>" suffix. The first k rows of each site become NaN.
# NOTE(review): rows with missing values were dropped earlier, so one shift
# step is "previous retained row", not necessarily exactly one 12-day window.
lags = (1, 2, 3)

df3 = df
for lag in lags:
    shifted = df.groupby(level="site").shift(lag)
    df3 = df3.join(shifted.add_suffix("_t{}".format(lag)))

In [20]:
# Drop unnecessary cols 

In [21]:
# Any column whose name contains one of these substrings is removed.
# 'rzsm_' (with the trailing underscore) matches the lagged copies such as
# 'rzsm_t1' but not the plain 'rzsm' column, which is kept.
dropstrs = ['site','precip', 'rzsm_', 'ssm', 'psi_rz', "psi_s"]

dropcols = [
    col for col in df3.columns
    if any(pattern in col for pattern in dropstrs)
]

df3 = df3.drop(columns=dropcols)

In [None]:
# One hot encode the landcover types *** figure this out
# df = df.lc_type.astype(str)
# one_hot = pd.get_dummies(df3.lc_type, drop_first=True )
# rdf = pd.concat([df3, one_hot], axis = 1)
# rdf = rdf.drop(["lc_type"], axis = 1)
# df = rdf.reset_index(level='site')
# df = df.dropna()

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = np.array(df.lc_type).reshape(len(np.array(df.lc_type)), 1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print(onehot_encoded.shape)
# # invert first example

In [None]:
# For now... 

In [23]:
# df3 = df3.drop(["lc_type"], axis = 1)
# Move the 'site' index level back into a regular column, then drop every
# row that still has a missing value (e.g. the first rows of each site,
# which have no lagged values).
df = df3.reset_index(level='site').dropna()

In [33]:
# Modeling options

EPOCHS = 100        # number of training epochs
BATCHSIZE = 50      # mini-batch size
DROPOUT = 0.25      # dropout fraction
LOSS = 'mse'        # loss passed to model.compile

# Regularizer objects. Only Breg is referenced by build_model below; Areg,
# Kreg and Rreg are defined but not used there.
Areg = regularizers.l1(5e-5)
Breg = regularizers.l2(1e-9)
Kreg = regularizers.l2(1e-15)
Rreg = regularizers.l2(1e-15)

def build_model(input_shape, units=10, dropout=None, recurrent_dropout=0.05):
    """Build and compile a two-layer stacked LSTM regressor.

    Parameters
    ----------
    input_shape : tuple
        (timesteps, features) shape of one sample, passed to the first LSTM.
    units : int, optional
        Number of units in each LSTM layer (default 10).
    dropout : float or None, optional
        Input dropout fraction for both LSTM layers. ``None`` (default)
        means "use the module-level DROPOUT constant".
    recurrent_dropout : float, optional
        Recurrent-state dropout fraction for both LSTM layers (default 0.05).

    Returns
    -------
    keras.models.Sequential
        Model compiled with loss LOSS, the "Nadam" optimizer and an 'mse'
        metric. The final layer is a single linear Dense unit.
    """
    if dropout is None:
        # Resolved at call time so that editing DROPOUT takes effect without
        # redefining this function.
        dropout = DROPOUT

    model = Sequential()

    # The first layer returns the full sequence so the second LSTM can
    # consume it; only the first layer needs input_shape.
    model.add(LSTM(units, input_shape=input_shape, dropout=dropout,
                   recurrent_dropout=recurrent_dropout, return_sequences=True,
                   bias_regularizer=Breg))
    model.add(LSTM(units, dropout=dropout,
                   recurrent_dropout=recurrent_dropout,
                   bias_regularizer=Breg))

    model.add(Dense(1))
    model.compile(loss=LOSS, optimizer="Nadam",
                  metrics=['mse'])

    return model


In [34]:
# Containers for the train / test predictions and the matching actual values.
out = {
    key: []
    for key in ("train_preds", "test_preds", "train_act", "test_act")
}

In [None]:
# Leave-one-site-out cross validation: each site in turn is held out as the
# test set and a fresh model is trained on all remaining sites.

# LSTM input layout. Each row holds n_lags blocks of n_features columns
# (the unlagged block followed by the _t1, _t2, _t3 blocks).
n_lags = 4
n_features = 19

for i in df.site.unique():

    print("Processing {}".format(i))

    # Held-out site -> test set
    sdf = df[df.site == i]
    y_test = sdf.rzsm
    X_test = sdf.drop(["site","rzsm"], axis=1)

    # Every other site -> training set
    nsdf = df[df.site != i]
    y_train = nsdf.rzsm
    X_train = nsdf.drop(["site","rzsm"], axis=1)

    # Scale data. Both scalers are fitted on the training split only and
    # then applied to the test split.
    transformer_x = StandardScaler().fit(X_train)
    transformer_y = StandardScaler().fit(np.array(y_train).reshape(-1, 1))

    X_train = transformer_x.transform(X_train)
    y_train = transformer_y.transform(np.array(y_train).reshape(-1, 1))
    X_test = transformer_x.transform(X_test)
    y_test = transformer_y.transform(np.array(y_test).reshape(-1, 1))

    # Reshape flat rows to (samples, timesteps, features).
    # NOTE(review): given the column order, timestep 0 is the unlagged block
    # and timestep 3 is the oldest (_t3) block, i.e. the sequence is fed
    # newest-first. Confirm this is intended.
    X_train = X_train.reshape((X_train.shape[0], n_lags, n_features))
    X_test = X_test.reshape((X_test.shape[0], n_lags, n_features))

    input_shape=(X_train.shape[1], X_train.shape[2])

    model = build_model(input_shape=input_shape)

    # Fit. The held-out site is passed as validation data; no callbacks are
    # given, so it is only used to report val_loss per epoch.
    # NOTE(review): epochs is fixed at 400 here although the options cell
    # defines EPOCHS = 100 - decide which value is intended.
    train_history = model.fit(X_train, y_train,epochs = 400,
                    batch_size=BATCHSIZE,
                    verbose=2, validation_data=(X_test, y_test))
    loss = train_history.history['loss']
    val_loss = train_history.history['val_loss']

    # Get the predictions and map them (and the targets) back to the
    # original rzsm scale.
    yhat_train = model.predict(X_train)

    trainPredict = transformer_y.inverse_transform(yhat_train.reshape(-1,1))
    trainY = transformer_y.inverse_transform(y_train)

    yhat = model.predict(X_test)
    testPredict = transformer_y.inverse_transform(yhat.reshape(-1,1))
    testY = transformer_y.inverse_transform(y_test)

    # RMSE must compare like with like: both arguments are on the original
    # scale (trainY / testY), not the standardised y_train / y_test.
    trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY, testPredict))
    print('Test Score: %.2f RMSE' % (testScore))

    # Accumulate one entry per held-out site instead of overwriting the
    # lists with the latest fold.
    out["test_preds"].append(testPredict)
    out["train_preds"].append(trainPredict)

    out["test_act"].append(testY)
    out["train_act"].append(trainY)

    # Learning curves for this fold
    plt.plot(loss)
    plt.plot(val_loss)
    plt.legend(['loss', 'val_loss'])
    plt.show()

    # Predicted vs. actual, train and test overlaid
    plt.scatter(trainPredict, trainY)
    plt.scatter(testPredict, testY)
    plt.title('''
                train set RMSE = {}
                test set RMSE = {}
                '''.format(round(trainScore,2),round(testScore,2)))

    plt.xlabel("predicted")
    plt.ylabel("actual")
    plt.show()

Processing 581
Train on 2912 samples, validate on 6 samples
Epoch 1/400
 - 3s - loss: 0.9300 - mean_squared_error: 0.9300 - val_loss: 0.0134 - val_mean_squared_error: 0.0134
Epoch 2/400
 - 0s - loss: 0.8221 - mean_squared_error: 0.8221 - val_loss: 0.0352 - val_mean_squared_error: 0.0352
Epoch 3/400
 - 0s - loss: 0.7803 - mean_squared_error: 0.7803 - val_loss: 0.0545 - val_mean_squared_error: 0.0545
Epoch 4/400
 - 0s - loss: 0.7465 - mean_squared_error: 0.7465 - val_loss: 0.0693 - val_mean_squared_error: 0.0693
Epoch 5/400
 - 0s - loss: 0.7290 - mean_squared_error: 0.7290 - val_loss: 0.0592 - val_mean_squared_error: 0.0592
Epoch 6/400
 - 0s - loss: 0.7276 - mean_squared_error: 0.7276 - val_loss: 0.0218 - val_mean_squared_error: 0.0218
Epoch 7/400
 - 0s - loss: 0.7152 - mean_squared_error: 0.7152 - val_loss: 0.0581 - val_mean_squared_error: 0.0581
Epoch 8/400
 - 0s - loss: 0.7015 - mean_squared_error: 0.7015 - val_loss: 0.0430 - val_mean_squared_error: 0.0430
Epoch 9/400
 - 0s - loss: 0.

In [None]:
# Find the site with the most rows in df. The running maximum has to live
# outside the loop; resetting it on every iteration would make `answer`
# simply the last non-empty site visited.
max_len = 0
answer = None  # stays None if df has no rows
for i in df.site.unique():
    sdf = df[df.site == i]
    if len(sdf) > max_len:
        # Strict '>' keeps the first site encountered when several tie.
        max_len = len(sdf)
        answer = i

In [None]:
# Display the site id left in `answer` by the preceding cell.
answer