In [None]:
# Adapted from Robert Guthrie https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
# And: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
import sklearn
from sklearn.linear_model import LinearRegression
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import json
import zoib

# Seed NumPy as well as PyTorch: the data-loading cell picks county CSVs with
# np.random, so seeding torch alone does not make a run reproducible.
np.random.seed(1)
torch.manual_seed(1)

In [None]:
def split_train_test_val(df, train_end=2010, val_end=2014):
    """Split a frame chronologically on the 'year' level of its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose (Multi)Index has a level named 'year'.
    train_end : int, default 2010
        Last year (inclusive) assigned to the training split.
    val_end : int, default 2014
        Last year (inclusive) assigned to the validation split; later years
        go to the test split.

    Returns
    -------
    (train_df, val_df, test_df) : tuple of pandas.DataFrame
        Note the order: train, validation, test.
    """
    # Look the level up on `df` itself rather than on a notebook global, so
    # the function works for any frame passed in.
    years = df.index.get_level_values('year')
    train_df = df.loc[years <= train_end]
    val_df = df.loc[(years > train_end) & (years <= val_end)]
    test_df = df.loc[years > val_end]
    return train_df, val_df, test_df

In [None]:
def prep_lc_frac_df(ids=None, lc_csv='../data/fraster_landcover_allyears_bigger.csv'):
    """Build a per-(id, year) table of land-cover fractions.

    Each column of the CSV (other than 'id') is read as one year: the first
    four characters of the column name are parsed as the year, and each cell
    is a dict-like string mapping a land-cover code to a count, e.g.
    ``{11: 2500, 21: 2500}``.

    Parameters
    ----------
    ids : list-like, optional
        If given and non-empty, only these ids are kept.
    lc_csv : str
        Path of the land-cover CSV (defaults to the previously hard-coded file).

    Returns
    -------
    pandas.DataFrame
        Indexed by ('id', 'year') with one 'lcf<code>' column per land-cover
        code; codes absent for a row are filled with 0.
    """
    lc_df = pd.read_csv(lc_csv).set_index('id')
    if ids is not None and len(ids) > 0:
        lc_df = lc_df.loc[ids]

    yearly_frames = []
    for col in lc_df.columns:
        year = int(col[0:4])
        # Quote the bare integer keys so each cell becomes valid JSON.
        jsond = lc_df[col].str.replace(r'([0-9]+)(:)', r'"\1"\2', regex=True).apply(json.loads)
        # Counts are divided by 5000 to obtain fractions.
        # NOTE(review): presumably 5000 is the total count per id -- confirm.
        temp_frac_df = pd.json_normalize(jsond) / 5000
        temp_frac_df.columns = ['lcf{}'.format(lc) for lc in temp_frac_df.columns]
        yearly_frames.append(temp_frac_df.assign(id=lc_df.index, year=year))

    if not yearly_frames:
        # No year columns at all: return an empty frame with the expected index.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['id', 'year'])
        return pd.DataFrame(index=empty_index)

    # Concatenate once (DataFrame.append no longer exists in pandas >= 2.0).
    lc_frac = pd.concat(yearly_frames).fillna(0)
    return lc_frac.set_index(['id', 'year'])

In [None]:
def read_join_csv(inun_csv, drop_zeros=True):
    """Load one inundation CSV and join it with weather and land-cover data.

    The weather CSV path is derived from `inun_csv` by replacing
    'inun_frac_' with 'weather_'. All three tables are inner-joined.

    Parameters
    ----------
    inun_csv : str
        Path to an inundation CSV with 'id', 'year', 'month' and
        'inundation' columns.
    drop_zeros : bool, default True
        Drop every id whose maximum inundation is 0.

    Returns
    -------
    pandas.DataFrame or None
        The joined frame indexed by (id, year, month), or None when
        `drop_zeros` removes every row.
    """
    index_cols = ['id', 'year', 'month']

    # Inundation: keep only rows that actually have a measurement.
    inun_df = pd.read_csv(inun_csv).set_index(index_cols)
    inun_df = inun_df.loc[inun_df['inundation'].notna()]
    if drop_zeros:
        peak_inun = inun_df.groupby('id').agg({'inundation': 'max'})
        never_wet_ids = peak_inun.loc[peak_inun['inundation'] == 0].index
        inun_df = inun_df.drop(never_wet_ids)
        if len(inun_df) == 0:
            return None

    # Weather: same index, so the join lines up on (id, year, month).
    weather_path = inun_csv.replace('inun_frac_', 'weather_')
    weather_df = pd.read_csv(weather_path).set_index(index_cols)
    joined_df = weather_df.join(inun_df, how='inner')

    # Land-cover fractions: both the prep and the join are a bit slow.
    # The fractions could be precomputed, and the land-cover table could be
    # split up by county.
    playa_ids = joined_df.index.get_level_values(0).unique()
    lc_frac_df = prep_lc_frac_df(ids=playa_ids)
    return joined_df.join(lc_frac_df, how='inner')

# Load data

In [None]:
# Playa-count threshold for loading: the loading loop keeps reading county
# CSVs while the number of unique ids loaded so far is <= this value.
target_num_playas = 1

In [None]:
# Candidate inundation CSVs. read_join_csv() locates each file's weather CSV
# by swapping 'inun_frac_' for 'weather_' in the path.
inun_csv_list = glob.glob('../data/state_county_csvs/counties/inun_frac*')

In [None]:
# Visit the county CSVs in random order WITHOUT replacement: drawing with
# replacement could concatenate the same county twice (duplicate index rows)
# and would never terminate if the files cannot supply enough playas.
county_frames = []
seen_ids = set()
for rand_csv in np.random.permutation(inun_csv_list):
    county_df = read_join_csv(rand_csv, drop_zeros=True)
    if county_df is None or county_df.empty:
        continue  # every playa in this county was dropped
    county_frames.append(county_df)
    seen_ids.update(county_df.index.get_level_values(0).unique())
    # NOTE(review): same stopping rule as before -- loading stops only once
    # there are MORE than target_num_playas unique ids. Confirm that is intended.
    if len(seen_ids) > target_num_playas:
        break

if not county_frames:
    raise RuntimeError('No usable inundation data found in inun_csv_list')

# Concatenate once instead of growing the frame inside the loop.
joined_df = pd.concat(county_frames).fillna(0)

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe per-id time series as a supervised-learning table.

    Adapted from:
    https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

    Parameters
    ----------
    data : pandas.DataFrame
        One row per time step. Whenever a shift is needed (n_in > 0 or
        n_out > 1) the frame is grouped by 'id', so 'id' must be resolvable
        by ``groupby`` (in this notebook it is an index level).
    n_in : int, default 1
        Number of lagged steps (t-n_in ... t-1) to add as inputs.
    n_out : int, default 1
        Number of forecast steps (t ... t+n_out-1) to add.
    dropnan : bool, default True
        Drop rows that contain NaN after shifting.

    Returns
    -------
    pandas.DataFrame
        Columns named '<col>(t-k)', '<col>(t)' and '<col>(t+k)'.
    """
    df = pd.DataFrame(data)
    base_names = list(df.columns)
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.groupby('id').shift(i))
        names += ['%s(t-%d)' % (name, i) for name in base_names]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        if i == 0:
            # No shift is needed for step t, so no grouping is required.
            cols.append(df.shift(0))
            names += ['%s(t)' % name for name in base_names]
        else:
            # Shift within each id so the end of one series never borrows
            # values from the start of the next one.
            cols.append(df.groupby('id').shift(-i))
            names += ['%s(t+%d)' % (name, i) for name in base_names]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

# Prep Data

In [None]:
# Select each playa once. Passing every row's level-0 label (with repeats) to
# .loc asks for each id's rows again per repeat; the unique ids are enough and
# still leave the rows grouped by id, which the later cells rely on.
playa_ids = joined_df.index.get_level_values(0).unique()
traj = joined_df.loc[playa_ids]
traj = traj.drop(columns=['area'])
n_features = traj.shape[1]
traj['inundation'].plot()
print(traj.shape)

In [None]:
# With n_in=0 (and the default n_out=1) no lagged columns are added: every
# column is just renamed to '<name>(t)' and rows containing NaN are dropped.
# Not idempotent: running this cell twice renames the columns a second time.
traj = series_to_supervised(traj, n_in=0)

In [None]:
# Row count of the first playa's series (kept for reference; it is no longer
# used to build the ids).
len_of_timeseries = traj.loc[traj.index.get_level_values(0)[0]].shape[0]

# Map each playa id to a 0-based integer code in order of first appearance.
# Unlike repeating 0..n-1 a fixed number of times, this stays correct when
# playas have different numbers of rows or are not stored in contiguous blocks.
new_ids, _ = pd.factorize(traj.index.get_level_values(0))
traj = traj.assign(id=new_ids) # Put id at end for embedding

In [None]:
# Move the target column to the last position: pop() removes it and the
# assignment re-inserts it at the end.
traj['inundation(t)'] = traj.pop('inundation(t)')


# Prep and run model

In [None]:
def tensorfy(x, y, batch_size):
    """Cut `x` into equally sized float tensors and trim `y` to match.

    Parameters
    ----------
    x : array-like with a ``shape`` attribute and row slicing
        Feature rows.
    y : sliceable sequence
        Targets aligned with the rows of `x`.
    batch_size : int
        Rows per chunk; must be positive.

    Returns
    -------
    (x_tensor, y)
        `x_tensor` is a list of float32 tensors with `batch_size` rows each.
        A trailing incomplete chunk is dropped and the same number of
        trailing entries is removed from `y`. Empty input yields ``([], y)``.

    Raises
    ------
    ValueError
        If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    n_rows = x.shape[0]
    remainder = n_rows % batch_size
    n_full = n_rows - remainder

    # Only iterate over the complete chunks, so empty input needs no special case.
    x_tensor = [
        torch.tensor(np.array(x[start:start + batch_size])).float()
        for start in range(0, n_full, batch_size)
    ]
    if remainder:
        y = y[:-remainder]
    return x_tensor, y

In [None]:
scaler = StandardScaler()
train, val, test = split_train_test_val(traj)

# The last column of each split is the target; everything before it
# (features followed by the integer id) is the model input.
train_X = train.values[:, :-1]
train_y = train.values[:, -1]
val_X = val.values[:, :-1]
val_y = val.values[:, -1]
test_X = test.values[:, :-1]
test_y = test.values[:, -1]

# Standardise every input column except the trailing id. The scaler is
# fitted on the training split only and then reused for val and test.
non_id = slice(None, -1)
train_X[:, non_id] = scaler.fit_transform(train_X[:, non_id])
val_X[:, non_id] = scaler.transform(val_X[:, non_id])
test_X[:, non_id] = scaler.transform(test_X[:, non_id])

In [None]:
# Rows per chunk handed to tensorfy() for the training and the val/test splits.
batch_size = 24
batch_size_val = 48
# Embedding table size: ids are 0-based integers in the last column of train_X.
num_playas = 1 + int(train_X[:, -1].max())
# Every column except the target (the id column is still counted here).
lstm_input_size = traj.shape[1] - 1

# Params to set
hidden_dim = 128
embedding_dim = 2
loss_fn = 'zoib' # 'mae' or 'zoib'
# The ZOIB loss consumes four outputs per row, the MAE loss a single one.
output_dim = 4 if loss_fn == 'zoib' else 1
num_layers = 1
learning_rate = 0.1
num_epochs = 5000
regularization_weight = 0.01

In [None]:
# Create train, val, and test sets
# tensorfy() returns a list of equally sized float tensors plus the targets
# trimmed to the same number of rows.
# Caution: each *_y is overwritten with its trimmed version while *_X is not,
# so re-running this cell without re-running the split cell trims y again.
train_X_tensor, train_y = tensorfy(train_X, train_y, batch_size)
val_X_tensor, val_y = tensorfy(val_X, val_y, batch_size_val)
test_X_tensor, test_y = tensorfy(test_X, test_y, batch_size_val)


In [None]:
# Here we define our model as a class
class LSTM(nn.Module):
    """LSTM over weather/land-cover features plus a learned per-id embedding.

    Each input row holds the numeric features followed by an integer id in
    the last column. The id is replaced by an `embedding_dim`-sized vector,
    concatenated with the remaining features, passed through an LSTM and
    then through a linear layer.

    Parameters
    ----------
    input_dim : int
        Number of columns per input row, including the trailing id column.
    embedding_dim : int
        Size of the id embedding.
    num_playas : int
        Number of distinct ids (rows of the embedding table).
    hidden_dim : int
        LSTM hidden size.
    batch_size : int
        Only used by `init_hidden`; `forward` takes the batch size from its input.
    output_dim : int, default 1
        Width of the final linear layer.
    num_layers : int, default 1
        Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, num_playas, hidden_dim, batch_size, output_dim=1,
                    num_layers=1):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.num_playas = num_playas

        # Embedding for the id column, and the LSTM itself. The LSTM input
        # width is the feature count without the id (input_dim - 1) plus the
        # embedding that replaces it.
        self.embedding = nn.Embedding(self.num_playas, self.embedding_dim)
        self.lstm = nn.LSTM(self.input_dim + self.embedding_dim - 1, self.hidden_dim, self.num_layers)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.exp = torch.exp
        self.sigmoid = nn.Sigmoid()

    def relu_01(self, x):
        """Clamp `x` elementwise to the interval [0, 1]."""
        x = torch.max(torch.zeros_like(x), torch.min(torch.ones_like(x), x))
        return x

    def init_hidden(self):
        """Return zero (h0, c0) tensors of shape (num_layers, batch_size, hidden_dim).

        Note: `forward` does not pass a hidden state to the LSTM, so the LSTM
        always starts from its own default state regardless of this value.
        """
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input):
        """Run the model on a list of equally shaped 2-D tensors.

        Parameters
        ----------
        input : sequence of torch.Tensor
            Tensors of identical shape (rows, input_dim); the last column
            holds the integer id.

        Returns
        -------
        torch.Tensor
            Shape (len(input), rows, output_dim). Output columns 0:2 go
            through a sigmoid and columns 2:4 through exp.
        """
        # Stack to (len(input), rows, input_dim). The shape comes from the
        # data itself, so callers no longer have to keep self.batch_size in
        # sync with the chunks they pass in.
        # NOTE(review): the LSTM is built without batch_first, so dim 0 (the
        # position in the list) is the sequence axis and dim 1 (rows within a
        # chunk) is the batch axis -- confirm this is the intended layout.
        input_reshape = torch.stack(list(input))
        self.emb_layer = self.embedding(input_reshape[:, :, -1].long())

        # Concat features (without the id column) and embedding, run through LSTM
        lstm_in = torch.cat((input_reshape[:, :, :-1], self.emb_layer), 2)
        lstm_out, self.hidden = self.lstm(lstm_in)

        # The output of every step (not only the final one) goes through the
        # linear layer.
        lin_act = self.linear(lstm_out)
        # Columns 0:2 are squashed to (0, 1); columns 2:4 are made positive.
        y_pred = torch.cat((self.sigmoid(lin_act[:, :, 0:2]), torch.exp(lin_act[:, :, 2:4])), 2)

        return y_pred


# Build the model from the hyper-parameters defined above. num_layers is
# passed through instead of a literal 1, so changing it takes effect.
model = LSTM(input_dim=lstm_input_size,
             embedding_dim=embedding_dim,
             num_playas=num_playas,
             hidden_dim=hidden_dim,
             batch_size=batch_size,
             output_dim=output_dim,
             num_layers=num_layers)

In [None]:
l1_loss = nn.L1Loss()

optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=regularization_weight)

if loss_fn != 'zoib':
    # Build the MAE targets once, as float32 to match the model output dtype.
    train_y_tensor = torch.tensor(train_y).float()
    val_y_tensor = torch.tensor(val_y).float()

#####################
# Train model
#####################

# Training loss per epoch (entries after an early stop remain 0).
hist = np.zeros(num_epochs)

for t in range(num_epochs):
    # Clear gradients from the previous epoch, else they accumulate
    optimiser.zero_grad()

    # Initialise hidden state
    # Don't do this if you want your LSTM to be stateful
    model.hidden = model.init_hidden()

    # Forward pass on the training chunks
    model.batch_size = batch_size
    train_pred = model(train_X_tensor)

    # Validation predictions are only monitored, so skip autograd bookkeeping
    model.batch_size = batch_size_val
    with torch.no_grad():
        val_pred = model(val_X_tensor)

    if loss_fn == 'zoib':
        train_pred = train_pred.view(train_pred.shape[0]*train_pred.shape[1], 4)
        loss = zoib.zoib_loss(train_pred, train_y).float()
        val_pred = val_pred.view(val_pred.shape[0]*val_pred.shape[1], 4)
        with torch.no_grad():
            val_loss = zoib.zoib_loss(val_pred, val_y).float()
    else:
        train_pred = train_pred.view(train_pred.shape[0]*train_pred.shape[1])
        loss = l1_loss(train_pred, train_y_tensor).float()
        val_pred = val_pred.view(val_pred.shape[0]*val_pred.shape[1])
        val_loss = l1_loss(val_pred, val_y_tensor).float()

    if t % 50 == 0:
        print("Epoch ", t, "Train Loss: ", loss.item(), ", Val Loss: ", val_loss.item())
    # Store a plain float, not a tensor that still carries the autograd graph
    hist[t] = loss.item()
    if torch.isnan(loss).item():
        # Diverged: stop before a NaN gradient is applied to the weights
        break

    # Backward pass
    loss.backward()

    # Update parameters
    optimiser.step()

    # Keep graph-free copies of the latest predictions for the results cells
    last_train_pred = train_pred.detach().clone()
    last_val_pred = val_pred.detach().clone()

# View results

In [None]:
def zoib_expected(t):
    """Expected value implied by zero-one-inflated-beta parameters.

    Parameters
    ----------
    t : torch.Tensor of shape (n, 4)
        Columns are read as [p, q, conc1, conc0], where p is the probability
        of an exact 0 and q is the probability of an exact 1 given that the
        value is not 0.

    Returns
    -------
    numpy.ndarray of shape (n,)
        E = q*(1-p) + (1-p)*(1-q) * conc1/(conc1+conc0)
    """
    # .cpu() makes the conversion work for tensors that live on another device.
    params = t.detach().cpu().numpy()
    p_zero, p_one_given_nonzero = params[:, 0], params[:, 1]
    conc1, conc0 = params[:, 2], params[:, 3]

    # Mass at exactly 1, and mass left over for the continuous beta part.
    prob_1 = p_one_given_nonzero * (1 - p_zero)
    prob_beta = (1 - p_zero) * (1 - p_one_given_nonzero)
    # Mean of a beta distribution with concentrations (conc1, conc0).
    beta_expected = conc1 / (conc0 + conc1)
    return prob_1 + prob_beta * beta_expected

In [None]:
# Predicted expected inundation vs. observed inundation, training split.
fig, ax = plt.subplots()
ax.scatter(zoib_expected(last_train_pred), train_y)
ax.set(xlabel='Predicted inundation (ZOIB expected value)',
       ylabel='Observed inundation',
       title='Training set: predicted vs. observed');

In [None]:
# Predicted and observed series side by side; xlim zooms in on rows 0-100.
ax = pd.DataFrame({'Pred':zoib_expected(last_train_pred), 'True':train_y}).plot(xlim=[0,100])
ax.set(xlabel='Training row', ylabel='Inundation',
       title='Training set: predicted vs. observed (rows 0-100)');

In [None]:
# Predicted expected inundation vs. observed inundation, validation split.
fig, ax = plt.subplots()
ax.scatter(zoib_expected(last_val_pred), val_y)
ax.set(xlabel='Predicted inundation (ZOIB expected value)',
       ylabel='Observed inundation',
       title='Validation set: predicted vs. observed');

In [None]:
# Predicted and observed series side by side for the validation split.
ax = pd.DataFrame({'Pred':zoib_expected(last_val_pred), 'True':val_y}).plot()
ax.set(xlabel='Validation row', ylabel='Inundation',
       title='Validation set: predicted vs. observed');

In [None]:
# Pairwise scatter plots of the four predicted outputs on the training rows.
param_df = pd.DataFrame(
    last_train_pred.detach().numpy(),
    columns=['p', 'q', 'conc1', 'conc0'],
)
pd.plotting.scatter_matrix(param_df)
plt.show()