<a href="https://colab.research.google.com/github/issmythe/CS194App/blob/master/kenya/full_season_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Colab setup
from google.colab import auth, drive
from datetime import datetime

auth.authenticate_user()
drive.mount('/content/drive')

FIG_PATH = 'content/drive/MyDrive/figs/in_season/%s/' % datetime.today().strftime('%Y%m')
CODE_PATH = 'drive/MyDrive/Kenya/code/src/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# GCS setup
from datetime import datetime
FIG_PATH = '/home/jupyter/figs/%s/' % datetime.today().strftime('%Y%m')
CODE_PATH = 'nsf-p2/kenya/utils/'


# Setup

In [4]:
# Installs
!pip install fsspec gcsfs geopandas kaleido plotly torch &> /dev/null


In [5]:
#@title Imports
import itertools
import os
import sys
import time

import geopandas as gpd
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from functools import reduce

# Plotting
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Analysis
from scipy.stats import pearsonr
from scipy.stats.mstats import winsorize

import sklearn
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_predict, HalvingGridSearchCV

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

# Helpers
sys.path.append(CODE_PATH)
sys.path.append('content/' + CODE_PATH)

from harmonics import get_id_tuple
from utils import *

# Tensorflow
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter


In [6]:
#@title Constants and misc setup
sns.set()
path = os.getcwd()
os.chdir('../..')

GT_PATH = 'gs://nsf-phase2/yield/ground_data/Kenya/1AF_cropcuts_%d.csv'

DATA_PATH = 'content/drive/MyDrive/Kenya/data/'
! mkdir $FIG_PATH

pd.options.mode.chained_assignment = None

N_PERIODS = 14


mkdir: cannot create directory ‘content/drive/MyDrive/figs/in_season/202209/’: File exists


# Data

In [7]:
#@title Get constants and prediction df
START_MONTH = 3
SAMPLE_PERIODS = [(x - 2) * 2 for x in [5, 6, 7, 8, 10]]
YEARS = range(2016, 2020)

pred_df = pd.read_csv('gs://nsf-phase2/kenya/formatted_predictors_df.csv')

In [8]:
#@title Get yield df
def get_id(row):
    data = [int(row['year']), round(row['latitude'], 6), round(row['longitude'], 6)]
    return '_'.join([str(x) for x in data])

ground_data_all = read_ground_data()
ground_data_all['id'] = ground_data_all.apply(get_id, axis=1)
yields = ground_data_all.drop_duplicates(subset='id', keep=False)
yields = yields[(yields['yield'] >= yields['yield'].quantile(0.01)) &
                (yields['yield'] <= yields['yield'].quantile(0.99))]



# Metaparameters exploratory

In [9]:
dirs = ['gs://nsf-phase2/kenya/metaparams_test2016.csv',
        'gs://nsf-phase2/kenya/metaparams_test2017.csv',
        'gs://nsf-phase2/kenya/metaparams_v0.csv',
        'gs://nsf-phase2/kenya/metaparams_test2018_100.csv',
        'gs://nsf-phase2/kenya/metaparams_test2019.csv']

years = [2016, 2017, 2018, 2018, 2019]
early_stopping = [True, True, True, False, True]

mp = pd.concat([pd.read_csv(dirs[i]).assign(year=years[i], early_stopping=early_stopping[i]) 
                for i in range(len(dirs))])


In [25]:
"""
batch_size: 32 same/worse
conv_layers: 2 same/worse, might need more epochs
dense_neurons: 32 a little better, 8 a little worse
dropout: 0.5 is bad
initial_filters: 16 a little better, 4 a little worse
lr: 1e-5 is too low - check if 1e-3 looks stable, if so just go with it?
stride: seems worse though maybe better w diff kernel?
weight decay: can leave out
"""
print()




In [48]:
i = 10
p = mp['param'].unique()[i]
mp = mp.sort_values('corr')

# mp.drop_duplicates(subset=['year', 'param'], keep='last').sort_values('param')
baseline = mp.loc[mp['value'] == "['gcvi', 'dday29C', 'rain']"]
baseline = baseline[['year', 'early_stopping', 'corr']].rename({'corr': 'bl_corr'}, axis=1)
mp = mp.merge(baseline)
mp['corr_delta'] = mp['corr'] - mp['bl_corr']
mp = mp.sort_values(['param', 'value', 'year'])
x = mp[mp['param'] == p]
# x['value'].iloc[6]


In [None]:
df1 = mp.loc[(mp['year'] == 2018) & (mp['early_stopping'] == True), ['param', 'value', 'epochs', 'corr']]\
    .rename({'corr': 'corr_stop', 'epochs': 'epochs_stop'}, axis=1)
df2 = mp.loc[(mp['year'] == 2018) & (mp['early_stopping'] == False), ['param', 'value', 'epochs', 'corr']]
df = df1.merge(df2)
df['delta'] = (df['corr'] - df['corr_stop']) / df['corr'] * 100
df['delta'].min(), df['delta'].mean(), df['delta'].median(), df['delta'].max()

In [None]:
df

# Fit NN

In [49]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
N_EPOCHS = 100

vars_0 = ['gcvi', 'dday29C', 'rain']
vars_1 = vars_0 + ['nbr1', 'nir', 'ndti', 't_min']
vars_full = vars_1 + ['blue', 'green', 'ndvi', 'rded4', 'red', 'sndvi',
                      'swir1', 'swir2', 'dday10C', 't_max']


In [50]:
#@title Train/test helpers

def train(model, device, train_loader, optimizer, criterion):
    running_loss = 0.0

    model.train()
    for batch_idx, data in enumerate(train_loader):
        x, x_flat, target = data[:-2], data[-2], data[-1]
        inputs = [torch.Tensor(ts).to(device) for ts in x] + [torch.Tensor(x_flat).to(device)]
        labels = target.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return (running_loss / batch_idx) ** 0.5


def test(model, device, test_x, test_y, criterion, predict=False):
    model.eval()
    with torch.no_grad():
        x, x_flat = test_x
        data = [torch.Tensor(ts).to(device) for ts in x] + [torch.Tensor(x_flat).to(device)]
        target = torch.Tensor(test_y).to(device)
        outputs = model(data)
        detached = outputs.detach().cpu().numpy().flatten() if device.type == 'cuda' \
            else outputs.detach().numpy().flatten()
        if predict:
            return detached

        mse = criterion(outputs, target).item()
        corr = pearsonr(detached, test_y.flatten())[0]

    return mse ** 0.5, corr





In [51]:
#@title Cross-validation loop helpers

def split_train_test(test_year):
    np.random.seed(123)
    train_pd = pred_df[pred_df['year'] != test_year]
    val_pd = train_pd.sample(frac=0.2)
    train_pd = train_pd[~train_pd['id'].isin(val_pd['id'])].sample(frac=1)
    test_pd = pred_df[pred_df['year'] == test_year]
    return train_pd, val_pd, test_pd

def get_model_data(df, ts_cols, ss, train, batch_size=32):

    def get_ts_cols(var, start=0, end=14):
        return [f'{var}_{i}' for i in range(start, end)]

    ts_arrays = []
    for c in ts_cols:
        arr = np.array(df[get_ts_cols(c)])
        n, t = arr.shape
        ts_arrays.append(arr.reshape(n, 1, t))

    x_flat = np.array(df[['eatlas_mean']])
    y = np.array(df['yield']).reshape(len(df), 1)
    y = ss.fit_transform(y) if train else ss.transform(y)

    tensors = [torch.Tensor(ts).type(torch.float32) for ts in ts_arrays] + \
        [torch.Tensor(x_flat).type(torch.float32), torch.Tensor(y).type(torch.float32)]

    ds = TensorDataset(*tensors)
    loader = DataLoader(ds, batch_size=batch_size)
    return (ts_arrays, x_flat), y, ds, loader

def get_model(params, device):
    model_kw = {k: params[k] for k in ['conv_layers', 'initial_filters', 'dense_neurons',
                                       'dropout', 'kernel_sz', 'stride']}
    model_kw['n_ts'] = len(params['ts_cols'])

    model = Net(**model_kw)
    model.to(device)
    optimizer = params['optimizer'](
        model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    optimizer.zero_grad()

    return model, optimizer

def do_train_loop(model, device, optimizer, criterion, max_epochs,
                  train_loader, val_x, val_y, tol=3, verbose=10):
    rmse, corr = [], []
    val_rmse, val_corr = test(model, device, val_x, val_y, criterion)
    if verbose > 0:
        print(f'{0} / {N_EPOCHS} complete.', f'Test: {round(val_rmse, 3)} {round(val_corr, 3)}')
    rmse.append(val_rmse)
    corr.append(val_corr)
    min_rmse, n_inc = val_rmse, 0

    for epoch in range(max_epochs):
        train_rmse = train(model, device, train_loader, optimizer, criterion)
        val_rmse, val_corr = test(model, device, val_x, val_y, criterion)
        rmse.append(val_rmse)
        corr.append(val_corr)
        
        n_inc = n_inc + 1 if val_rmse > min_rmse else 0
        min_rmse = min(min_rmse, val_rmse)

        if verbose > 0 and (epoch + 1) % verbose == 0:
            print(f'{epoch + 1} / {N_EPOCHS} complete. Train: {round(train_rmse, 3)}.',
                f'Test: {round(val_rmse, 3)} {round(val_corr, 3)}', n_inc)
        if n_inc >= tol:
            break
    
    return model, rmse, corr, epoch



In [53]:
#@title Define simple CNN
# From: https://pytorch.org/tutorials/recipes/recipes/defining_a_neural_network.html
# Also used: https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html

x_dim, x_flat_dim = 14, 1
kernel_sz = 3
# n_ts = len(TS_COLS)

class Net(nn.Module):

    def __conv_calc(self, in_dim, pad, stride, k):
        return int(np.floor((in_dim + 2 * pad - (k - 1) - 1) / stride + 1))

    # def __init__(self, initial_filters=4, dense_neurons=16, dropout=0.1, conv_layers=1, verbose=False):
    def __init__(self, conv_layers, initial_filters, dense_neurons, kernel_sz, stride, dropout, n_ts, verbose=False):
        super(Net, self).__init__()
        self.verbose = verbose
        self.conv_layers = conv_layers
        self.n_ts = n_ts
        n_channels, width = 1, x_dim

        for c in range(conv_layers):
            for i in range(n_ts):
                setattr(self, f'ts_conv{c}_{i}', nn.Conv1d(n_channels, initial_filters, kernel_sz, stride))
                setattr(self, f'ts_bn{c}_{i}', nn.BatchNorm1d(initial_filters))
            n_channels = initial_filters
            width = np.floor(self.__conv_calc(width, 0, stride, kernel_sz) / 2)
            
        self.dropout1 = nn.Dropout1d(dropout)
        concat_dim = width * initial_filters * n_ts
        
        self.fc1 = nn.Linear(int(concat_dim), dense_neurons)
        self.fc2 = nn.Linear(dense_neurons + 1, 1)
 

    def quick_print(self, x, i, dat=''):
        if self.verbose:
            print(i, x.shape, dat)
        return i + 1


    def forward(self, x):
        x_ts, x_flat = x[:-1], x[-1]
        i = 0

        for c in range(self.conv_layers):
            for ts_i in range(self.n_ts):
                i = self.quick_print(x_ts[ts_i], i, c)
                x_ts[ts_i] = getattr(self, f'ts_conv{c}_{ts_i}')(x_ts[ts_i])
                x_ts[ts_i] = getattr(self, f'ts_bn{c}_{ts_i}')(x_ts[ts_i])
                i = self.quick_print(x_ts[ts_i], i, c)
                x_ts[ts_i] = F.relu(x_ts[ts_i])
                x_ts[ts_i] = F.max_pool1d(x_ts[ts_i], 2)
                x_ts[ts_i] = self.dropout1(x_ts[ts_i])
                i = self.quick_print(x_ts[ts_i], i, c)
        
        for ts_i in range(self.n_ts):
            x_ts[ts_i] = torch.flatten(x_ts[ts_i], 1)
            i = self.quick_print(x_ts[ts_i], i)

        x_ts = torch.cat(x_ts, 1)
        i = self.quick_print(x_ts, i)

        x_ts = self.fc1(x_ts)
        x_ts = F.relu(x_ts)
        i = self.quick_print(x_ts, i)

        x = torch.cat((x_ts, x_flat), 1)
        x = self.fc2(x)

        return x

# print(model_kw)
# my_nn = Net(verbose=True, **model_kw) #Net(1, 4, 8, 5, 3, 0.1, verbose=True)
# optimizer = optim.SGD(my_nn.parameters(), lr=0.001)
# optimizer.zero_grad()

# x, x_flat = train_x[0], train_x[1]

# test_im = [torch.from_numpy(i[:2]).type(torch.float32) for i in x] + \
#           [torch.from_numpy(x_flat[:2]).type(torch.float32)]

# result = my_nn(test_im)
# result

# # del my_nn



In [None]:
"""
batch_size: 32 same/worse
conv_layers: 2 same/worse, might need more epochs
dense_neurons: 32 a little better, 8 a little worse
dropout: 0.5 is bad
initial_filters: 16 a little better, 4 a little worse
lr: 1e-5 is too low - check if 1e-3 looks stable, if so just go with it?
stride: seems worse though maybe better w diff kernel?
weight decay: can leave out
"""
print()

In [82]:
param_dict_full = {'lr': [1e-5, 1e-4, 1e-3],
     'weight_decay': [0, 1e-5],
     'optimizer': [optim.Adam],
     'max_epochs': [10],
     'batch_size': [32, 64],
     'conv_layers': [1, 2],
     'initial_filters': [4, 8, 16],
     'dense_neurons': [8, 16, 32],
     'kernel_sz': [2, 3, 4, 5],
     'stride': [1, 2, 3],
     'dropout': [0, 0.1, 0.5],
     'ts_cols': [vars_full, vars_0, vars_1, vars_full]
    }

param_dict_small = {'lr': [1e-4, 1e-3],
     'weight_decay': [0],
     'optimizer': [optim.Adam],
     'max_epochs': [100],
     'batch_size': [64],
     'conv_layers': [1, 2],
     'initial_filters': [4, 8, 16],
     'dense_neurons': [8, 16, 32],
     'kernel_sz': [2, 3, 4],
     'stride': [1, 2],
     'dropout': [0, 0.1, 0.25],
     'ts_cols': [vars_0, vars_1, vars_full]
    }

base_params = {'lr': [1e-4],
     'weight_decay': [0],
     'optimizer': [optim.Adam],
     'max_epochs': [100],
     'batch_size': [64],
     'conv_layers': [1],
     'initial_filters': [8],
     'dense_neurons': [16],
     'kernel_sz': [3],
     'stride': [1],
     'dropout': [0.1],
     'ts_cols': [vars_0]
    }

keys, values = zip(*param_dict_full.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
print(len(param_combinations))

keys, values = zip(*param_dict_small.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
param_combinations = [p for p in param_combinations if p['kernel_sz'] > p['stride']]
print(len(param_combinations))


keys, values = zip(*base_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
params = param_combinations[0]
len(param_combinations)

params



31104
1620


{'lr': 0.0001,
 'weight_decay': 0,
 'optimizer': torch.optim.adam.Adam,
 'max_epochs': 100,
 'batch_size': 64,
 'conv_layers': 1,
 'initial_filters': 8,
 'dense_neurons': 16,
 'kernel_sz': 3,
 'stride': 1,
 'dropout': 0.1,
 'ts_cols': ['gcvi', 'dday29C', 'rain']}

In [73]:
test_year = 2018
performance = []

def train_for_params(test_year, params, track_param):
    # Get datasets
    ds_kw = {'ss': StandardScaler(), 'batch_size': params['batch_size'], 'ts_cols': params['ts_cols']}
    train_pd, val_pd, test_pd = split_train_test(test_year)
    train_x, train_y, train_dataset, train_loader = get_model_data(train_pd, train=True, **ds_kw)
    val_x, val_y, val_dataset, val_loader = get_model_data(val_pd, train=False, **ds_kw)
    test_x, test_y, test_dataset, test_loader = get_model_data(test_pd, train=False, **ds_kw)

    # Setup model
    model, optimizer = get_model(params, device)

    # Train
    model, rmse, corr, epoch = do_train_loop(model, device, optimizer, criterion,
        params['max_epochs'], train_loader, val_x, val_y, tol=300, verbose=1)
    
    print([track_param, params[track_param], round(rmse[-1], 3), round(corr[-1], 3), epoch])
    # return [track_param, params[track_param], rmse[-1], corr[-1], epoch]
    return track_param, params[track_param], rmse, corr, epoch

# k = list(param_dict.keys())[0]
# curr_dict = {i: base_params[i] for i in base_params}
# curr_dict[k] = set(param_dict[k]) - set(base_params[k])

# keys, values = zip(*curr_dict.items())
# param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# params = param_combinations[0]
# performance.append(train_for_params(test_year, params, k))

_, _, rmse, corr, _ = train_for_params(test_year, params, k)



0 / 100 complete. Test: 3.547 0.01
1 / 100 complete. Train: 0.953. Test: 0.908 0.437 0
2 / 100 complete. Train: 0.924. Test: 0.895 0.445 0
3 / 100 complete. Train: 0.917. Test: 0.889 0.454 0
4 / 100 complete. Train: 0.915. Test: 0.887 0.459 0
5 / 100 complete. Train: 0.914. Test: 0.885 0.462 0
6 / 100 complete. Train: 0.909. Test: 0.883 0.464 0
7 / 100 complete. Train: 0.909. Test: 0.882 0.467 0
8 / 100 complete. Train: 0.908. Test: 0.881 0.468 0
9 / 100 complete. Train: 0.907. Test: 0.88 0.47 0
10 / 100 complete. Train: 0.906. Test: 0.88 0.471 0
11 / 100 complete. Train: 0.906. Test: 0.879 0.473 0
12 / 100 complete. Train: 0.904. Test: 0.878 0.475 0
13 / 100 complete. Train: 0.903. Test: 0.877 0.476 0
14 / 100 complete. Train: 0.903. Test: 0.876 0.477 0
15 / 100 complete. Train: 0.903. Test: 0.876 0.478 0
16 / 100 complete. Train: 0.903. Test: 0.875 0.479 0
17 / 100 complete. Train: 0.9. Test: 0.875 0.48 0
18 / 100 complete. Train: 0.9. Test: 0.874 0.481 0
19 / 100 complete. Train: 0.

In [None]:
performance_df = pd.DataFrame(performance, columns=['param', 'value', 'rmse', 'corr', 'epochs'])

In [74]:
#@title Plot train/val error
from plotly import subplots

fig = go.Figure()
fig = subplots.make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x=[x for x in range(N_EPOCHS)], y=rmse[1:], showlegend=False), row=1, col=1)
fig.add_trace(go.Scatter(x=[x for x in range(N_EPOCHS)], y=corr[1:], showlegend=False), row=1, col=2)
fig.show()

In [None]:
#@title Plot predictions
x, y = test_x, test_y
# x, y = train_x, train_y
x, y = val_x, val_y

preds = test(model, device, x, y, criterion, predict=True)
transform_y = ss.inverse_transform(y).flatten()
transform_preds = ss.inverse_transform(preds.reshape(y.shape)).flatten()

rmse_sc = ((transform_y - transform_preds) ** 2).mean() ** 0.5
r, p = pearsonr(transform_y, transform_preds)
print(round(rmse_sc, 3), round(r ** 2, 3))

sns.scatterplot(transform_y, transform_preds)
# Train: 0.175 0.886

In [None]:
# 2016 0.61 0.142 
# 2017 0.531 0.119
# 2018 0.539 0.008
# 2019 0.654 0.032
