# Setup

In [None]:
#@title Mount drive
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only step).
# NOTE(review): the path constants defined later are relative ('drive/MyDrive/...'),
# so they presumably rely on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#@title Imports
import sys
import os
import time

import numpy as np
import pandas as pd

# Analysis
from datetime import datetime, timedelta
from scipy.stats import pearsonr

# Plotting
import matplotlib
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go


In [None]:
#@title Constants
# Data folders on the mounted Drive. The paths are relative and end with a
# trailing slash; later cells append '/<file>' to them.
DATA_PATH = 'drive/MyDrive/current_research_projects/us_data/'
DL_DATA_PATH = 'drive/MyDrive/current_research_projects/dl_yield_forecasts/data/'
HEAT_DATA_PATH = 'drive/MyDrive/current_research_projects/heat_separability/data/'

# HEAT_DATA_PATH = 'drive/MyDrive/research_ideas/heat_separability/data/'

# Figures are written to a per-month folder (YYYYMM of the day the cell runs).
datestr = datetime.today().strftime('%Y%m')
FIG_PATH = f'drive/MyDrive/current_research_projects/dl_yield_forecasts/figs/{datestr}/'
# Pure-Python replacement for `! mkdir -p $FIG_PATH`: creates missing parents
# and is a no-op when the folder already exists.
os.makedirs(FIG_PATH, exist_ok=True)

PROCESS_DATA = False

In [None]:
#@title Helpers
def get_rmse(x, y):
    """Return the root-mean-square of the element-wise difference ``x - y``.

    ``x`` and ``y`` must support subtraction and the result must expose
    ``.mean()`` (e.g. numpy arrays or pandas Series).
    """
    residual = x - y
    mean_squared = (residual ** 2).mean()
    return mean_squared ** 0.5


def get_corr(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Only the first element of scipy's ``pearsonr`` result (the coefficient)
    is returned; the p-value is discarded.
    """
    coefficient, _p_value = pearsonr(x, y)
    return coefficient


def get_r2(x, y):
    """Return the squared Pearson correlation of ``x`` and ``y``.

    This is ``get_corr(x, y) ** 2``, i.e. the square of the correlation
    coefficient rather than ``1 - SS_res / SS_tot``.
    """
    r = get_corr(x, y)
    return r ** 2


def quick_summarize(df, name=''):
    """Print a one-line summary of a prediction table.

    The line contains ``name``, ``len(df)``, the RMSE between the
    ``'log_yield'`` and ``'pred'`` columns rounded to 6 decimals, and their
    squared correlation rounded to 3 decimals.
    """
    observed = df['log_yield']
    predicted = df['pred']
    rmse = round(get_rmse(observed, predicted), 6)
    r2 = round(get_r2(observed, predicted), 3)
    print(name, len(df), rmse, r2)


# Evaluate

In [None]:
#@title Read satellite model results
# Load the satellite model predictions and one set of DL predictions from the
# bagging/ folder.
sat_preds = pd.read_csv(f'{HEAT_DATA_PATH}/yield_predictions/satellite_all_stage_cdl_adj.csv')

yield_dl_dir = f'{DL_DATA_PATH}/bagging/'
dl_preds = pd.read_csv(f'{yield_dl_dir}/pretrain_3_temperature2.csv')

# Restrict each table to the (fips, year) pairs present in the other one, so
# both models are scored on the same rows.
sat_preds = sat_preds.merge(dl_preds[['fips', 'year']])
dl_preds = dl_preds.merge(sat_preds[['fips', 'year']])

# Only the last expression of a cell is rendered, so the satellite metrics
# were previously computed and silently discarded; print them explicitly.
print('Satellite (RMSE, r):',
      (get_rmse(sat_preds['log_yield'], sat_preds['pred']),
       get_corr(sat_preds['log_yield'], sat_preds['pred'])))

# DL (RMSE, r) -- kept as the cell's displayed value.
get_rmse(dl_preds['log_yield'], dl_preds['pred']), \
get_corr(dl_preds['log_yield'], dl_preds['pred'])

(0.13430288215151912, 0.8354042481655665)

In [None]:
#@title Read linear model results
yield_lm_dir = f'{HEAT_DATA_PATH}/yield_predictions/'

lm_baseline_cv = pd.read_csv(f'{yield_lm_dir}/baseline_preds_31_states.csv')
lm_march_cv = pd.read_csv(f'{yield_lm_dir}/march_through_aug_31_states.csv')
lm_april_cv = pd.read_csv(f'{yield_lm_dir}/april_through_sept_31_states.csv')
lm_doy_season_cv = pd.read_csv(f'{yield_lm_dir}/doy_full_season_31_states.csv')

lm_baseline_test = pd.read_csv(f'{yield_lm_dir}/baseline_test_period.csv')
lm_march_test = pd.read_csv(f'{yield_lm_dir}/march_test_period.csv')
lm_april_test = pd.read_csv(f'{yield_lm_dir}/april_test_period.csv')
lm_doy_season_test = pd.read_csv(f'{yield_lm_dir}/doy_season_test_period.csv')

satellite_lm = pd.read_csv(f'{yield_lm_dir}/satellite_all_stage_cdl_adj.csv')


In [None]:
#@title Read DL model results
yield_dl_dir = f'{DL_DATA_PATH}/bagging/'

nn_baseline = pd.read_csv(f'{yield_dl_dir}/baseline_nn.csv')
cnn_dday = pd.read_csv(f'{yield_dl_dir}/basic_cnn_dday.csv')
cnn_temp = pd.read_csv(f'{yield_dl_dir}/basic_cnn_temperature2.csv')
lstm_dday = pd.read_csv(f'{yield_dl_dir}/basic_lstm_dday.csv')
lstm_temp = pd.read_csv(f'{yield_dl_dir}/basic_lstm_temperature2.csv')
hybrid_dday = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_dday.csv')
hybrid_temp = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_temperature.csv')
# seg_dday = pd.read_csv(f'{yield_dl_dir}/segmented_cnn_dday.csv')
# seg_cnn_temp = pd.read_csv(f'{yield_dl_dir}/segmented_cnn_temperature.csv')
# seg_lstm_dday = pd.read_csv(f'{yield_dl_dir}/segmented_lstm_dday.csv')
# seg_lstm_temp = pd.read_csv(f'{yield_dl_dir}/segmented_lstm_temperature.csv')
# seg_lstm2_dday = pd.read_csv(f'{yield_dl_dir}/segmented_lstm2_dday.csv')
hybrid_temp0 = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_0_temperature.csv')
hybrid_temp1 = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_1_temperature.csv')
hybrid_temp2 = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_2_temperature.csv')
hybrid_temp3 = pd.read_csv(f'{yield_dl_dir}/hybrid_lstm_3_temperature.csv')
seg_lstm_temp = pd.read_csv(f'{yield_dl_dir}/segmented_lstm2_temperature.csv')

pretrain_temp0 = pd.read_csv(f'{yield_dl_dir}/pretrain_0_temperature.csv')
pretrain_temp1 = pd.read_csv(f'{yield_dl_dir}/pretrain_1_temperature.csv')
pretrain_temp3 = pd.read_csv(f'{yield_dl_dir}/pretrain_3_temperature.csv')
pretrain_temp3_unnorm = pd.read_csv(f'{yield_dl_dir}/pretrain_3_temperature2.csv')

eval_sample = cnn_dday[['fips', 'year']]


In [None]:
#@title Get model performance
eval = [('Baseline CV', lm_baseline_cv[lm_baseline_cv['year'] >= 2017]),
        ('March CV', lm_march_cv[lm_march_cv['year'] >= 2017]),
        ('April CV', lm_april_cv[lm_april_cv['year'] >= 2017]),
        ('DOY season CV', lm_doy_season_cv[lm_doy_season_cv['year'] >= 2017]),
        ('\nBaseline test', lm_baseline_test),
        ('March test', lm_march_test),
        ('April test', lm_april_test),
        ('DOY season test', lm_doy_season_test),
        ('\nANN, no weather', nn_baseline),
        ('CNN, temperature', cnn_temp),
        ('LSTM, temperature', lstm_temp),
        ('Hybrid LSTM, temperature', hybrid_temp),
        ('\nSegmented LSTM, temperature', seg_lstm_temp),
        ('Hybrid LSTM, temperature 0', hybrid_temp0),
        ('Hybrid LSTM, temperature 1', hybrid_temp1),
        ('Hybrid LSTM, temperature 2', hybrid_temp2),
        ('Hybrid LSTM, temperature 3', hybrid_temp3),
        ('\nPretrain, hybrid LSTM 0', pretrain_temp0),
        ('Pretrain, hybrid LSTM 1', pretrain_temp1),
        ('Pretrain, hybrid LSTM 3', pretrain_temp3),
        ('Pretrain, hybrid LSTM 3, unnormalized', pretrain_temp3_unnorm),
        ('\nSatellite LM', satellite_lm.merge(pretrain_temp3_unnorm[['fips', 'year']])),
        ]

# Toggle to get results for all or satellite sample only
# eval_sample = cnn_dday[['fips', 'year']]
eval_sample = satellite_lm[['fips', 'year']].merge(cnn_dday[['fips', 'year']])
for name, df in eval:
    quick_summarize(df.merge(eval_sample), name)


Baseline CV 6268 0.150454 0.625
March CV 6268 0.149825 0.634
April CV 6268 0.151817 0.624
DOY season CV 6268 0.146531 0.645

Baseline test 6268 0.208261 0.384
March test 6268 0.202392 0.367
April test 6268 0.216045 0.309
DOY season test 6268 0.209891 0.335

ANN, no weather 6268 0.158322 0.621
CNN, temperature 6268 0.13786 0.711
LSTM, temperature 6268 0.158066 0.668
Hybrid LSTM, temperature 6268 0.132371 0.702

Segmented LSTM, temperature 6268 0.137336 0.69
Hybrid LSTM, temperature 0 6268 0.134929 0.689
Hybrid LSTM, temperature 1 6268 0.140933 0.674
Hybrid LSTM, temperature 2 6268 0.137691 0.69
Hybrid LSTM, temperature 3 6268 0.135513 0.692

Pretrain, hybrid LSTM 0 6268 0.151769 0.614
Pretrain, hybrid LSTM 1 6268 0.138522 0.68
Pretrain, hybrid LSTM 3 6268 0.134407 0.705
Pretrain, hybrid LSTM 3, unnormalized 6268 0.134303 0.698

Satellite LM 6268 0.139124 0.687


In [None]:
#@title Check that results look reasonable
from plotly import subplots

lm_df = lm_doy_season_cv[lm_doy_season_cv['year'] >= 2017]
lm_rmse = round(get_rmse(lm_df['log_yield'], lm_df['pred']), 3)
lm_corr = round(get_corr(lm_df['log_yield'], lm_df['pred']), 3)
dl_df = hybrid_dday[hybrid_dday['year'] >= 2017]
dl_rmse = round(get_rmse(dl_df['log_yield'], dl_df['pred']), 3)
dl_corr = round(get_corr(dl_df['log_yield'], dl_df['pred']), 3)

fig = plotly.subplots.make_subplots(rows=1, cols=2, subplot_titles=
    [f'Best LM: RMSE={lm_rmse}, R={lm_corr}', f'Best DL: RMSE={dl_rmse}, R={dl_corr}'])

fig.add_trace(go.Scatter(
    x=lm_df['log_yield'], y=lm_df['pred'], mode='markers', showlegend=False), row=1, col=1)

fig.add_trace(go.Scatter(
    x=dl_df['log_yield'], y=dl_df['pred'], mode='markers', showlegend=False), row=1, col=2)

fig.update_layout(height=500, width=1000)
# plotly.io.write_image(fig, f'{FIG_PATH}/{fname}.png', scale=2)
fig.show()

In [None]:
#@title Get percent change in performance
lm_df = lm_doy_season_cv[lm_doy_season_cv['year'] >= 2017].merge(eval_sample)
best_lm_corr = get_corr(lm_df['pred'], lm_df['log_yield'])
cnn_dday_corr = get_corr(cnn_dday['pred'], cnn_dday['log_yield'])
cnn_temp_corr = get_corr(cnn_temp['pred'], cnn_temp['log_yield'])

best_lm_rmse = get_rmse(lm_df['pred'], lm_df['log_yield'])
cnn_dday_rmse = get_rmse(cnn_dday['pred'], cnn_dday['log_yield'])
cnn_temp_rmse = get_rmse(cnn_temp['pred'], cnn_temp['log_yield'])

print((cnn_dday_rmse - best_lm_rmse) / best_lm_rmse * 100)
print((cnn_dday_corr - best_lm_corr) / best_lm_corr * 100)
print((cnn_temp_rmse - best_lm_rmse) / best_lm_rmse * 100)
print((cnn_temp_corr - best_lm_corr) / best_lm_corr * 100)


In [None]:
#@title Compare results to Khaki et al
# State codes (leading two digits of the five-digit FIPS) for:
# Indiana, Illinois, Iowa, Minnesota, Missouri, Nebraska, Kansas, North Dakota,
# South Dakota, Ohio, Kentucky, Michigan, and Wisconsin
states = [18, 17, 19, 27, 29, 31, 20, 38, 46, 39, 21, 26, 55]


def _state_from_fips(fips):
    """Return the first two characters of the zero-padded FIPS string as an int."""
    return int(str(fips).zfill(5)[:2])


# Adds a 'state' column to pretrain_temp3_unnorm in place.
pretrain_temp3_unnorm['state'] = pretrain_temp3_unnorm['fips'].apply(_state_from_fips)
in_comparison_states = pretrain_temp3_unnorm['state'].isin(states)
df = pretrain_temp3_unnorm[in_comparison_states]

for y in (2017, 2018):
    ydf = df[df['year'] == y]
    # Metrics are computed on exp(log_yield) and exp(pred), not on the logs.
    observed = np.exp(ydf['log_yield'])
    predicted = np.exp(ydf['pred'])
    y_rmse = get_rmse(observed, predicted)
    y_r = get_corr(observed, predicted)
    print(y, round(y_rmse, 2), round(y_r * 100, 2))

# Recorded values, printed as: RMSE, correlation * 100.
# Khaki et al:
# 15.74 88.24
# 17.64 87.82

# Me:
# 2017 18.65 86.26
# 2018 17.42 85.7

# Process results

In [None]:
#@title Process grid search output
def process_grid_search(foldername):
    """Combine the per-run CSVs of one grid search into a single table.

    Reads every regular file in ``{DL_DATA_PATH}/grid_search/{foldername}``,
    concatenates them, keeps the first row for each ``param_combo`` value and
    writes the result next to the folder as
    ``{DL_DATA_PATH}/grid_search/{foldername}.csv``.

    Files are read in sorted name order so that the row kept for a duplicated
    ``param_combo`` does not depend on the arbitrary order of ``os.listdir``.
    Sub-directories (e.g. notebook checkpoint folders) are skipped.

    Args:
        foldername: Name of the grid-search folder under ``grid_search/``.

    Returns:
        The de-duplicated DataFrame that was written to disk.

    Raises:
        ValueError: If the folder contains no files to combine.
    """
    search_dir = f'{DL_DATA_PATH}/grid_search/{foldername}'
    result_files = sorted(
        fname for fname in os.listdir(search_dir)
        if os.path.isfile(f'{search_dir}/{fname}'))
    if not result_files:
        raise ValueError(f'No grid search result files found in {search_dir}')

    df = pd.concat(
        [pd.read_csv(f'{search_dir}/{fname}') for fname in result_files]
    ).drop_duplicates(subset='param_combo')
    df.to_csv(f'{search_dir}.csv', index=False)
    return df

# Combine the grid-search outputs in the 'pretrain_3_temperature' folder and
# write the merged CSV next to that folder.
# NOTE(review): PROCESS_DATA is defined in the Constants cell but is not
# checked here, so this step runs on every execution -- confirm whether it
# should be guarded by that flag.
df = process_grid_search('pretrain_3_temperature')


In [None]:
#@title Process bagging output
N_BAGGING_FOLDS = 100


def read_bagging_preds(dir):
    """Load every bagging fold from ``dir`` and average the predictions.

    ``dir`` must contain exactly ``N_BAGGING_FOLDS`` entries; the files
    ``0.csv`` .. ``{N_BAGGING_FOLDS - 1}.csv`` are read and tagged with their
    fold number in an ``iter`` column.

    Returns:
        (preds, preds_full): ``preds`` holds the per-(fips, year) mean of all
        columns with the ``iter`` column removed; ``preds_full`` is the
        concatenation of all folds, including ``iter``.
    """
    n_entries = len(set(os.listdir(dir)))
    assert n_entries == N_BAGGING_FOLDS

    fold_frames = []
    for fold in range(N_BAGGING_FOLDS):
        fold_df = pd.read_csv(f'{dir}/{fold}.csv')
        fold_frames.append(fold_df.assign(iter=fold))
    preds_full = pd.concat(fold_frames)

    averaged = preds_full.groupby(['fips', 'year']).mean()
    preds = averaged.reset_index().drop(columns='iter')
    return preds, preds_full

def get_iter_performance(df):
    """Score the bagged ensemble as folds are added one at a time.

    For each ``i`` in ``range(N_BAGGING_FOLDS)`` the rows with ``iter <= i``
    are averaged per (fips, year) and the averaged ``pred`` is compared with
    the averaged ``log_yield``.

    The per-fold average is computed once and reused for both metrics, and
    only the ``log_yield`` and ``pred`` columns are averaged so that extra
    non-numeric columns in ``df`` cannot break the aggregation.

    Args:
        df: Stacked fold predictions with ``fips``, ``year``, ``iter``,
            ``log_yield`` and ``pred`` columns.

    Returns:
        DataFrame with columns ``r`` (``get_corr``), ``rmse`` (``get_rmse``)
        and ``iter``, one row per fold count.
    """
    r_values = []
    rmse_values = []
    fold_ids = list(range(N_BAGGING_FOLDS))
    for i in fold_ids:
        ensemble = (df[df['iter'] <= i]
                    .groupby(['fips', 'year'])[['log_yield', 'pred']]
                    .mean()
                    .reset_index())
        r_values.append(get_corr(ensemble['log_yield'], ensemble['pred']))
        rmse_values.append(get_rmse(ensemble['log_yield'], ensemble['pred']))

    return pd.DataFrame(dict(r=r_values, rmse=rmse_values, iter=fold_ids))

def process_bagging_results(model_str, model_dir, out_str=None):
    """Aggregate one model's bagging folds and write the results to disk.

    Reads the folds from ``{model_dir}/{model_str}`` and writes two CSVs to
    ``{DL_DATA_PATH}/bagging/``: ``{out_str}.csv`` with the averaged
    predictions and ``{out_str}_iter_performance.csv`` with the per-fold-count
    metrics. ``out_str`` defaults to ``model_str``.
    """
    out_name = model_str if out_str is None else out_str

    fold_dir = f'{model_dir}/{model_str}'
    preds, preds_full = read_bagging_preds(fold_dir)

    # Metrics are computed before anything is written, as in the original order.
    iter_perf = get_iter_performance(preds_full)

    preds.to_csv(f'{DL_DATA_PATH}/bagging/{out_name}.csv', index=False)
    iter_perf.to_csv(f'{DL_DATA_PATH}/bagging/{out_name}_iter_performance.csv', index=False)

# Folders that hold the per-model bagging fold directories.
dl_dir = f'{DL_DATA_PATH}/bagging/'
# NOTE(review): lm_dir is not used anywhere in the visible part of the notebook -- confirm it is still needed.
lm_dir = f'{HEAT_DATA_PATH}/yield_predictions/'

# Aggregate the folds in {dl_dir}/basic_cnn_temperature2 and write
# basic_cnn_temperature2.csv plus its *_iter_performance.csv to the bagging folder.
process_bagging_results('basic_cnn_temperature2', dl_dir)
