<a href="https://colab.research.google.com/github/issmythe/intepretable-ai-crops/blob/main/linear_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
#@title Mount drive
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): the relative 'drive/MyDrive/...' paths used later presumably
# rely on this mount with /content as the working directory - confirm.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#@title Imports
! pip install kaleido &> /dev/null

import sys
sys.path.append('drive/MyDrive/current_research_projects/utils/')

import importlib
import math
import os
import pickle
import random
import time

import numpy as np
import pandas as pd

# Analysis
from datetime import datetime, timedelta
from scipy.stats import pearsonr

from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from statsmodels.regression.linear_model import OLSResults

# Plotting
import matplotlib
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly import subplots

# Utils
from read_data import get_max_vi, get_modis_vi, get_sif, get_weather, get_yields


In [None]:
#@title Constants
# Last year (inclusive) of the study period; used for the yield query and the CV year list.
END_YEAR_INC = 2021

# Data roots, relative to the working directory.
DATA_PATH = 'drive/MyDrive/current_research_projects/us_data/'
DL_DATA_PATH = 'drive/MyDrive/current_research_projects/dl_yield_forecasts/data/'
HEAT_DATA_PATH = 'drive/MyDrive/current_research_projects/heat_separability/data/'

# Figures are written to a per-month folder named YYYYMM (from today's date).
FIG_PATH = 'drive/MyDrive/current_research_projects/heat_separability/%s/' % datetime.today().strftime('%Y%m')
# Create the folder idempotently: unlike `! mkdir`, this neither errors when the
# folder already exists (every re-run within a month) nor fails on missing parents.
os.makedirs(FIG_PATH, exist_ok=True)

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None


mkdir: cannot create directory ‘drive/MyDrive/current_research_projects/heat_separability/202407/’: File exists


# Data

In [None]:
#@title Get yields
# Load yields for 2000..END_YEAR_INC and drop the 'Unnamed: 0' column
# (presumably a leftover CSV index - confirm against read_data.get_yields).
yields = get_yields(2000, END_YEAR_INC, True)
yields = yields.drop(columns='Unnamed: 0')
# Store FIPS codes as plain ints.
yields['fips'] = yields['fips'].apply(int)


In [None]:
#@title Get folds
# Shuffle the study years with a fixed seed, then cut the shuffled list into
# 10 groups; each group is the set of held-out years for one CV fold.
random.seed(123)
years = list(range(2000, END_YEAR_INC + 1))
random.shuffle(years)
FOLDS = np.array_split(years, 10)

In [None]:
#@title Degree days and dummy cols
# Both fixed-window weather tables use the same raw column names; map them to
# the names the regression code expects.
rename_map = {'cum10C': '10C_total', 'cum29C': '29C_total', 'cum_prec': 'prec_total'}

# Fixed calendar windows (file names: March through August, April through September).
fixed_ddays_m = pd.read_csv(
    f'{DATA_PATH}/weather/processed/march_through_aug_31_states.csv'
).rename(columns=rename_map)
fixed_ddays_a = pd.read_csv(
    f'{DATA_PATH}/weather/processed/april_through_sept_31_states.csv'
).rename(columns=rename_map)

# Day-of-year based table. Rows for state codes 12, 13 and 45 are taken from the
# March-August table instead.
# NOTE(review): presumably those states are missing from the USDA day-of-year
# file (per the name `no_usda`) - confirm.
keep_cols = ['fips', 'year', 'state', '10C_total', '29C_total', 'prec_total']
date_ddays = pd.read_csv(f'{DATA_PATH}/weather/processed/usda_day_of_year_full.csv')
no_usda = fixed_ddays_m[fixed_ddays_m['state'].isin([12, 13, 45])]
date_ddays = pd.concat([date_ddays[keep_cols], no_usda[keep_cols]])

# One-hot encode fips and state, then put the original id columns back so the
# frame can still be merged and filtered on them.
aug_data = pd.get_dummies(yields, columns=['fips', 'state'])
aug_data = aug_data.assign(fips=yields['fips'], state=yields['state'])

# Per-state linear and quadratic time trends: dummy * t and (dummy * t) ** 2,
# where t is the number of years since the first year in the data.
state_cols = [col for col in aug_data.columns if col.startswith('state_')]
years_elapsed = aug_data['year'] - aug_data['year'].min()
for c in state_cols:
    linear_trend = aug_data[c] * years_elapsed
    aug_data[f'{c}_tt'] = linear_trend
    aug_data[f'{c}_2tt'] = linear_trend ** 2



In [None]:
#@title Shared helpers
def get_rmse(x, y):
    """Return the root-mean-square of the element-wise difference `x - y`.

    `x` and `y` must support subtraction and the result must expose `.mean()`
    (e.g. numpy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    return squared_error.mean() ** 0.5

def get_corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Only the coefficient from `scipy.stats.pearsonr` is returned; the p-value
    is discarded.
    """
    coefficient, _p_value = pearsonr(x, y)
    return coefficient

def get_r2(x, y):
    """Return the squared Pearson correlation between `x` and `y`."""
    corr = get_corr(x, y)
    return corr ** 2

def quick_summarize(df, name=''):
    """Print a one-line fit summary for a prediction frame.

    Prints `name`, the number of rows, and the RMSE and Pearson correlation
    between the `log_yield` and `pred` columns, both rounded to 3 decimals.
    """
    actual, predicted = df['log_yield'], df['pred']
    rmse = round(get_rmse(actual, predicted), 3)
    corr = round(get_corr(actual, predicted), 3)
    print(name, len(df), rmse, corr)


# Lookup table pairing each month label with the day of year of its first day
# (computed for 2023).
month_names = ['Jan', 'Feb', 'March', 'April', 'May', 'June',
               'July', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
month_start_doys = [
    pd.to_datetime(f'2023-{month:02d}-01').dayofyear for month in range(1, 13)
]
labels = pd.DataFrame({'doy': month_start_doys, 'label': month_names})

def fips_to_state(x):
    """Return the leading two digits of `x`, zero-padded to five characters, as an int."""
    padded = str(x).zfill(5)
    return int(padded[:2])


# Helpers

In [None]:
#@title Prediction helpers
def make_reg_df(dday_df):
    """Build the regression frame for one weather table.

    Adds `prec2_total` (the square of `prec_total`) to `dday_df` and merges the
    result with the module-level `aug_data`. No merge keys are given, so pandas
    uses its defaults (inner join on the columns the two frames share).
    """
    with_prec_squared = dday_df.assign(prec2_total=dday_df['prec_total'] ** 2)
    return with_prec_squared.merge(aug_data)


def split_ex_years(df, ex_years):
    """Return the rows of `df` whose `year` is NOT in `ex_years`."""
    is_excluded = df['year'].isin(ex_years)
    return df[~is_excluded]


def split_inc_years(df, inc_years):
    """Return the rows of `df` whose `year` is in `inc_years`."""
    is_included = df['year'].isin(inc_years)
    return df[is_included]


def get_weather_cols():  # TODO - constant?
    """Return a fresh list of the weather regressor column names."""
    weather_cols = ['10C_total', '29C_total', 'prec_total', 'prec2_total']
    return weather_cols


def get_tt_cols(train):
    """Return the column names of `train` that contain the substring 'tt'.

    NOTE(review): this is a plain substring match, so any column whose name
    happens to contain 'tt' is selected, not only the `*_tt` / `*_2tt` trend
    columns - confirm no other such column exists in the regression frame.
    """
    return list(filter(lambda col: 'tt' in col, train.columns))


def get_fips_cols(train):
    """Return the `fips_*` dummy columns to use as regressors.

    Keeps the dummies whose column sum in `train` is greater than 1, in column
    order, and then drops the last one of those.
    NOTE(review): the dropped column is presumably the omitted reference
    category - confirm.
    """
    dummy_cols = [col for col in train.columns if col.startswith('fips_')]
    counts = train[dummy_cols].sum()
    populated = counts[counts > 1].index.to_list()
    return populated[:-1]


def get_reg_cols(train, inc_weather):
    """Return the full regressor list for `train`.

    The order is: weather columns (only when `inc_weather` is truthy), then the
    time-trend columns, then the fips dummy columns.
    """
    reg_cols = []
    if inc_weather:
        reg_cols.extend(get_weather_cols())
    reg_cols.extend(get_tt_cols(train))
    reg_cols.extend(get_fips_cols(train))
    return reg_cols


def check_for_outf(outf, overwrite):
    """Load previously saved predictions for `outf`, if they may be reused.

    Returns None when `outf` is falsy, when `overwrite` is truthy, or when the
    CSV under `{HEAT_DATA_PATH}/yield_predictions/` does not exist. Otherwise
    returns the saved rows averaged per (fips, year).
    """
    if not outf or overwrite:
        return None

    try:
        cached = pd.read_csv(f'{HEAT_DATA_PATH}/yield_predictions/{outf}.csv')
    except FileNotFoundError:
        return None
    return cached.groupby(['fips', 'year']).mean().reset_index()


In [None]:
#@title Prediction main functions
def predict_one_fold_helper(dday_df, test_years, col_fn):
    """Fit a linear regression on all years except `test_years` and predict them.

    Args:
        dday_df: weather table passed to `make_reg_df` to build the regression frame.
        test_years: years held out of fitting and used for prediction.
        col_fn: callable taking the training frame and returning the list of
            regressor column names.

    Returns:
        The held-out rows' `year`, `fips` and `log_yield` columns plus a `pred`
        column with the model's predictions.
    """
    df = make_reg_df(dday_df)
    test = split_inc_years(df, test_years)
    # NOTE(review): the training rows are shuffled without a seed, so row order
    # differs between runs.
    train = split_ex_years(df, test_years).sample(frac=1)

    # Resolve the regressor list once and reuse it for fit and predict; the
    # column selection scans every fips dummy column, so calling it twice
    # doubled that work for no benefit.
    reg_cols = col_fn(train)
    reg = linear_model.LinearRegression().fit(train[reg_cols], train['log_yield'])
    test_preds = reg.predict(test[reg_cols])
    return test[['year', 'fips', 'log_yield']].assign(pred=test_preds)


def predict_one_fold_weather(dday_df, test_years):
    """Predict one fold using weather, time-trend and fips regressors."""
    def weather_reg_cols(train):
        return get_reg_cols(train, True)

    return predict_one_fold_helper(dday_df, test_years, weather_reg_cols)


def predict_one_fold_baseline(dday_df, test_years):
    """Predict one fold using only time-trend and fips regressors (no weather)."""
    def baseline_reg_cols(train):
        return get_reg_cols(train, False)

    return predict_one_fold_helper(dday_df, test_years, baseline_reg_cols)


def predict_folds_helper(dday_df, folds, pred_fn, outf, overwrite, verbose):
    """Run `pred_fn` on every fold and stack the predictions, with CSV caching.

    If `check_for_outf(outf, overwrite)` finds reusable saved predictions they
    are returned directly. Otherwise each entry of `folds` is passed to
    `pred_fn(dday_df, fold)`, the per-fold frames are concatenated, and (when
    `outf` is truthy) the result is written to
    `{HEAT_DATA_PATH}/yield_predictions/{outf}.csv`. With `verbose` set, the
    fold index is printed after each fold.
    """
    cached = check_for_outf(outf, overwrite)
    if cached is not None:
        return cached

    fold_preds = []
    for fold_idx, fold_years in enumerate(folds):
        fold_preds.append(pred_fn(dday_df, fold_years))
        if verbose:
            print(fold_idx)
    combined = pd.concat(fold_preds)

    if outf:
        combined.to_csv(f'{HEAT_DATA_PATH}/yield_predictions/{outf}.csv', index=False)

    return combined


def predict_folds_weather(dday_df, folds, outf=None, overwrite=False, verbose=True):
    """Cross-validated predictions from the weather model for weather table `dday_df`."""
    return predict_folds_helper(
        dday_df=dday_df,
        folds=folds,
        pred_fn=predict_one_fold_weather,
        outf=outf,
        overwrite=overwrite,
        verbose=verbose,
    )


def predict_folds_baseline(folds, outf=None, overwrite=False, verbose=True):
    """Cross-validated predictions from the no-weather baseline model.

    The regression frame is always built from the module-level `fixed_ddays_m`
    table; the baseline fold predictor leaves the weather columns out of the
    regressors.
    """
    return predict_folds_helper(
        dday_df=fixed_ddays_m,
        folds=folds,
        pred_fn=predict_one_fold_baseline,
        outf=outf,
        overwrite=overwrite,
        verbose=verbose,
    )


'drive/MyDrive/current_research_projects/heat_separability/data/'

# Predictions

In [None]:
#@title 10-fold CV preds
# 10-fold (by year) cross-validated predictions for each specification.
# Saved CSVs are reused when present because `overwrite` is left at its default.
baseline_cv = predict_folds_baseline(FOLDS, outf='baseline_preds_31_states')
march_cv = predict_folds_weather(fixed_ddays_m, FOLDS, outf='march_through_aug_31_states')
april_cv = predict_folds_weather(fixed_ddays_a, FOLDS, outf='april_through_sept_31_states')
doy_season_cv = predict_folds_weather(date_ddays, FOLDS, outf='doy_full_season_31_states')

# Order of the printed rows: baseline, March window, April window, day-of-year season.
cv_frames = [baseline_cv, march_cv, april_cv, doy_season_cv]

print('All years:')
for df in cv_frames:
    quick_summarize(df)

# Same summary restricted to the 2017+ rows.
print('\n2017-2021:')
for df in cv_frames:
    quick_summarize(df[df['year'] >= 2017])


All years:
 33064 0.24 0.709
 33064 0.203 0.803
 33064 0.208 0.791
 33064 0.201 0.806

2017-2021:
 6648 0.153 0.791
 6648 0.153 0.795
 6648 0.156 0.787
 6648 0.149 0.802


In [None]:
#@title Test periods preds (2017-2021)
# Single hold-out split: fit on every year outside 2017-2021, predict 2017-2021.
test_years = list(range(2017, 2022))
holdout_folds = [test_years]

# The baseline and day-of-year runs pass overwrite=True, so they are always
# recomputed and re-saved; the March/April runs reuse a saved CSV when one exists.
baseline_test = predict_folds_baseline(
    holdout_folds, outf='baseline_test_period', verbose=False, overwrite=True)
march_test = predict_folds_weather(
    fixed_ddays_m, holdout_folds, outf='march_test_period', verbose=False)
april_test = predict_folds_weather(
    fixed_ddays_a, holdout_folds, outf='april_test_period', verbose=False)
doy_season_test = predict_folds_weather(
    date_ddays, holdout_folds, outf='doy_season_test_period', verbose=False, overwrite=True)

test_frames = [baseline_test, march_test, april_test, doy_season_test]
for df in test_frames:
    quick_summarize(df)
# gd

len fips: 1922
len fips: 1922
len fips: 1922
len fips: 1922
 6648 0.211 0.629
 6648 0.209 0.606
 6648 0.221 0.567
 6648 0.215 0.587
