In [1]:
import pandas as pd 
import geopandas as gpd
import gpflow
import numpy as np
import tensorflow as tf
from glob import glob

In [2]:
states = ["ILLINOIS", "IOWA", "MICHIGAN", "MINNESOTA", "OHIO", "INDIANA", "WISCONSON"]

for i, (X_all, df_features) in enumerate(zip(states, states)):
    print(i, X_all, df_features)

0 ILLINOIS ILLINOIS
1 IOWA IOWA
2 MICHIGAN MICHIGAN
3 MINNESOTA MINNESOTA
4 OHIO OHIO
5 INDIANA INDIANA
6 WISCONSON WISCONSON


In [3]:
df_response = pd.read_csv("../../data/synthetic/synthetic_yields.csv")
counties_states = pd.read_csv("../../data/crops/counties-states.csv")
df_response = df_response[df_response["key"].isin(counties_states["key"])]
# drop duplicated yields
df_modis = pd.read_csv("../../data/crops/processed_covariates/MOD13Q1-1000-all_data.csv")
df_modis = df_modis[df_modis["year"]==2015]
df_modis["key"] = df_modis["County"] + "-" + df_modis["State"]
df_gridmet = pd.read_csv("../../data/crops/processed_covariates/GRIDMET-all_data.csv")
df_gridmet = df_gridmet[df_gridmet["year"]==2015]
df_gridmet["key"] = df_gridmet["County"] + "-" + df_gridmet["State"]



In [4]:
dates = ['04-07']

In [5]:
modis_features = ["key", "longitude", "latitude"] + [f"EVI_{date}" for date in dates]
df_modis = df_modis[modis_features]

In [6]:
df_modis = df_modis.groupby(["key"]).mean().reset_index()

In [7]:
col_latlon = ["longitude", "latitude"]
col_modis = [f"EVI_{date}" for date in dates]
col_gridmet = [f"pr_{date}" for date in dates] + [f"tmmx_{date}" for date in dates]

In [8]:
gridmet_features = ["key"] + col_gridmet
df_gridmet = df_gridmet[gridmet_features]
df_gridmet = df_gridmet.groupby(["key"]).mean().reset_index()

In [9]:
df_features = df_modis.merge(df_gridmet, on=["key"])

In [10]:
df_all = df_response.merge(df_features, on=["key"])

In [11]:
# df_all.to_csv("../../data/crops/centroid_yield_features_data.csv")
df_all.to_csv("../../data/synthetic/centroid_synthetic_yield_features_data.csv")

In [12]:
keys = list(set(df_response.key))
keys.sort()

In [13]:
df_all = df_all[df_all["key"].isin(keys)]

In [14]:
col_index_space = [0, 1]
col_index_modis = [2]
col_index_pr = [3]
col_index_tmmx = [4]

latlon_cols = ["longitude", "latitude"]
modis_cols = [f"EVI_{date}" for date in dates]
pr_cols = [f"pr_{date}" for date in dates] 
tmmx_cols = [f"tmmx_{date}" for date in dates] 
all_features = latlon_cols + modis_cols + pr_cols + tmmx_cols

## Experimental Setup

In [15]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import gpflow 
from scipy.stats import sem
import json
import random 
import time 
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

seed = 0

X = df_all.loc[:,all_features].values
# X = df_all.iloc[:,6:34].values
y = df_all.y.values[:, None]
kf = KFold(n_splits=5, random_state=seed, shuffle=True)

## Centroid GP

In [16]:
RMSE = []
MAPE  = []
LL  = []
training_time = []

for fold, (train_index, test_index) in tqdm(enumerate(kf.split(keys))):
    # randomly pick counties
    train_keys = [keys[key] for key in train_index]
    test_keys = [keys[key] for key in test_index]
    train_index = df_all["key"].isin(train_keys)
    test_index = df_all["key"].isin(test_keys)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    scaler_x = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(y_train)
    X_train, y_train = scaler_x.transform(X_train), scaler_y.transform(y_train)
    X_test = scaler_x.transform(X_test)

    # fit and train GP regression model
    k_space = gpflow.kernels.Matern32(active_dims=col_index_space)
    k_modis = gpflow.kernels.RBF(active_dims=col_index_modis)
    k_pr = gpflow.kernels.RBF(active_dims=col_index_pr)
    k_tmmx = gpflow.kernels.RBF(active_dims=col_index_tmmx)
    k = k_space + k_modis + k_pr + k_tmmx
    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k, mean_function=None)
    t0 = time.time()
    opt = gpflow.optimizers.Scipy()
    opt_logs = opt.minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=500))
    t1 = time.time()

    pred, var = m.predict_y(X_test)
    pred = scaler_y.inverse_transform(pred)
    # pred already inversely transformed, therefore only need to multiple the correction by scale_
    lower = np.reshape((1.96 * np.sqrt(var[:,0]))*scaler_y.scale_, y_test.shape)
    upper = np.reshape((1.96 * np.sqrt(var[:,0]))*scaler_y.scale_, y_test.shape)
    errors = np.concatenate((lower, upper), axis=1)
    errors = errors.T
    
    # compute metrics 
    loglik = np.mean(m.predict_log_density((X_test, scaler_y.transform(y_test))))
    RMSE.append(np.sqrt(np.mean((pred - y_test)**2)))
    MAPE.append(np.mean(np.abs(( pred - y_test) / y_test)))
    LL.append(loglik)

    # plot predictions
    plt.figure(figsize=(8,8))
    plt.scatter(pred, y_test, color="red")
    plt.plot(np.linspace(-100,100,201), np.linspace(-100,100,201), color="black")

    plt.errorbar(#
        pred[:,0],
        y_test[:,0],
        xerr=errors,
        fmt="o",
        ls="none",
        capsize=5,
        markersize=4,
        color="blue",
        alpha=0.2
        )
    plt.xlim((y_test.min()-0.5, y_test.max()+0.5))
    plt.ylim((y_test.min()-0.5, y_test.max()+0.5))
    plt.xlabel("Predictions")#
    plt.ylabel("Ground Truth")
    plt.savefig(f"../../results/synthetic/centroidGP_fold{fold}.png")
    plt.savefig(f"../../results/synthetic/centroidGP_fold{fold}.pdf")
    plt.savefig(f"../../results/synthetic/centroidGP_fold{fold}.svg")
    plt.close()

print(f"MAPE", MAPE)
print(f"RMSE", RMSE)
print(f"LL", LL)
print(f"Training Time", training_time)

0it [00:00, ?it/s]2022-01-12 12:42:24.101867: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-01-12 12:42:24.101905: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: nvidia4
2022-01-12 12:42:24.101927: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: nvidia4
2022-01-12 12:42:24.102020: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.27.4
2022-01-12 12:42:24.102039: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 460.27.4
2022-01-12 12:42:24.102043: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 460.27.4
2022-01-12 12:42:24.102355: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU 

MAPE [0.2294375558558635, 0.3014968315502764, 0.2998569175156853, 0.20309749697917764, 0.310900231788104]
RMSE [0.7289501465827158, 0.9270620074110206, 1.094964059114099, 1.086240665381462, 0.8917501630672453]
LL [0.8635893470586241, 0.643365766644254, 0.4183723078453546, 0.31524158955644926, 0.6488686199334244]
Training Time []





In [17]:
json_file = json.dumps({"CV-RMSE": sum(RMSE) / 5, "CV-MAPE": sum(MAPE) / 5, 
                       "CV-sd-RMSE": sem(RMSE), "CV-sd-MAPE": sem(MAPE), "CV-LL": sum(LL) / 5, "CV-sd-LL": sem(LL),
                        "Training Time": sum(training_time)/5, "Training Time se": sem(training_time)}
                       )
f = open(f"../../results/synthetic/centroidGP.json", "w")
f.write(json_file)
f.close()

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [18]:
m

name,class,transform,prior,trainable,shape,dtype,value
GPR.kernel.kernels[0].variance,Parameter,Softplus,,True,(),float64,0.144458
GPR.kernel.kernels[0].lengthscales,Parameter,Softplus,,True,(),float64,2.95196
GPR.kernel.kernels[1].variance,Parameter,Softplus,,True,(),float64,5.89228
GPR.kernel.kernels[1].lengthscales,Parameter,Softplus,,True,(),float64,3.97158
GPR.kernel.kernels[2].variance,Parameter,Softplus,,True,(),float64,4.68101
GPR.kernel.kernels[2].lengthscales,Parameter,Softplus,,True,(),float64,1.54055
GPR.kernel.kernels[3].variance,Parameter,Softplus,,True,(),float64,46.4975
GPR.kernel.kernels[3].lengthscales,Parameter,Softplus,,True,(),float64,5.16387
GPR.likelihood.variance,Parameter,Softplus + Shift,,True,(),float64,0.0123125


In [19]:
json_file

'{"CV-RMSE": 0.9457934083113086, "CV-MAPE": 0.2689578067378214, "CV-sd-RMSE": 0.06792269612314798, "CV-sd-MAPE": 0.021991067314917023, "CV-LL": 0.5778875262076213, "CV-sd-LL": 0.0962754483249235, "Training Time": 0.0, "Training Time se": NaN}'

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

RMSE = []
MAPE  = []
LL  = []
training_time = []

for fold, (train_index, test_index) in tqdm(enumerate(kf.split(keys))):
    # randomly pick counties
    train_keys = [keys[key] for key in train_index]
    test_keys = [keys[key] for key in test_index]
    train_index = df_all["key"].isin(train_keys)
    test_index = df_all["key"].isin(test_keys)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    scaler_x = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(y_train)
    X_train, y_train = scaler_x.transform(X_train), scaler_y.transform(y_train)
    X_test = scaler_x.transform(X_test)

    # fit and train GP regression model
    m = RandomForestRegressor(max_depth=2, random_state=0)   

    t0 = time.time()
    m.fit(X_train, y_train.ravel())    
    t1 = time.time()

    pred = m.predict(X_test)
    pred = scaler_y.inverse_transform(pred[:, None])
    
    # compute metrics 
    RMSE.append(np.sqrt(np.mean((pred - y_test)**2)))
    MAPE.append(np.mean(np.abs(( pred - y_test) / y_test)))
    training_time.append(t1-t0)

    # plot predictions
    plt.figure(figsize=(8,8))
    plt.scatter(pred, y_test, color="red")
    plt.plot(np.linspace(-100,100,201), np.linspace(-100,100,201), color="black")
    plt.xlim((y_test.min()-0.5, y_test.max()+0.5))
    plt.ylim((y_test.min()-0.5, y_test.max()+0.5))
    plt.xlabel("Predictions")
    plt.ylabel("Ground Truth")
    plt.savefig(f"../../results/synthetic/centroidRF_fold{fold}.png")
    plt.savefig(f"../../results/synthetic/centroidRF_fold{fold}.pdf")
    plt.close()

print(f"MAPE", MAPE)
print(f"RMSE", RMSE)
print(f"Training Time", training_time)

json_file = json.dumps({"CV-RMSE": sum(RMSE) / 5, "CV-MAPE": sum(MAPE) / 5, 
                       "CV-sd-RMSE": sem(RMSE), "CV-sd-MAPE": sem(MAPE), 
                        "Training Time": sum(training_time)/5, "Training Time se": sem(training_time)}

                       )
f = open(f"../../results/synthetic/centroidRF.json", "w")
f.write(json_file)
f.close()

5it [00:00,  5.30it/s]

MAPE [1.134756454359541, 1.1280068043450413, 1.3186267795295163, 0.9141397598217534, 1.2957029216412572]
RMSE [2.3332936594822917, 2.997133030651329, 5.587044419422687, 2.5717109806993723, 2.6883855465343194]
Training Time [0.08103799819946289, 0.07909297943115234, 0.07673358917236328, 0.07631373405456543, 0.07688188552856445]





In [21]:
json_file

'{"CV-RMSE": 3.2355135273579996, "CV-MAPE": 1.1582465439394218, "CV-sd-RMSE": 0.5975159552633302, "CV-sd-MAPE": 0.07268783390304066, "Training Time": 0.07801203727722168, "Training Time se": 0.0008977967980275035}'