In [1]:
import xarray as xr
import numpy  as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [2]:
## prepare data for XGB/NN model
# Training frame: load, discard rows containing NaN, remove the columns that are
# not used as model inputs, then keep only rows whose index year is 2003..2014
# (np.arange's upper bound is exclusive).
df1 = (
    pd.read_pickle("/glade/work/hpzhang/LCC_pred/data/data_monthly.pkl")
    .dropna()
    .drop(columns=['lat', 'lon', 'lsm', 'T1000', 'T700', 'EIS', 'ECTEI','ELF','SST','AOD'])
)
train_year_mask = df1.index.year.isin(np.arange(2003,2015))
df1 = df1[train_year_mask]


# MERRA2 frame: same NaN handling; no year filter is applied here.
# NOTE(review): 'AOD' is not dropped from this frame, unlike the training frame —
# presumably the MERRA2 pickle has no such column; confirm.
df2 = (
    pd.read_pickle("/glade/work/hpzhang/LCC_pred/data/data_monthly_merra2.pkl")
    .dropna()
    .drop(columns=['lat', 'lon', 'lsm', 'T1000', 'T700', 'EIS', 'ECTEI','ELF','SST'])
)

In [3]:
from sklearn.preprocessing import StandardScaler

data_train, data_test = df1, df2

# Column 0 of each frame is used as the target; every remaining column is a feature.
y_train = data_train.iloc[:, 0]
X_train = data_train.iloc[:, 1:]
y_test = data_test.iloc[:, 0]
X_test = data_test.iloc[:, 1:]

# The scaler is fitted on the training features only and then applied to both
# sets, so the test features are standardised with the training statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [4]:
import xgboost as xgb
import pickle

### load XGB model ###
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load model files that come from a trusted location.
file_name = "/glade/work/hpzhang/LCC_pred/data/xgb_reg.pkl"
# Open through a context manager so the file handle is closed as soon as the
# model has been deserialised (the bare open() call left it to the garbage collector).
with open(file_name, "rb") as model_file:
    model = pickle.load(model_file)

# Predicting the Test set results
y_pred_XGB = model.predict(X_test_scaled)
mse_XGB = mean_squared_error(y_test, y_pred_XGB)
mae_XGB = mean_absolute_error(y_test, y_pred_XGB)
print('Test')
print('Mean squared error using XGBoost: ', mse_XGB)
print('Mean absolute error Using XGBoost: ', mae_XGB)

# Predicting the Training set results
# NOTE: mse_XGB / mae_XGB are reassigned below, so once this cell has finished
# they hold the training-set scores, not the test-set scores printed above.
y_train_XGB = model.predict(X_train_scaled)
mse_XGB = mean_squared_error(y_train, y_train_XGB)
mae_XGB = mean_absolute_error(y_train, y_train_XGB)
print('Training')
print('Mean squared error using XGBoost: ', mse_XGB)
print('Mean absolute error Using XGBoost: ', mae_XGB)

Test
Mean squared error using XGBoost:  0.040152006
Mean absolute error Using XGBoost:  0.17256394
Training
Mean squared error using XGBoost:  0.002556344
Mean absolute error Using XGBoost:  0.038721927


In [5]:
### XGB output ###
var = 'MERRA2_output'
# Create the output directory with os.makedirs instead of shelling out to
# `mkdir -p`: no subprocess or shell is involved, and a failure raises an
# exception here rather than being silently ignored by os.system.
xgb_output_dir = "/glade/work/hpzhang/LCC_pred/data/{}".format(var)
os.makedirs(xgb_output_dir, exist_ok=True)
# Persist the XGBoost test-set predictions as a .npy array.
np.save("{}/y_pred_XGB.npy".format(xgb_output_dir), np.array(y_pred_XGB))


In [6]:
import tensorflow as tf

### load NN model ###
# Note: this rebinds `model`, replacing whatever object that name referred to before.
nn_model_path = '/glade/work/hpzhang/LCC_pred/data/NN_model'
model = tf.keras.models.load_model(nn_model_path)

# Predicting the Test set results
y_pred_neural = model.predict(X_test_scaled)

# evaluate() is unpacked into exactly two values, labelled MSE and MAE below.
# NOTE(review): assumes the saved model was compiled with an MSE loss and a single
# MAE metric — confirm against the training notebook.
nn_test_scores = model.evaluate(X_test_scaled, y_test)
mse_neural, mae_neural = nn_test_scores
print('Mean squared error from neural net: ', mse_neural)
print('Mean absolute error from neural net: ', mae_neural)



2022-06-27 22:53:42.833279: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-27 22:53:43.504894: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30559 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:61:00.0, compute capability: 7.0


Mean squared error from neural net:  0.03539206087589264
Mean absolute error from neural net:  0.15912756323814392


In [7]:
### NN output ###

var = 'MERRA2_output'
# Create the output directory with os.makedirs instead of shelling out to
# `mkdir -p`: no subprocess or shell is involved, and a failure raises an
# exception here rather than being silently ignored by os.system.
nn_output_dir = "/glade/work/hpzhang/LCC_pred/data/{}".format(var)
os.makedirs(nn_output_dir, exist_ok=True)
# Persist the neural-network test-set predictions as a .npy array.
np.save("{}/y_pred_neural.npy".format(nn_output_dir), np.array(y_pred_neural))

In [8]:
## prepare data for Qu model
from sklearn.preprocessing import StandardScaler

# Column subset for this model; the first entry ('LCF') ends up as column 0,
# which is the column taken as the target below.
qu_columns = ['LCF','EIS','LH','dQ','RH700','OMEGA700','U1000','Tadv']

# Training frame: NaN rows are dropped on the full frame *before* the column
# subset is taken, then rows are restricted to index years 2003..2014.
df1 = pd.read_pickle("/glade/work/hpzhang/LCC_pred/data/data_monthly.pkl").dropna()
df1 = df1[qu_columns]
qu_year_mask = df1.index.year.isin(np.arange(2003,2015))
df1 = df1[qu_year_mask]

# MERRA2 frame: same NaN handling and column subset, no year filter.
df2 = pd.read_pickle("/glade/work/hpzhang/LCC_pred/data/data_monthly_merra2.pkl").dropna()
df2 = df2[qu_columns]

data_train, data_test = df1, df2

# Column 0 is the target; the remaining columns are the features.
y_train = data_train.iloc[:, 0]
X_train = data_train.iloc[:, 1:]
y_test = data_test.iloc[:, 0]
X_test = data_test.iloc[:, 1:]

# Fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
### load Qu/linear model ###
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load model files that come from a trusted location.
file_name = "/glade/work/hpzhang/LCC_pred/data/lr_reg.pkl"
# Open through a context manager so the file handle is closed as soon as the
# model has been deserialised (the bare open() call left it to the garbage collector).
with open(file_name, "rb") as model_file:
    lr_model = pickle.load(model_file)

# Score the linear model on the scaled test features.
y_pred_lr = lr_model.predict(X_test_scaled)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print('Mean squared error from linear regression: ', mse_lr)
print('Mean absolute error from linear regression: ', mae_lr)

# summarize feature importance
# Importance here is the absolute value of each fitted coefficient, labelled with
# the feature column names (all columns of df1 except the first) and sorted from
# largest to smallest.
feature_list = list(df1.columns[1:])
feature_imp_lr = pd.Series(abs(lr_model.coef_), index=feature_list).sort_values(ascending=False)
print(feature_imp_lr)

Mean squared error from linear regression:  0.048834330317338595
Mean absolute error from linear regression:  0.19064497534590347
EIS         0.126036
LH          0.033632
dQ          0.027660
OMEGA700    0.025818
U1000       0.011550
Tadv        0.007165
RH700       0.003977
dtype: float64


In [10]:
### Qu output ###

var = 'MERRA2_output'
# Create the output directory with os.makedirs instead of shelling out to
# `mkdir -p`: no subprocess or shell is involved, and a failure raises an
# exception here rather than being silently ignored by os.system.
lr_output_dir = "/glade/work/hpzhang/LCC_pred/data/{}".format(var)
os.makedirs(lr_output_dir, exist_ok=True)
# Persist the linear-model test-set predictions as a .npy array.
np.save("{}/y_pred_lr.npy".format(lr_output_dir), np.array(y_pred_lr))