# Running models

I use a Deep Neural Network to estimate the temperature at 2m given several atmospheric parameters. 
I also run a gridserchCV with a random forest regressor using a smaller day average sample rather than the hourly sample.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense
#from keras.wrappers.scikit_learn import KerasRegressor
from scikeras.wrappers import KerasRegressor

2022-04-12 15:15:50.588473: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-12 15:15:50.588495: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
filepath = "/home/veroastro/Documents/NASA_POWER_ds/data/"
df = pd.read_csv(filepath + "POWER_Point_Hourly_20010101_20211231_041d39N_002d17E_UTC.csv")
df.head()

Unnamed: 0,YEAR,MO,DY,HR,WD10M,T2MWET,T2MDEW,PS,WS10M,V10M,U10M,QV2M,T2M,RH2M,PRECTOTCORR
0,2001,1,1,0,286.64,1.42,-0.66,97.95,3.0,-0.86,2.88,3.85,3.49,76.5,0.0
1,2001,1,1,1,295.52,1.7,-0.24,97.89,2.87,-1.23,2.59,3.91,3.65,77.62,0.0
2,2001,1,1,2,296.91,1.7,-0.01,97.82,2.9,-1.31,2.59,3.97,3.41,80.12,0.0
3,2001,1,1,3,293.3,2.04,0.51,97.76,2.88,-1.14,2.65,4.15,3.55,82.19,0.0
4,2001,1,1,4,295.29,2.64,1.36,97.71,2.82,-1.2,2.55,4.39,3.92,84.69,0.0


In [3]:
df_new = df.drop(['T2MWET','T2MDEW', 'PS'], axis = 1) # dropped to deal with multicollinearity

In [4]:
# defining Xand y and scalling X
from sklearn.preprocessing import MinMaxScaler
X = df_new.drop('T2M', axis= 1)
y = df_new['T2M']
scaler = MinMaxScaler()
X_t = scaler.fit_transform(X)
X.columns

Index(['YEAR', 'MO', 'DY', 'HR', 'WD10M', 'WS10M', 'V10M', 'U10M', 'QV2M',
       'RH2M', 'PRECTOTCORR'],
      dtype='object')

In [5]:
# Splitting the Xand y datasets into training and test set and normalising the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_t, y, 
                                                    test_size=0.3, random_state=42)

In [6]:
# define model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(250, input_dim=11, kernel_initializer='normal', activation='relu'))
    model.add(Dense(50, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [7]:
# run the model, fit the model and make predictions
estimator = KerasRegressor(model=baseline_model, epochs=100, batch_size=5, verbose=0, validation_split=0.3)

estimator.fit(X_train, y_train)
predictions = estimator.predict(X_test)
pred_train = estimator.predict(X_train)

2022-04-12 15:15:56.863107: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-04-12 15:15:56.863138: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (veroastro): /proc/driver/nvidia/version does not exist
2022-04-12 15:15:56.864244: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
estimator.model_.save('saved_model.h5')

In [10]:
# print mean square errors between predicted and spectroscopic z
from sklearn.metrics import mean_squared_error
error = mean_squared_error(y_test,predictions)
print('Mean Square Error:', round(error, 3))
print('Error Percentage:', round(error*100/y_test.mean(), 3), '%')

Mean Square Error: 0.018
Error Percentage: 0.115 %


# Model Random Forest Regressor

I wanted to try a Random forest regressor initially but it was computationally impossible with my computer, so I averaged the hours of the day to daily values.

In [11]:
# Gouping the dataset by day of the month and year.
df_daily = df_new.groupby(['YEAR', 'MO','DY']).mean().reset_index()
df_daily.describe()

Unnamed: 0,YEAR,MO,DY,HR,WD10M,WS10M,V10M,U10M,QV2M,T2M,RH2M,PRECTOTCORR
count,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0,7670.0
mean,2011.000652,6.523077,15.729205,11.5,197.665083,3.258208,0.423541,0.555808,8.045337,15.562784,69.783476,0.058508
std,6.055498,3.448888,8.800483,0.0,63.903244,1.296116,1.747359,2.316965,3.018352,6.688787,10.87685,0.178946
min,2001.0,1.0,1.0,11.5,25.4525,0.830417,-9.44,-13.43875,1.458333,-1.981667,33.32625,0.0
25%,2006.0,4.0,8.0,11.5,152.568333,2.464167,-0.615521,-0.608021,5.59625,9.965417,62.071146,0.0
50%,2011.0,7.0,16.0,11.5,200.631458,2.934583,0.674792,0.583125,7.566458,15.163958,69.497083,0.002083
75%,2016.0,10.0,23.0,11.5,246.275104,3.647083,1.674479,1.750208,10.486562,21.423333,77.488646,0.029583
max,2021.0,12.0,31.0,11.5,345.98,14.47875,6.56,11.762917,16.32125,30.3925,98.575,4.11125


In [12]:
# defining new X and y, scaling X and separating the data
X_d = df_daily.drop(['T2M','HR'], axis= 1)
y_d = df_daily['T2M']

X_dt = scaler.fit_transform(X_d)
X_dtrain, X_dtest, y_dtrain, y_dtest = train_test_split(X_dt, y_d, 
                                                    test_size=0.3, random_state=42)

In [13]:
# Random Forest with GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
RF = RandomForestRegressor(random_state = 42)
param_grid = {  'bootstrap': [True],
                'max_features': ['auto', 'sqrt','log2'], 
                'n_estimators': range(100,400, 20)}

rf= GridSearchCV(RF,
                  param_grid=param_grid, cv = 5)

# Train the model on training data
rf.fit(X_dtrain, y_dtrain);



In [14]:
rf.best_params_ ,rf.best_score_

({'bootstrap': True, 'max_features': 'auto', 'n_estimators': 380},
 0.9974387808933456)

In [16]:
predictions_rf = rf.predict(X_dtest)

In [17]:
error_rf = mean_squared_error(y_dtest,predictions_rf)
print('Mean Square Error:', round(error_rf, 3))
print('Error Percentage:', round(error_rf*100/y_dtest.mean(), 3), '%')

Mean Square Error: 0.089
Error Percentage: 0.58 %
