In [1]:
# !pip install tensorflow==1.15.0

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import InputLayer, Input
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.callbacks import TensorBoard
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.colors as colors
import matplotlib as mpl

In [3]:
randomState = 42
np.random.seed(randomState)
tf.set_random_seed(randomState)

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
df5 = pd.read_csv('/content/gdrive/MyDrive/SoilMoisture/Data/c_5_data_soil_5.csv')
df10 = pd.read_csv('/content/gdrive/MyDrive/SoilMoisture/Data/c_5_data_soil_10.csv')
df20 = pd.read_csv('/content/gdrive/MyDrive/SoilMoisture/Data/c_5_data_soil_20.csv')
df50 = pd.read_csv('/content/gdrive/MyDrive/SoilMoisture/Data/c_5_data_soil_50.csv')
df100 = pd.read_csv('/content/gdrive/MyDrive/SoilMoisture/Data/c_5_data_soil_100.csv')

df5 = df5.rename(columns={'In-Situ':'InSitu'})
df10 = df10.rename(columns={'In-Situ':'InSitu'})
df20 = df20.rename(columns={'In-Situ':'InSitu'})
df50 = df50.rename(columns={'In-Situ':'InSitu'})
df100 = df100.rename(columns={'In-Situ':'InSitu'})

def replace_missing(attribute):
    return attribute.interpolate(inplace=True)

replace_missing(df5['NDVI'])
replace_missing(df5['EVI'])
replace_missing(df5['LSTDay'])
replace_missing(df5['LSTNight'])

replace_missing(df10['NDVI'])
replace_missing(df10['EVI'])
replace_missing(df10['LSTDay'])
replace_missing(df10['LSTNight'])

replace_missing(df20['NDVI'])
replace_missing(df20['EVI'])
replace_missing(df20['LSTDay'])
replace_missing(df20['LSTNight'])

replace_missing(df50['NDVI'])
replace_missing(df50['EVI'])
replace_missing(df50['LSTDay'])
replace_missing(df50['LSTNight'])

replace_missing(df100['NDVI'])
replace_missing(df100['EVI'])
replace_missing(df100['LSTDay'])
replace_missing(df100['LSTNight'])


df5 = df5.dropna()
df10 = df10.dropna()
df20 = df20.dropna()
df50 = df50.dropna()
df100 = df100.dropna()

In [6]:
df5.pop("BD")
df10.pop("BD")
df20.pop("BD")
df50.pop("Precip")
df100.pop("BD")
df100.pop("Precip")

raw5 = df5.copy()
raw10 = df10.copy()
raw20 = df20.copy()
raw50 = df50.copy()
raw100 = df100.copy()

raw5.pop("InSitu")
raw10.pop("InSitu")
raw20.pop("InSitu")
raw50.pop("InSitu")
raw100.pop("InSitu")

2        0.20708
3        0.20679
4        0.20750
5        0.20929
6        0.20937
          ...   
42497    0.39700
42498    0.39700
42499    0.39700
42500    0.39700
42501    0.39711
Name: InSitu, Length: 42500, dtype: float64

In [7]:
def optimizeModel(X, y):
	param_test = {'num_leaves': sp_randint(6, 50),
								'min_child_samples': sp_randint(10, 400),
								'min_child_weight': [1e-1,5e-1,5e-2, 1, 1e1, 1e2],
								'subsample': sp_uniform(loc=0.3, scale=0.7),
								'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
								'reg_alpha':  [0, 1e-1,1e-2, 5e-2,5e-1, 1, 3, 5, 10],
								'reg_lambda': [0, 1e-1,1e-2, 5e-2,5e-1, 1, 3, 5, 10, 25, 50],
								'learning_rate' :[1e-3, 5e-3, 1e-2, 3e-2, 5e-2, 1e-1, 2e-1],
								'max_depth' : [3,5,10,15,20,25,30]}
	
	rgr = lgbm.LGBMRegressor(max_depth=-1, random_state=42, silent=True, metric='mse', n_jobs=-1, n_estimators = 10000)
	
	gs = RandomizedSearchCV(estimator=rgr, param_distributions=param_test, n_iter=15, random_state=314, verbose=True, refit = True)
	gs.fit(X, y)
	print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))
	opt_params = gs.best_params_
        
	return opt_params

In [8]:
# evaluate lightgbm ensemble for regression for 5cm
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from lightgbm import LGBMRegressor
# define dataset
train_size_5 = int(len(df5)*0.9)
train_dataset_5, test_dataset_5 = df5.iloc[:train_size_5], df5.iloc[train_size_5:]

X5_train = np.asarray(train_dataset_5.drop('InSitu', axis = 1))
y5_train = np.asarray(train_dataset_5.loc[:,['InSitu']])
y5_train.reshape(-1, 1)

# standard scaling
input_scaler_5 = StandardScaler().fit(X5_train)
X5_train = input_scaler_5.transform(X5_train)
# output_scaler_5 = StandardScaler().fit(y5_train)
# y5_train = output_scaler_5.transform(y5_train)
# define the model
opt_params = optimizeModel(X5_train, y5_train.ravel())
model = LGBMRegressor()
model.set_params(**opt_params)
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X5_train, y5_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Depth 5 MSE: %.4f (%.4f)' % (np.mean(n_scores), np.std(n_scores)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 12.8min finished


Best score reached: 0.3622856008459382 with params: {'colsample_bytree': 0.6051900543300555, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_samples': 43, 'min_child_weight': 10.0, 'num_leaves': 10, 'reg_alpha': 5, 'reg_lambda': 3, 'subsample': 0.7799099313354487} 
Depth 5 MSE: -0.0021 (0.0001)


In [9]:
# evaluate lightgbm ensemble for regression for 10cm
# define dataset
train_size_10 = int(len(df10)*0.9)
train_dataset_10, test_dataset_10 = df10.iloc[:train_size_10], df10.iloc[train_size_10:]

X10_train = np.asarray(train_dataset_10.drop('InSitu', axis = 1))
y10_train = np.asarray(train_dataset_10.loc[:,['InSitu']])
y10_train.reshape(-1, 1)

# standard scaling
input_scaler_10 = StandardScaler().fit(X10_train)
X10_train = input_scaler_10.transform(X10_train)
# output_scaler_10 = StandardScaler().fit(y10_train)
# y10_train = output_scaler_10.transform(y10_train)
# define the model
opt_params = optimizeModel(X10_train, y10_train.ravel())
model = LGBMRegressor()
model.set_params(**opt_params)
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X10_train, y10_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Depth 10 MSE: %.4f (%.4f)' % (np.mean(n_scores), np.std(n_scores)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 12.5min finished


Best score reached: 0.36918282367269095 with params: {'colsample_bytree': 0.6051900543300555, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_samples': 43, 'min_child_weight': 10.0, 'num_leaves': 10, 'reg_alpha': 5, 'reg_lambda': 3, 'subsample': 0.7799099313354487} 
Depth 10 MSE: -0.0017 (0.0001)


In [10]:
# evaluate lightgbm ensemble for regression for 20cm
# define dataset
train_size_20 = int(len(df20)*0.9)
train_dataset_20, test_dataset_20 = df20.iloc[:train_size_20], df20.iloc[train_size_20:]

X20_train = np.asarray(train_dataset_20.drop('InSitu', axis = 1))
y20_train = np.asarray(train_dataset_20.loc[:,['InSitu']])
y20_train.reshape(-1, 1)

# standard scaling
input_scaler_20 = StandardScaler().fit(X20_train)
X20_train = input_scaler_20.transform(X20_train)
# output_scaler_20 = StandardScaler().fit(y20_train)
# y20_train = output_scaler_20.transform(y20_train)
# define the model
opt_params = optimizeModel(X20_train, y20_train.ravel())
model = LGBMRegressor()
model.set_params(**opt_params)
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X20_train, y20_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Depth 20 MSE: %.4f (%.4f)' % (np.mean(n_scores), np.std(n_scores)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 11.5min finished


Best score reached: 0.28179597071530277 with params: {'colsample_bytree': 0.796013298339504, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 349, 'min_child_weight': 100.0, 'num_leaves': 13, 'reg_alpha': 10, 'reg_lambda': 25, 'subsample': 0.302421687583401} 
Depth 20 MSE: -0.0014 (0.0000)


In [11]:
# evaluate lightgbm ensemble for regression for 50cm
# define dataset
train_size_50 = int(len(df50)*0.9)
train_dataset_50, test_dataset_50 = df50.iloc[:train_size_50], df50.iloc[train_size_50:]

X50_train = np.asarray(train_dataset_50.drop('InSitu', axis = 1))
y50_train = np.asarray(train_dataset_50.loc[:,['InSitu']])
y50_train.reshape(-1, 1)

# standard scaling
input_scaler_50 = StandardScaler().fit(X50_train)
X50_train = input_scaler_50.transform(X50_train)
# output_scaler_50 = StandardScaler().fit(y50_train)
# y50_train = output_scaler_50.transform(y50_train)
# define the model
opt_params = optimizeModel(X50_train, y50_train.ravel())
model = LGBMRegressor()
model.set_params(**opt_params)
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X50_train, y50_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Depth 50 MSE: %.4f (%.4f)' % (np.mean(n_scores), np.std(n_scores)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 11.1min finished


Best score reached: 0.2363158837022728 with params: {'colsample_bytree': 0.44945060352437893, 'learning_rate': 0.01, 'max_depth': 30, 'min_child_samples': 119, 'min_child_weight': 100.0, 'num_leaves': 33, 'reg_alpha': 0.01, 'reg_lambda': 10, 'subsample': 0.6284334424128668} 
Depth 50 MSE: -0.0032 (0.0001)


In [12]:
# evaluate lightgbm ensemble for regression for 100cm
# define dataset
train_size_100 = int(len(df100)*0.9)
train_dataset_100, test_dataset_100 = df100.iloc[:train_size_100], df100.iloc[train_size_100:]

X100_train = np.asarray(train_dataset_100.drop('InSitu', axis = 1))
y100_train = np.asarray(train_dataset_100.loc[:,['InSitu']])
y100_train.reshape(-1, 1)

# standard scaling
input_scaler_100 = StandardScaler().fit(X100_train)
X100_train = input_scaler_100.transform(X100_train)
# output_scaler_100 = StandardScaler().fit(y100_train)
# y100_train = output_scaler_100.transform(y100_train)
# define the model
opt_params = optimizeModel(X100_train, y100_train.ravel())
model = LGBMRegressor()
model.set_params(**opt_params)
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X100_train, y100_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Depth 100 MSE: %.4f (%.4f)' % (np.mean(n_scores), np.std(n_scores)))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 10.3min finished


Best score reached: 0.006289098196931619 with params: {'colsample_bytree': 0.6316033217250339, 'learning_rate': 0.005, 'max_depth': 25, 'min_child_samples': 64, 'min_child_weight': 100.0, 'num_leaves': 8, 'reg_alpha': 5, 'reg_lambda': 3, 'subsample': 0.7093362205925597} 
Depth 100 MSE: -0.0073 (0.0001)
