In [11]:
from pathlib import Path

DATA_PATH = Path.cwd().parent / 'data'

In [12]:
import pandas as pd

EOS_PATH = DATA_PATH / 'EOS04 EDITED.xlsx'
SENTINEL_PATH = DATA_PATH / 'SENTINEL Edited.xlsx'


eos = pd.ExcelFile(EOS_PATH)
sentinel = pd.ExcelFile(SENTINEL_PATH)


eos.sheet_names, sentinel.sheet_names

(['28-08-2022',
  '11-09-2022',
  '15-10-2022',
  '24-01-2023',
  '11-02-2023 ',
  '28-2-2023'],
 ['27-08-2022',
  '08-09-2022',
  '14-10-2022',
  '30-01-2023',
  '11-02-2023 ',
  '28-2-2023'])

Since the data is spatial (because of presence of latitude and longitude) and temporal (because of presence of dates) in nature, we'll need different approaches to deal with this.

But, firstly, lets take the naive approach and lets concatenate all dates and answer "CAN Lat, Lon, HH and HV predict SM?"

In [7]:
from sklearn.model_selection import GridSearchCV

def fit_grid_search(X_train, X_test, y_train, y_test, model, param_grid):
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)


    print("Best Parameters:", grid_search.best_params_)

    best_model = grid_search.best_estimator_
    print("Best Model Score - ", best_model.score(X_test, y_test))

## EOS Data

In [2]:
eos_dfs = []

for sheet_name in eos.sheet_names:
    df = pd.read_excel(EOS_PATH, sheet_name=sheet_name)
    eos_dfs.append(df)

eos_combined = pd.concat(eos_dfs, ignore_index=True)

eos_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%)
0,2022-08-28,22.526048,72.765011,-4.99884,-13.33651,30.5
1,2022-08-28,22.525481,72.765028,-8.76236,-16.08855,46.9
2,2022-08-28,22.525999,72.765663,-7.11428,-11.90641,18.1
3,2022-08-28,22.527290,72.764707,-8.32358,-15.11733,34.4
4,2022-08-28,22.527874,72.764718,-5.27314,-15.93518,41.1
...,...,...,...,...,...,...
892,2023-02-28,22.523640,72.766727,-8.69274,-14.15688,26.8
893,2023-02-28,22.523657,72.766016,-6.48271,-15.56076,32.8
894,2023-02-28,22.524255,72.766024,-6.20660,-14.73141,34.8
895,2023-02-28,22.524229,72.766601,-9.43414,-20.52987,27.8


In [3]:
eos_combined['Month'] = eos_combined['Sample Date & Time'].dt.month
eos_combined['Day'] = eos_combined['Sample Date & Time'].dt.day
eos_combined['Year'] = eos_combined['Sample Date & Time'].dt.year

In [4]:
X_cols = ['HH-pol', 'HV-pol', 'Day', 'Month', 'Year']
y_col = ['SM1 (%)']

In [6]:
from datetime import datetime

mask = eos_combined['Sample Date & Time'] > datetime(2023, 1, 1)
X = eos_combined[X_cols]
y = eos_combined[y_col]

X_train, X_test = X[mask], X[~mask]
y_train, y_test = y[mask], y[~mask]

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=10)

# taken from gpt
rf_param_grid = {
    'n_estimators': [100, 200, 500],     
    'max_depth': [None, 5, 10, 20],      
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 4],       
    'max_features': ['sqrt', 'log2']     
}

fit_grid_search(X_train, X_test, y_train, y_test, model=rf, param_grid=rf_param_grid)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Model Score -  -0.17819066873550593


In [45]:
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

xgb = XGBRegressor(random_state=10, objective='reg:squarederror')

# taken from gpt
xgb_param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

fit_grid_search(X_train, X_test, y_train, y_test, model=xgb, param_grid=xgb_param_grid)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best Model Score -  -0.015545832012215266


In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

ada = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(random_state=10),
    random_state=10
)

# taken from gpt
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 1.0],
    'estimator__max_depth': [2, 3, 5, None],
    'estimator__min_samples_split': [2, 5, 10]
}

fit_grid_search(X_train, X_test, y_train, y_test, model=ada, param_grid=ada_param_grid)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 5, 'estimator__min_samples_split': 2, 'learning_rate': 1.0, 'n_estimators': 50}
Best Model Score -  -0.10890908793130283


In [9]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [12]:
from sklearn.svm import SVR

svr = SVR()

svr_param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100],         # Regularization strength
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient
    'epsilon': [0.01, 0.1, 0.2, 0.5]  # Margin of tolerance
}

fit_grid_search(X_train_scaled, X_test_scaled, y_train_scaled.flatten(), y_test_scaled.flatten(), model=svr, param_grid=svr_param_grid)

Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 1, 'epsilon': 0.01, 'gamma': 0.1, 'kernel': 'poly'}
Best Model Score -  -807.6610786881246


## Sentinel Data

In [21]:
sentinel_dfs = []

for sheet_name in sentinel.sheet_names:
    df = pd.read_excel(SENTINEL_PATH, sheet_name=sheet_name)
    if 'Latitude (Centre of grid).1' in df.columns:
        df = df.drop(['Latitude (Centre of grid).1'], axis=1)
    df = df.rename(columns={'(θ)': 'angle'})
    sentinel_dfs.append(df)

sentinel_combined = pd.concat(sentinel_dfs, ignore_index=True)

sentinel_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),VH-pol,VV-pol,angle,SM1 (%)
0,2022-08-08,22.526048,72.765011,-16.375600,-10.590500,40.048800,30.5
1,2022-08-08,22.525481,72.765028,-16.244300,-10.634800,41.349100,46.9
2,2022-08-08,22.525999,72.765663,-16.821400,-9.816820,42.283900,18.1
3,2022-08-08,22.527290,72.764707,-16.003700,-10.809500,42.692300,34.4
4,2022-08-08,22.527874,72.764718,-16.637400,-10.626300,43.860500,41.1
...,...,...,...,...,...,...,...
941,2023-02-28,22.523640,72.766727,-14.539132,-10.738241,41.797688,26.8
942,2023-02-28,22.523657,72.766016,-14.474763,-10.252501,41.254852,32.8
943,2023-02-28,22.524255,72.766024,-16.174562,-11.142242,42.046009,34.8
944,2023-02-28,22.524229,72.766601,-16.532534,-11.246845,41.706707,27.8


In [22]:
sentinel_combined['Day'] = sentinel_combined['Sample Date & Time'].dt.day
sentinel_combined['Month'] = sentinel_combined['Sample Date & Time'].dt.month
sentinel_combined['Year'] = sentinel_combined['Sample Date & Time'].dt.year

In [65]:
sentinel_combined = sentinel_combined.dropna()

In [66]:
X_cols = ['Latitude (Centre of grid)', 'Longitude (Centre of grid)', 'VH-pol', 'VV-pol', 'angle']
y_col = ['SM1 (%)']

In [67]:
X = sentinel_combined[X_cols]
y = sentinel_combined[y_col]

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

len(X_train), len(X_test)

(721, 181)

In [69]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=10)

# taken from gpt
rf_param_grid = {
    'n_estimators': [100, 200, 500],     
    'max_depth': [None, 5, 10, 20],      
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 4],       
    'max_features': ['sqrt', 'log2']     
}

fit_grid_search(X_train, X_test, y_train.values.flatten(), y_test.values.flatten(), model=rf, param_grid=rf_param_grid)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 500}
Best Model Score -  -0.07487080036037774


In [70]:
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

xgb = XGBRegressor(random_state=10, objective='reg:squarederror')

# taken from gpt
xgb_param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

fit_grid_search(X_train, X_test, y_train.values.flatten(), y_test.values.flatten(), model=xgb, param_grid=xgb_param_grid)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Model Score -  -0.014610027498297207


In [71]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

ada = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(random_state=10),
    random_state=10
)

# taken from gpt
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 1.0],
    'estimator__max_depth': [2, 3, 5, None],
    'estimator__min_samples_split': [2, 5, 10]
}

fit_grid_search(X_train, X_test, y_train.values.flatten(), y_test.values.flatten(), model=ada, param_grid=ada_param_grid)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 3, 'estimator__min_samples_split': 10, 'learning_rate': 0.1, 'n_estimators': 200}
Best Model Score -  -0.07352010652440288


In [72]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [73]:
from sklearn.svm import SVR

svr = SVR()

svr_param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100],         # Regularization strength
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient
    'epsilon': [0.01, 0.1, 0.2, 0.5]  # Margin of tolerance
}

fit_grid_search(X_train_scaled, X_test_scaled, y_train_scaled.flatten(), y_test_scaled.flatten(), model=svr, param_grid=svr_param_grid)

Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 1, 'epsilon': 0.01, 'gamma': 0.1, 'kernel': 'sigmoid'}
Best Model Score -  -0.10618917371650438
