In [1]:
from pathlib import Path
import os

DATA_PATH = Path.cwd().parent / "data"

EOS_PATH = DATA_PATH / "EOS04 EDITED.xlsx"
SENTINEL_PATH = DATA_PATH / "SENTINEL Edited.xlsx"

os.path.isfile(EOS_PATH), os.path.isfile(SENTINEL_PATH)

(True, True)

In [2]:
import pandas as pd

eos = pd.ExcelFile(EOS_PATH)
sentinel = pd.ExcelFile(SENTINEL_PATH)

eos.sheet_names, sentinel.sheet_names

(['28-08-2022',
  '11-09-2022',
  '15-10-2022',
  '24-01-2023',
  '11-02-2023 ',
  '28-2-2023'],
 ['27-08-2022',
  '08-09-2022',
  '14-10-2022',
  '30-01-2023',
  '11-02-2023 ',
  '28-2-2023'])

# One Big Class For Experiments

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)


class ModelExperiments:
    def __init__(self, X_train, X_test, y_train, y_test, 
                 X_train_scaled=None, X_test_scaled=None, 
                 y_train_scaled=None, y_test_scaled=None):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X_train_scaled = X_train_scaled
        self.X_test_scaled = X_test_scaled
        self.y_train_scaled = y_train_scaled
        self.y_test_scaled = y_test_scaled

        self.y_train = self.y_train.values.flatten()
        self.y_test = self.y_test.values.flatten()

        self.y_test_scaled = self.y_test_scaled.flatten()
        self.y_train_scaled = self.y_train_scaled.flatten()

    def fit_grid_search(self, model, param_grid, scaled=False, model_name="Model"):
        """
        Runs GridSearchCV for a given model + param_grid
        """
        print(f"\n=== Running {model_name} ===")
        
        X_train = self.X_train_scaled if scaled else self.X_train
        X_test = self.X_test_scaled if scaled else self.X_test
        y_train = self.y_train_scaled if scaled else self.y_train
        y_test = self.y_test_scaled if scaled else self.y_test

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=3,
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X_train, y_train)

        print("Best Parameters:", grid_search.best_params_)

        best_model = grid_search.best_estimator_
        print("Test R2 Score:", best_model.score(X_test, y_test))
        y_preds = best_model.predict(X_test)
        return self.make_result_dict(y_test, y_preds)
    
    def make_result_dict(self, y_true, y_preds):
        result_dict = {}

        result_dict['MAE'] = mean_absolute_error(y_true, y_preds)
        result_dict['MSE'] = mean_squared_error(y_true, y_preds)
        result_dict['RMSE'] = root_mean_squared_error(y_true, y_preds)
        result_dict['R2'] =  r2_score(y_true, y_preds)

        return result_dict

    def run_all(self):
        """
        Runs all experiments: RF, XGB, AdaBoost, SVR
        """
        results = {}

        # Random Forest
        rf = RandomForestRegressor(random_state=10)
        rf_param_grid = {
            'n_estimators': [100, 200, 500],     
            'max_depth': [None, 5, 10, 20],      
            'min_samples_split': [2, 5, 10],     
            'min_samples_leaf': [1, 2, 4],       
            'max_features': ['sqrt', 'log2']     
        }
        results["RandomForest"] = self.fit_grid_search(rf, rf_param_grid, model_name="RandomForest")

        # XGBoost
        xgb = XGBRegressor(random_state=10, objective='reg:squarederror')
        xgb_param_grid = {
            'n_estimators': [100, 200, 500],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.05, 0.1],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
        results["XGBoost"] = self.fit_grid_search(xgb, xgb_param_grid, model_name="XGBoost")

        # AdaBoost
        ada = AdaBoostRegressor(
            estimator=DecisionTreeRegressor(random_state=10),
            random_state=10
        )
        ada_param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.05, 0.1, 1.0],
            'estimator__max_depth': [2, 3, 5, None],
            'estimator__min_samples_split': [2, 5, 10]
        }
        results["AdaBoost"] = self.fit_grid_search(ada, ada_param_grid, model_name="AdaBoost")

        # SVR (requires scaled data!)
        svr = SVR()
        svr_param_grid = {
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.01, 0.1, 1],
            'epsilon': [0.01, 0.1, 0.2, 0.5]
        }
        if self.X_train_scaled is not None:  # only run if scaled data provided
            results["SVR"] = self.fit_grid_search(svr, svr_param_grid, scaled=True, model_name="SVR")

        return results


# Only Sentinel Data

In [5]:
sentinel_dfs = []

for sheet_name in sentinel.sheet_names:
    df = pd.read_excel(SENTINEL_PATH, sheet_name=sheet_name)
    if 'Latitude (Centre of grid).1' in df.columns:
        df = df.drop(['Latitude (Centre of grid).1'], axis=1)
    df = df.rename(columns={'(θ)': 'angle'})
    sentinel_dfs.append(df)

sentinel_combined = pd.concat(sentinel_dfs, ignore_index=True)

sentinel_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),VH-pol,VV-pol,angle,SM1 (%)
0,2022-08-08,22.526048,72.765011,-16.375600,-10.590500,40.048800,30.5
1,2022-08-08,22.525481,72.765028,-16.244300,-10.634800,41.349100,46.9
2,2022-08-08,22.525999,72.765663,-16.821400,-9.816820,42.283900,18.1
3,2022-08-08,22.527290,72.764707,-16.003700,-10.809500,42.692300,34.4
4,2022-08-08,22.527874,72.764718,-16.637400,-10.626300,43.860500,41.1
...,...,...,...,...,...,...,...
941,2023-02-28,22.523640,72.766727,-14.539132,-10.738241,41.797688,26.8
942,2023-02-28,22.523657,72.766016,-14.474763,-10.252501,41.254852,32.8
943,2023-02-28,22.524255,72.766024,-16.174562,-11.142242,42.046009,34.8
944,2023-02-28,22.524229,72.766601,-16.532534,-11.246845,41.706707,27.8


In [6]:
sentinel_combined['Day'] = sentinel_combined['Sample Date & Time'].dt.day
sentinel_combined['Month'] = sentinel_combined['Sample Date & Time'].dt.month
sentinel_combined['Year'] = sentinel_combined['Sample Date & Time'].dt.year

In [7]:
sentinel_combined = sentinel_combined.dropna()

## VH, VV, angle

In [None]:
X_cols = ['VH-pol', 'VV-pol', 'angle']
y_col = ['SM1 (%)']

X = sentinel_combined[X_cols]
y = sentinel_combined[y_col]

TRAIN_SIZE = 0.8

split_idx = int(len(X) * TRAIN_SIZE)

X_train, X_test = X.iloc[: split_idx], X.iloc[split_idx: ]
y_train, y_test = y.iloc[: split_idx], y.iloc[split_idx: ]

len(X_train), len(X_test)

(721, 181)

In [None]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [19]:
warnings.filterwarnings('always', category=FutureWarning)
sent_vv_vh_angle = ModelExperiments(X_train, X_test, y_train, y_test,
                    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled).run_all()

sent_vv_vh_angle


=== Running RandomForest ===
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Test R2 Score: -0.12419332552148332

=== Running XGBoost ===
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Test R2 Score: -0.106412190432156

=== Running AdaBoost ===
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 3, 'estimator__min_samples_split': 2, 'learning_rate': 0.1, 'n_estimators': 200}
Test R2 Score: -0.06356310889106287

=== Running SVR ===
Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 10, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
Test R2 Score: -0.12875996535854006


{'RandomForest': {'MAE': 9.235245851072932,
  'MSE': 123.7550404342748,
  'RMSE': 11.124524279009634,
  'R2': -0.12419332552148332},
 'XGBoost': {'MAE': 9.162773408415568,
  'MSE': 121.79763236042216,
  'RMSE': 11.036196462569075,
  'R2': -0.106412190432156},
 'AdaBoost': {'MAE': 9.003120199315553,
  'MSE': 117.08065913321528,
  'RMSE': 10.820381653768747,
  'R2': -0.06356310889106287},
 'SVR': {'MAE': 0.032897406811686125,
  'MSE': 0.0015725403585216211,
  'RMSE': 0.03965526898813852,
  'R2': -0.12875996535854006}}

## VV, VH

In [21]:
X_cols = ['VH-pol', 'VV-pol']
y_col = ['SM1 (%)']

X = sentinel_combined[X_cols]
y = sentinel_combined[y_col]

TRAIN_SIZE = 0.8

split_idx = int(len(X) * TRAIN_SIZE)

X_train, X_test = X.iloc[: split_idx], X.iloc[split_idx: ]
y_train, y_test = y.iloc[: split_idx], y.iloc[split_idx: ]

len(X_train), len(X_test)

(721, 181)

In [22]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [None]:
# warnings.filterwarnings('always', category=FutureWarning)
sent_vv_vh = ModelExperiments(X_train, X_test, y_train, y_test,
                    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled).run_all()

sent_vv_vh


=== Running RandomForest ===
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Test R2 Score: -0.13711664977999827

=== Running XGBoost ===
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Test R2 Score: -0.07659471035003662

=== Running AdaBoost ===
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 3, 'estimator__min_samples_split': 10, 'learning_rate': 1.0, 'n_estimators': 50}
Test R2 Score: -0.15220357149815533

=== Running SVR ===
Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Test R2 Score: -0.31303685667649916


{'RandomForest': RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, random_state=10),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, ...),
 'AdaBoost': AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=3,


## VV, VH, angle, day, month, year

In [36]:
X_cols = ['VH-pol', 'VV-pol', 'angle', 'Day', 'Month', 'Year']
y_col = ['SM1 (%)']

X = sentinel_combined[X_cols]
y = sentinel_combined[y_col]

TRAIN_SIZE = 0.8

split_idx = int(len(X) * TRAIN_SIZE)

X_train, X_test = X.iloc[: split_idx], X.iloc[split_idx: ]
y_train, y_test = y.iloc[: split_idx], y.iloc[split_idx: ]

len(X_train), len(X_test)

(721, 181)

In [37]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [38]:
warnings.filterwarnings('always', category=FutureWarning)
sent_vv_vh_angle_dmy = ModelExperiments(X_train, X_test, y_train, y_test,
                    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled).run_all()

sent_vv_vh_angle_dmy


=== Running RandomForest ===
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
Test R2 Score: -0.18304661755876372

=== Running XGBoost ===
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Test R2 Score: -0.09249842166900635

=== Running AdaBoost ===
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 5, 'estimator__min_samples_split': 2, 'learning_rate': 1.0, 'n_estimators': 100}
Test R2 Score: -0.3238648811035951

=== Running SVR ===
Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 1, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}
Test R2 Score: -0.19253638204632217


{'RandomForest': RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_split=5,
                       n_estimators=500, random_state=10),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, ...),
 'AdaBoost': AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=5, ran

# Only EOS Data

In [8]:
eos_dfs = []

for sheet_name in eos.sheet_names:
    df = pd.read_excel(EOS_PATH, sheet_name=sheet_name)
    eos_dfs.append(df)

eos_combined = pd.concat(eos_dfs, ignore_index=True)

eos_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%)
0,2022-08-28,22.526048,72.765011,-4.99884,-13.33651,30.5
1,2022-08-28,22.525481,72.765028,-8.76236,-16.08855,46.9
2,2022-08-28,22.525999,72.765663,-7.11428,-11.90641,18.1
3,2022-08-28,22.527290,72.764707,-8.32358,-15.11733,34.4
4,2022-08-28,22.527874,72.764718,-5.27314,-15.93518,41.1
...,...,...,...,...,...,...
892,2023-02-28,22.523640,72.766727,-8.69274,-14.15688,26.8
893,2023-02-28,22.523657,72.766016,-6.48271,-15.56076,32.8
894,2023-02-28,22.524255,72.766024,-6.20660,-14.73141,34.8
895,2023-02-28,22.524229,72.766601,-9.43414,-20.52987,27.8


In [9]:
eos_combined['Month'] = eos_combined['Sample Date & Time'].dt.month
eos_combined['Day'] = eos_combined['Sample Date & Time'].dt.day
eos_combined['Year'] = eos_combined['Sample Date & Time'].dt.year

## HH, HV

In [41]:
X_cols = ['HH-pol', 'HV-pol']
y_col = ['SM1 (%)']

TRAIN_SIZE = 0.8

X = eos_combined[X_cols]
y = eos_combined[y_col]

split_idx = int(len(X) * TRAIN_SIZE)

X_train, X_test = X.iloc[: split_idx], X.iloc[split_idx: ]
y_train, y_test = y.iloc[: split_idx], y.iloc[split_idx: ]

len(X_train), len(X_test)

(717, 180)

In [42]:
from sklearn.preprocessing import MinMaxScaler

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [44]:
eos_hh_hv = ModelExperiments(X_train, X_test, y_train, y_test,
                             X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled).run_all()


=== Running RandomForest ===
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Test R2 Score: 0.007397434859398366

=== Running XGBoost ===
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Test R2 Score: -0.2304222583770752

=== Running AdaBoost ===
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 2, 'estimator__min_samples_split': 2, 'learning_rate': 1.0, 'n_estimators': 50}
Test R2 Score: 0.024421436193262203

=== Running SVR ===
Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
Test R2 Score: -0.18142027044210884


## HH, HV, Day, Month, Year

In [45]:
X_cols = ['HH-pol', 'HV-pol', 'Day', 'Month', 'Year']
y_col = ['SM1 (%)']

TRAIN_SIZE = 0.8

X = eos_combined[X_cols]
y = eos_combined[y_col]

split_idx = int(len(X) * TRAIN_SIZE)

X_train, X_test = X.iloc[: split_idx], X.iloc[split_idx: ]
y_train, y_test = y.iloc[: split_idx], y.iloc[split_idx: ]

len(X_train), len(X_test)

(717, 180)

In [46]:
from sklearn.preprocessing import MinMaxScaler

X_train_scaled = mm.fit_transform(X_train)
X_test_scaled = mm.transform(X_test)

y_train_scaled = mm.fit_transform(y_train)
y_test_scaled = mm.transform(y_test)

In [47]:
eos_hh_hv_dmy = ModelExperiments(X_train, X_test, y_train, y_test,
                             X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled).run_all()


=== Running RandomForest ===
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Test R2 Score: -0.01605000034572779

=== Running XGBoost ===
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Test R2 Score: -0.1027759313583374

=== Running AdaBoost ===
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters: {'estimator__max_depth': 2, 'estimator__min_samples_split': 2, 'learning_rate': 1.0, 'n_estimators': 50}
Test R2 Score: -0.10198427533640286

=== Running SVR ===
Fitting 3 folds for each of 320 candidates, totalling 960 fits
Best Parameters: {'C': 100, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'linear'}
Test R2 Score: -1.231915905531031


# EOS and Sentinel Data Combined

In [10]:
eos.sheet_names, sentinel.sheet_names

(['28-08-2022',
  '11-09-2022',
  '15-10-2022',
  '24-01-2023',
  '11-02-2023 ',
  '28-2-2023'],
 ['27-08-2022',
  '08-09-2022',
  '14-10-2022',
  '30-01-2023',
  '11-02-2023 ',
  '28-2-2023'])

In [11]:
sentinel_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),VH-pol,VV-pol,angle,SM1 (%),Day,Month,Year
0,2022-08-08,22.526048,72.765011,-16.375600,-10.590500,40.048800,30.5,8,8,2022
1,2022-08-08,22.525481,72.765028,-16.244300,-10.634800,41.349100,46.9,8,8,2022
2,2022-08-08,22.525999,72.765663,-16.821400,-9.816820,42.283900,18.1,8,8,2022
3,2022-08-08,22.527290,72.764707,-16.003700,-10.809500,42.692300,34.4,8,8,2022
4,2022-08-08,22.527874,72.764718,-16.637400,-10.626300,43.860500,41.1,8,8,2022
...,...,...,...,...,...,...,...,...,...,...
941,2023-02-28,22.523640,72.766727,-14.539132,-10.738241,41.797688,26.8,28,2,2023
942,2023-02-28,22.523657,72.766016,-14.474763,-10.252501,41.254852,32.8,28,2,2023
943,2023-02-28,22.524255,72.766024,-16.174562,-11.142242,42.046009,34.8,28,2,2023
944,2023-02-28,22.524229,72.766601,-16.532534,-11.246845,41.706707,27.8,28,2,2023


In [12]:
eos_combined

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%),Month,Day,Year
0,2022-08-28,22.526048,72.765011,-4.99884,-13.33651,30.5,8,28,2022
1,2022-08-28,22.525481,72.765028,-8.76236,-16.08855,46.9,8,28,2022
2,2022-08-28,22.525999,72.765663,-7.11428,-11.90641,18.1,8,28,2022
3,2022-08-28,22.527290,72.764707,-8.32358,-15.11733,34.4,8,28,2022
4,2022-08-28,22.527874,72.764718,-5.27314,-15.93518,41.1,8,28,2022
...,...,...,...,...,...,...,...,...,...
892,2023-02-28,22.523640,72.766727,-8.69274,-14.15688,26.8,2,28,2023
893,2023-02-28,22.523657,72.766016,-6.48271,-15.56076,32.8,2,28,2023
894,2023-02-28,22.524255,72.766024,-6.20660,-14.73141,34.8,2,28,2023
895,2023-02-28,22.524229,72.766601,-9.43414,-20.52987,27.8,2,28,2023


In [28]:
import pandas as pd

# Example: assume you already loaded your Excel files
# eos = pd.ExcelFile("eos_data.xlsx")
# sentinel = pd.ExcelFile("sentinel_data.xlsx")

# Step 1: Extract sheet names and standardize them to datetime
def clean_dates(sheet_names):
    dates = []
    for s in sheet_names:
        s = s.strip().replace(" ", "")  # remove trailing spaces
        # Handle single-digit months like '28-2-2023'
        try:
            dt = pd.to_datetime(s, format="%d-%m-%Y")
        except:
            dt = pd.to_datetime(s, dayfirst=True)  # fallback
        dates.append(dt)
    return dates

eos_dates = clean_dates(eos.sheet_names)
sentinel_dates = clean_dates(sentinel.sheet_names)

# Step 2: Match dates within ±2 days
pairs = []
for e_date, e_sheet in zip(eos_dates, eos.sheet_names):
    for s_date, s_sheet in zip(sentinel_dates, sentinel.sheet_names):
        if abs((e_date - s_date).days) <= 2:
            pairs.append((e_sheet, s_sheet))

print("Matched pairs (EOS, Sentinel):")
for p in pairs:
    print(p)

Matched pairs (EOS, Sentinel):
('28-08-2022', '27-08-2022')
('15-10-2022', '14-10-2022')
('11-02-2023 ', '11-02-2023 ')
('28-2-2023', '28-2-2023')


In [39]:
import pandas as pd

combined_dfs = {}

for eos_sheet in eos.sheet_names:
    eos_date = pd.to_datetime(eos_sheet.strip(), dayfirst=True)
    
    # look for Sentinel sheets within ±2 days
    for sent_sheet in sentinel.sheet_names:
        sent_date = pd.to_datetime(sent_sheet.strip(), dayfirst=True)
        
        if abs((eos_date - sent_date).days) <= 2:
            eos_df = pd.read_excel(EOS_PATH, sheet_name=eos_sheet)
            sent_df = pd.read_excel(SENTINEL_PATH, sheet_name=sent_sheet)
            
            # add source column so you know where data came from
            eos_df["source"] = "EOS"
            sent_df["source"] = "Sentinel"
            
            # concat vertically (stacking rows)
            combined = pd.concat([eos_df, sent_df], ignore_index=True)
            
            key = f"{eos_sheet}_{sent_sheet}"
            combined_dfs[key] = combined

combined_dfs.keys()

dict_keys(['28-08-2022_27-08-2022', '15-10-2022_14-10-2022', '11-02-2023 _11-02-2023 ', '28-2-2023_28-2-2023'])

In [None]:
for key in combined_dfs.keys():
    display(combined_dfs[key])
    print()
    print()

    

Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%),source,VH-pol,VV-pol,(θ)
0,2022-08-28,22.526048,72.765011,-4.99884,-13.33651,30.5,EOS,,,
1,2022-08-28,22.525481,72.765028,-8.76236,-16.08855,46.9,EOS,,,
2,2022-08-28,22.525999,72.765663,-7.11428,-11.90641,18.1,EOS,,,
3,2022-08-28,22.527290,72.764707,-8.32358,-15.11733,34.4,EOS,,,
4,2022-08-28,22.527874,72.764718,-5.27314,-15.93518,41.1,EOS,,,
...,...,...,...,...,...,...,...,...,...,...
295,2022-08-08,22.524262,72.768629,,,28.8,Sentinel,-13.6399,-8.41941,42.5706
296,2022-08-08,22.524252,72.767989,,,44.2,Sentinel,-14.1900,-8.72712,43.6156
297,2022-08-08,22.524049,72.768361,,,35.0,Sentinel,-14.8214,-9.03759,43.1670
298,2022-08-08,22.524049,72.768361,,,37.4,Sentinel,-14.8214,-9.03759,43.1670






Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%),source,Latitude (Centre of grid).1,VH-pol,VV-pol,(θ)
0,2022-10-15,22.526048,72.765011,-7.22510,-15.31476,15.4,EOS,,,,
1,2022-10-15,22.525481,72.765028,-7.12465,-15.65603,11.0,EOS,,,,
2,2022-10-15,22.525423,72.765663,-9.24639,-15.80069,13.1,EOS,,,,
3,2022-10-15,22.525999,72.765663,-8.71673,-17.20822,20.8,EOS,,,,
4,2022-10-15,22.525563,72.765399,-7.06178,-17.07947,12.6,EOS,,,,
...,...,...,...,...,...,...,...,...,...,...,...
324,2022-10-14,22.524255,72.766024,,,13.7,Sentinel,22.524255,-17.8510,-10.1538,41.8686
325,2022-10-14,22.524229,72.766601,,,13.2,Sentinel,22.524229,-19.5258,-12.0423,41.8539
326,2022-10-14,22.524027,72.766357,,,14.4,Sentinel,22.524027,-18.7444,-11.8569,41.6164
327,2022-10-14,,72.766357,,,15.8,Sentinel,22.524027,-18.7444,-11.8569,41.6164






Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%),source,VH-pol,VV-pol,(θ)
0,2023-02-11,22.526048,72.765011,-6.65891,-16.04795,31.7,EOS,,,
1,2023-02-11,22.525481,72.765028,-8.39406,-14.43060,15.0,EOS,,,
2,2023-02-11,22.525423,72.765663,-7.16176,-18.05111,30.4,EOS,,,
3,2023-02-11,22.525999,72.765663,-6.16542,-14.59074,1.9,EOS,,,
4,2023-02-11,22.525563,72.765399,-9.40695,-16.35837,30.4,EOS,,,
...,...,...,...,...,...,...,...,...,...,...
285,2023-02-11,22.523640,72.766727,,,23.0,Sentinel,-14.6779,-9.02415,41.9279
286,2023-02-11,22.523657,72.766016,,,38.7,Sentinel,-14.7466,-8.90438,41.3781
287,2023-02-11,22.524255,72.766024,,,32.1,Sentinel,-14.3410,-6.59922,41.8594
288,2023-02-11,22.524229,72.766601,,,21.2,Sentinel,-14.7369,-6.97648,41.8458






Unnamed: 0,Sample Date & Time,Latitude (Centre of grid),Longitude (Centre of grid),HH-pol,HV-pol,SM1 (%),source,VH-pol,VV-pol,angle
0,2023-02-28,22.526048,72.765011,-5.15094,-15.03994,8.7,EOS,,,
1,2023-02-28,22.525481,72.765028,-6.32535,-17.69331,25.9,EOS,,,
2,2023-02-28,22.525423,72.765663,-7.84002,-14.92221,49.7,EOS,,,
3,2023-02-28,22.525999,72.765663,-8.46383,-17.36857,3.3,EOS,,,
4,2023-02-28,22.525563,72.765399,-4.92177,-14.62923,50.7,EOS,,,
...,...,...,...,...,...,...,...,...,...,...
308,2023-02-28,22.523640,72.766727,,,26.8,Sentinel,-14.539132,-10.738241,41.797688
309,2023-02-28,22.523657,72.766016,,,32.8,Sentinel,-14.474763,-10.252501,41.254852
310,2023-02-28,22.524255,72.766024,,,34.8,Sentinel,-16.174562,-11.142242,42.046009
311,2023-02-28,22.524229,72.766601,,,27.8,Sentinel,-16.532534,-11.246845,41.706707






In [None]:
# Step 3: Merge corresponding sheets
combined_dfs = {}
for e_sheet, s_sheet in pairs:
    df_e = pd.read_excel(EOS_PATH, sheet_name=e_sheet)
    df_s = pd.read_excel(SENTINEL_PATH, sheet_name=s_sheet)

    # Clean sentinel columns like before
    if 'Latitude (Centre of grid).1' in df_s.columns:
        df_s = df_s.drop(['Latitude (Centre of grid).1'], axis=1)
    df_s = df_s.rename(columns={'(\u03b8)': 'angle'})

    # Merge on common spatial + temporal columns
    df_combined = pd.merge(
        df_e,
        df_s,
        on=['Sample Date & Time', 'Latitude (Centre of grid)', 'Longitude (Centre of grid)'],
        suffixes=('_eos', '_sentinel')
    )

    combined_dfs[f"{e_sheet}_{s_sheet}"] = df_combined

print("Combined datasets created:", list(combined_dfs.keys()))


Combined datasets created: ['28-08-2022_27-08-2022', '15-10-2022_14-10-2022', '11-02-2023 _11-02-2023 ', '28-2-2023_28-2-2023']
