In [1]:
import pandas as pd
import numpy as np

import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
s1 = pd.read_csv("C:/Users/dutta/Downloads/Lukas_sentinel_project/notebooks/S1_LUCAS_2018_features.csv")

In [3]:
s1.head()

Unnamed: 0,system:index,NUTS2,POINT_ID,VH,VV,VV_VH,crop_class,month,.geo
0,0_00000000000000000000_0,DE11,42482862,-999.0,-999.0,-999.0,,3.0,"{""type"":""MultiPoint"",""coordinates"":[]}"
1,0_00000000000000000001_0,DE11,42702880,-999.0,-999.0,-999.0,,3.0,"{""type"":""MultiPoint"",""coordinates"":[]}"
2,0_00000000000000000002_0,DE11,42702906,-999.0,-999.0,-999.0,,3.0,"{""type"":""MultiPoint"",""coordinates"":[]}"
3,0_00000000000000000003_0,DE11,43142882,-999.0,-999.0,-999.0,,3.0,"{""type"":""MultiPoint"",""coordinates"":[]}"
4,0_00000000000000000004_0,DE11,43302838,-999.0,-999.0,-999.0,,3.0,"{""type"":""MultiPoint"",""coordinates"":[]}"


In [4]:
s1.shape

(81680, 9)

In [5]:
s1.columns

Index(['system:index', 'NUTS2', 'POINT_ID', 'VH', 'VV', 'VV_VH', 'crop_class',
       'month', '.geo'],
      dtype='object')

In [6]:
# Replace fill value with NaN
s1_clean = s1.copy()
s1_clean[["VV", "VH", "VV_VH"]] = s1_clean[["VV", "VH", "VV_VH"]].replace(-999, pd.NA)

In [7]:
# Rename system:index away (we don't need it)
s1 = s1.drop(columns=["system:index", ".geo"], errors="ignore")

In [8]:
# Ensure POINT_ID exists and is the right type
print("Columns before:", s1.columns)

Columns before: Index(['NUTS2', 'POINT_ID', 'VH', 'VV', 'VV_VH', 'crop_class', 'month'], dtype='object')


In [9]:
# Force POINT_ID to string for safe merging later
s1["POINT_ID"] = s1["POINT_ID"].astype(str)

In [10]:
# Replace -999 fill values with NaN
s1[["VV", "VH", "VV_VH"]] = s1[["VV", "VH", "VV_VH"]].replace(-999, pd.NA)


In [11]:
# Keep only relevant columns
s1_clean = s1[["POINT_ID", "crop_class", "NUTS2", "month", "VV", "VH", "VV_VH"]].copy()

In [12]:
# Fix types
s1_clean["month"] = s1_clean["month"].astype(int)

In [13]:
# Drop rows without crop labels
s1_clean = s1_clean.dropna(subset=["crop_class"])

In [14]:
s1_clean.shape

(71176, 7)

In [15]:
s1_clean.head()

Unnamed: 0,POINT_ID,crop_class,NUTS2,month,VV,VH,VV_VH
1313,42402858,barley,DE11,3,,,
1314,42842960,barley,DE11,3,,,
1315,43462850,barley,DE11,3,,,
1316,43262898,barley,DE11,3,,,
1317,42402848,barley,DE11,3,,,


In [16]:
s1_clean.isna().sum()

POINT_ID          0
crop_class        0
NUTS2             0
month             0
VV            70919
VH            71166
VV_VH         71173
dtype: int64

In [17]:
s2_clean = pd.read_csv("C:/Users/dutta/Downloads/Lukas_sentinel_project/notebooks/lucas2018_DE_phase3_clean_new.csv")

In [18]:
# Make sure s2_clean POINT_ID is also string
s2_clean["POINT_ID"] = s2_clean["POINT_ID"].astype(str)

In [19]:
df_combined = s2_clean.merge(
    s1_clean,
    on=["POINT_ID", "month", "crop_class", "NUTS2"],
    how="left"
)

In [20]:
df_combined.shape

(300734, 9)

In [21]:
df_combined.isna().sum()

POINT_ID           0
crop_class         0
NUTS2              0
month              0
NDVI               0
EVI                0
VV            299672
VH            300714
VV_VH         300727
dtype: int64

In [22]:
# How many POINT_IDs overlap?
len(set(s2_clean["POINT_ID"]) & set(s1_clean["POINT_ID"]))

8897

In [23]:
# Fill missing radar with a neutral value (e.g. median or constant)
df_model = df_combined.copy()

for col in ["VV", "VH", "VV_VH"]:
    df_model[col] = df_model[col].fillna(-999)

In [24]:
# Feature matrix and target
X = df_model[["NDVI", "EVI", "month", "VV", "VH", "VV_VH"]].copy()
y = df_model["crop_class"].copy()

In [25]:
# Fix types
X = X.astype({
    "NDVI": "float32",
    "EVI": "float32",
    "month": "int8",
    "VV": "float32",
    "VH": "float32",
    "VV_VH": "float32"
})

X.shape, y.shape

((300734, 6), (300734,))

In [26]:
#Train/split test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((210513, 6), (90221, 6))

In [27]:
#train Random Forest on S1 + S2

rf_s1s2 = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_s1s2.fit(X_train, y_train)


RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)

In [28]:
#evaluate S1 + S2 model

from sklearn.metrics import classification_report

y_pred = rf_s1s2.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      barley       0.20      0.18      0.19     11727
      fodder       0.03      0.02      0.02      1081
       maize       0.48      0.57      0.53     24155
        oats       0.01      0.01      0.01      1114
other_arable       0.08      0.06      0.07      6025
    potatoes       0.01      0.01      0.01       273
    rapeseed       0.08      0.06      0.07      2467
         rye       0.10      0.08      0.09      5941
     soybean       0.02      0.02      0.02       519
  sugar_beet       0.21      0.20      0.21     11979
   sunflower       0.11      0.09      0.10      3441
       wheat       0.34      0.38      0.36     21499

    accuracy                           0.31     90221
   macro avg       0.14      0.14      0.14     90221
weighted avg       0.28      0.31      0.30     90221

