# Feature engineering for neural network input data

In [1]:
import statistics
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Path where to store processed data
save_path = "../data/processed/"

## 1. Import merged data - full data set

In [3]:
# Read data
data_df = pd.read_pickle("../data/interim/vtdata.pkl")

# Load data, 'pandas.category' type for categorical variables for Random Forest
vt_X = data_df.drop(columns=["vt"])
vt_y = data_df["vt"]

print(vt_X.shape)
print(vt_y.shape)

(22173, 97)
(22173,)


In [4]:
# Print column rows for validation
_ = [print(x, end=" ; ") for x in vt_X.columns.values]

x ; y ; plot_id ; B11_median_comp_Sent2 ; B12_median_comp_Sent2 ; B1_median_comp_Sent2 ; B2_median_comp_Sent2 ; B3_median_comp_Sent2 ; B4_median_comp_Sent2 ; B5_median_comp_Sent2 ; B6_median_comp_Sent2 ; B7_median_comp_Sent2 ; B8A_median_comp_Sent2 ; B8_median_comp_Sent2 ; B9_median_comp_Sent2 ; EVI_greenest_pixel_Sent2 ; EVI_median_comp_Sent2 ; GNDVI_greenest_pixel_Sent2 ; GNDVI_median_comp_Sent2 ; NDMI_greenest_pixel_Sent2 ; NDMI_median_comp_Sent2 ; NDVI_greenest_pixel_Sent2 ; NDVI_median_comp_Sent2 ; SAVI_greenest_pixel_Sent2 ; SAVI_median_comp_Sent2 ; B1_median_comp_Lands7 ; B2_median_comp_Lands7 ; B3_median_comp_Lands7 ; B4_median_comp_Lands7 ; B5_median_comp_Lands7 ; B6_median_comp_Lands7 ; B7_median_comp_Lands7 ; EVI_greenest_pixel_Lands7 ; EVI_median_comp_Lands7 ; GNDVI_greenest_pixel_Lands7 ; GNDVI_median_comp_Lands7 ; NDMI_greenest_pixel_Lands7 ; NDMI_median_comp_Lands7 ; NDVI_greenest_pixel_Lands7 ; NDVI_median_comp_Lands7 ; SAVI_greenest_pixel_Lands7 ; SAVI_median_comp_Land

---
### 1.1 Binarize and factorize class labels
### Binarize

In [5]:
# Binarize class labels
label_binarizer = LabelBinarizer()
vt_y_binarized = label_binarizer.fit_transform(vt_y)

vt_y_binarized_df = pd.DataFrame(
    vt_y_binarized,
    columns=label_binarizer.classes_
).reset_index(drop=True)

vt_y_binarized_df

Unnamed: 0,10ab,10c,11b,12b,12c,1ab,1c,2a,2b,2c,...,6b,7a,7b,7c,8a,8b,8cd,9ad,9bc,9e
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22168,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
22169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
22170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22171,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Save DataFrame to pickle
vt_y_binarized_df.to_pickle(save_path+"vt_y_bin.pkl")

### Factorize

In [7]:
vt_y_factorized, vt_y_unique = pd.factorize(vt_y, sort=True)

In [8]:
vt_y_fact_df = pd.DataFrame(
    data=vt_y_factorized,
    columns=["vt_integer"],
    index=vt_y_unique[vt_y_factorized]
)
vt_y_fact_df

Unnamed: 0,vt_integer
8b,26
2ef,11
2ef,11
2ef,11
6a,20
...,...
2b,8
8a,25
4c,16
11b,2


In [9]:
# Save DataFrame to pickle
vt_y_fact_df.to_pickle(save_path+"vt_y_fact.pkl")

---
### 1.2 Scale numerical features

In [10]:
# Subset numeric features
vt_X_num = vt_X.select_dtypes(
    exclude='category'
).drop(
    columns=["x", "y", "plot_id"]
).reset_index(drop=True)

# Print for testing
_ = [print(x, end=" ; ") for x in vt_X_num.columns.values]

B11_median_comp_Sent2 ; B12_median_comp_Sent2 ; B1_median_comp_Sent2 ; B2_median_comp_Sent2 ; B3_median_comp_Sent2 ; B4_median_comp_Sent2 ; B5_median_comp_Sent2 ; B6_median_comp_Sent2 ; B7_median_comp_Sent2 ; B8A_median_comp_Sent2 ; B8_median_comp_Sent2 ; B9_median_comp_Sent2 ; EVI_greenest_pixel_Sent2 ; EVI_median_comp_Sent2 ; GNDVI_greenest_pixel_Sent2 ; GNDVI_median_comp_Sent2 ; NDMI_greenest_pixel_Sent2 ; NDMI_median_comp_Sent2 ; NDVI_greenest_pixel_Sent2 ; NDVI_median_comp_Sent2 ; SAVI_greenest_pixel_Sent2 ; SAVI_median_comp_Sent2 ; B1_median_comp_Lands7 ; B2_median_comp_Lands7 ; B3_median_comp_Lands7 ; B4_median_comp_Lands7 ; B5_median_comp_Lands7 ; B6_median_comp_Lands7 ; B7_median_comp_Lands7 ; EVI_greenest_pixel_Lands7 ; EVI_median_comp_Lands7 ; GNDVI_greenest_pixel_Lands7 ; GNDVI_median_comp_Lands7 ; NDMI_greenest_pixel_Lands7 ; NDMI_median_comp_Lands7 ; NDVI_greenest_pixel_Lands7 ; NDVI_median_comp_Lands7 ; SAVI_greenest_pixel_Lands7 ; SAVI_median_comp_Lands7 ; aspect ; bioc

In [11]:
# Use MinMax scaler
minmax_scaler = MinMaxScaler()
vt_X_num_scaled = minmax_scaler.fit_transform(vt_X_num)

In [12]:
vt_X_num_scaled_df = pd.DataFrame(vt_X_num_scaled, columns=vt_X_num.columns)
vt_X_num_scaled_df

Unnamed: 0,B11_median_comp_Sent2,B12_median_comp_Sent2,B1_median_comp_Sent2,B2_median_comp_Sent2,B3_median_comp_Sent2,B4_median_comp_Sent2,B5_median_comp_Sent2,B6_median_comp_Sent2,B7_median_comp_Sent2,B8A_median_comp_Sent2,...,tmax_6,tmax_8,tmax_9,tmin_5,tmin_9,topographic_wetness_index,total_insolation,valley_depth,vertical_distance_to_channel_network,visible_sky
0,0.208990,0.161059,0.022034,0.021597,0.034284,0.035148,0.065702,0.131269,0.160046,0.202123,...,0.810852,0.839939,0.637813,0.582242,0.453646,0.220136,0.596502,0.333756,0.005423,0.867498
1,0.331650,0.355519,0.097631,0.057903,0.064895,0.072655,0.088258,0.125437,0.146172,0.185270,...,0.213769,0.253153,0.152620,0.205234,0.309951,0.126367,0.612365,0.121213,0.227601,0.793475
2,0.267768,0.217108,0.029225,0.024672,0.036204,0.039534,0.068390,0.151038,0.184555,0.233189,...,0.681638,0.506976,0.410214,0.535862,0.415151,0.259111,0.710934,0.084897,0.027481,0.937970
3,0.254157,0.201629,0.016871,0.018922,0.029519,0.035414,0.069767,0.125799,0.152933,0.200182,...,0.571245,0.526518,0.345568,0.321360,0.407100,0.484918,0.551638,0.040774,0.039700,0.938588
4,0.169200,0.122281,0.023693,0.019591,0.032034,0.025614,0.057309,0.142801,0.173921,0.215458,...,0.867838,0.691886,0.601649,0.811609,0.650339,0.150468,0.766644,0.239576,0.030171,0.878294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22168,0.308600,0.250346,0.033005,0.033231,0.046992,0.051294,0.089701,0.172125,0.211388,0.266042,...,0.200070,0.274446,0.302413,0.215339,0.391405,0.145284,0.725349,0.016396,0.034928,0.915887
22169,0.122364,0.086599,0.021757,0.016669,0.021643,0.016498,0.035474,0.111433,0.142616,0.177504,...,0.734926,0.755738,0.623664,0.875742,0.808021,0.233836,0.751023,0.115542,0.020473,0.852929
22170,0.213874,0.147047,0.014658,0.016315,0.037263,0.019401,0.069767,0.223262,0.273364,0.329028,...,0.698084,0.517333,0.445081,0.449423,0.432992,0.180621,0.604713,0.195726,0.011264,0.746713
22171,0.231985,0.168065,0.015580,0.019791,0.035608,0.028039,0.070226,0.186689,0.238644,0.301302,...,0.746137,0.848106,0.877475,0.853121,0.858409,0.219107,0.793266,0.297835,0.018384,0.847341


In [13]:
# Print min/max/mean for each feature for testing
tmp_df = vt_X_num_scaled_df.round(decimals=4)

for column in tmp_df:
    print(f"{column} -- max: {max(tmp_df[column])}; "
          + f"min: {min(tmp_df[column])},; mean: {round(statistics.mean(tmp_df[column]), 3)}")

B11_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.213
B12_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.182
B1_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.043
B2_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.036
B3_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.047
B4_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.046
B5_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.073
B6_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.143
B7_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.173
B8A_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.214
B8_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.195
B9_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.257
EVI_greenest_pixel_Sent2 -- max: 1.0; min: 0.0,; mean: 0.254
EVI_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.738
GNDVI_greenest_pixel_Sent2 -- max: 1.0; min: 0.0,; mean: 0.758
GNDVI_median_comp_Sent2 -- max: 1.0; min: 0.0,; mean: 0.764
NDMI_greenest_pixel_Sent2 -- max: 1.0; min: 0.0,; mean: 0.587
NDMI_medi

In [14]:
# Save DataFrame to pickle
vt_X_num_scaled_df.to_pickle(save_path+"vt_X_minmax_scaler_allbands.pkl")

---
### 1.3 One-hot-encode categorical variables

In [15]:
# Subset categorical features
vt_X_cat = vt_X.select_dtypes(include='category').reset_index(drop=True)

vt_X_cat

Unnamed: 0,ar50_artype,ar50_skogbon,ar50_treslag,ar50_veg,corine_lc_2012,geo_berggrunn,geo_grunnvann,geo_infiltr_evne,geo_losmasse,geology_norge
0,60,11,39,98,412,9,1,1,1,2
1,50,98,39,52,333,7,1,1,3,2
2,60,11,39,98,322,22,1,1,1,3
3,81,98,98,98,322,12,2,2,4,1
4,30,13,31,98,324,18,1,4,7,1
...,...,...,...,...,...,...,...,...,...,...
22168,50,98,39,54,333,10,1,4,7,3
22169,30,18,31,98,312,5,2,2,4,1
22170,30,99,99,98,311,32,2,2,4,1
22171,20,98,98,98,243,19,1,4,7,1


In [16]:
# Get one-hot-encoded variables
vt_X_cat_onehot = pd.get_dummies(vt_X_cat)
vt_X_cat_onehot

Unnamed: 0,ar50_artype_10,ar50_artype_20,ar50_artype_30,ar50_artype_50,ar50_artype_60,ar50_artype_70,ar50_artype_81,ar50_artype_82,ar50_skogbon_11,ar50_skogbon_12,...,geo_losmasse_23,geo_losmasse_24,geo_losmasse_25,geo_losmasse_28,geo_losmasse_35,geo_losmasse_38,geo_losmasse_44,geology_norge_1,geology_norge_2,geology_norge_3
0,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22168,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
22169,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
22170,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
22171,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


---
### 1.4 Merge DataFrames and save

In [17]:
vt_X_merged = pd.concat([vt_X.loc[:,["x","y","plot_id"]].reset_index(drop=True),
                         vt_X_num_scaled_df, vt_X_cat_onehot], axis=1)
print(vt_X_merged.shape)
vt_X_merged

(22173, 210)


Unnamed: 0,x,y,plot_id,B11_median_comp_Sent2,B12_median_comp_Sent2,B1_median_comp_Sent2,B2_median_comp_Sent2,B3_median_comp_Sent2,B4_median_comp_Sent2,B5_median_comp_Sent2,...,geo_losmasse_23,geo_losmasse_24,geo_losmasse_25,geo_losmasse_28,geo_losmasse_35,geo_losmasse_38,geo_losmasse_44,geology_norge_1,geology_norge_2,geology_norge_3
0,4.139482e+05,7.200240e+06,2842,0.208990,0.161059,0.022034,0.021597,0.034284,0.035148,0.065702,...,False,False,False,False,False,False,False,False,True,False
1,1.804726e+05,6.821747e+06,1521,0.331650,0.355519,0.097631,0.057903,0.064895,0.072655,0.088258,...,False,False,False,False,False,False,False,False,True,False
2,3.237466e+05,6.947909e+06,2328,0.267768,0.217108,0.029225,0.024672,0.036204,0.039534,0.068390,...,False,False,False,False,False,False,False,False,False,True
3,8.276886e+05,7.758279e+06,5173,0.254157,0.201629,0.016871,0.018922,0.029519,0.035414,0.069767,...,False,False,False,False,False,False,False,True,False,False
4,1.805536e+05,6.605858e+06,1509,0.169200,0.122281,0.023693,0.019591,0.032034,0.025614,0.057309,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22168,7.270007e+04,6.696251e+06,914,0.308600,0.250346,0.033005,0.033231,0.046992,0.051294,0.089701,...,False,False,False,False,False,False,False,False,False,True
22169,7.268746e+04,6.480263e+06,902,0.122364,0.086599,0.021757,0.016669,0.021643,0.016498,0.035474,...,False,False,False,False,False,False,False,True,False,False
22170,1.007973e+06,7.865903e+06,6179,0.213874,0.147047,0.014658,0.016315,0.037263,0.019401,0.069767,...,False,False,False,False,False,False,False,True,False,False
22171,-1.733582e+04,6.533936e+06,405,0.231985,0.168065,0.015580,0.019791,0.035608,0.028039,0.070226,...,False,False,False,False,False,False,False,True,False,False


In [18]:
# Print all columns
_ = [print(col, end=" ; ") for col in vt_X_merged.columns]

x ; y ; plot_id ; B11_median_comp_Sent2 ; B12_median_comp_Sent2 ; B1_median_comp_Sent2 ; B2_median_comp_Sent2 ; B3_median_comp_Sent2 ; B4_median_comp_Sent2 ; B5_median_comp_Sent2 ; B6_median_comp_Sent2 ; B7_median_comp_Sent2 ; B8A_median_comp_Sent2 ; B8_median_comp_Sent2 ; B9_median_comp_Sent2 ; EVI_greenest_pixel_Sent2 ; EVI_median_comp_Sent2 ; GNDVI_greenest_pixel_Sent2 ; GNDVI_median_comp_Sent2 ; NDMI_greenest_pixel_Sent2 ; NDMI_median_comp_Sent2 ; NDVI_greenest_pixel_Sent2 ; NDVI_median_comp_Sent2 ; SAVI_greenest_pixel_Sent2 ; SAVI_median_comp_Sent2 ; B1_median_comp_Lands7 ; B2_median_comp_Lands7 ; B3_median_comp_Lands7 ; B4_median_comp_Lands7 ; B5_median_comp_Lands7 ; B6_median_comp_Lands7 ; B7_median_comp_Lands7 ; EVI_greenest_pixel_Lands7 ; EVI_median_comp_Lands7 ; GNDVI_greenest_pixel_Lands7 ; GNDVI_median_comp_Lands7 ; NDMI_greenest_pixel_Lands7 ; NDMI_median_comp_Lands7 ; NDVI_greenest_pixel_Lands7 ; NDVI_median_comp_Lands7 ; SAVI_greenest_pixel_Lands7 ; SAVI_median_comp_Land

In [19]:
# No NA values?
vt_X_merged.isna().sum().sum()

0

In [20]:
_ = [print(f"{x}: {y}") for x, y in zip(vt_X_merged.columns, vt_X_merged.dtypes)]

x: float64
y: float64
plot_id: int64
B11_median_comp_Sent2: float64
B12_median_comp_Sent2: float64
B1_median_comp_Sent2: float64
B2_median_comp_Sent2: float64
B3_median_comp_Sent2: float64
B4_median_comp_Sent2: float64
B5_median_comp_Sent2: float64
B6_median_comp_Sent2: float64
B7_median_comp_Sent2: float64
B8A_median_comp_Sent2: float64
B8_median_comp_Sent2: float64
B9_median_comp_Sent2: float64
EVI_greenest_pixel_Sent2: float64
EVI_median_comp_Sent2: float64
GNDVI_greenest_pixel_Sent2: float64
GNDVI_median_comp_Sent2: float64
NDMI_greenest_pixel_Sent2: float64
NDMI_median_comp_Sent2: float64
NDVI_greenest_pixel_Sent2: float64
NDVI_median_comp_Sent2: float64
SAVI_greenest_pixel_Sent2: float64
SAVI_median_comp_Sent2: float64
B1_median_comp_Lands7: float64
B2_median_comp_Lands7: float64
B3_median_comp_Lands7: float64
B4_median_comp_Lands7: float64
B5_median_comp_Lands7: float64
B6_median_comp_Lands7: float64
B7_median_comp_Lands7: float64
EVI_greenest_pixel_Lands7: float64
EVI_median_co

In [21]:
# Save DataFrame to pickle
vt_X_merged.to_pickle(save_path+"vt_X_scaled_and_dummies_allbands.pkl")