# EDA - Identify significant predictors with RF feature selection
- Edited by Rumi Nakagawa
- Spring 2023 Capstone

## Monthly data
- `data_monthly_v1_0_mydrive.csv`

# 0. Preparation

## Mount google drive
- Make sure that available access is the user's own drive(no access across files in shared folder)

In [67]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
%cd drive/MyDrive/

[Errno 2] No such file or directory: 'drive/MyDrive/'
/content/drive/MyDrive


## Import libraries

In [69]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [70]:
# !pip install dython

In [71]:
!pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
# Import SparkSession
from pyspark.sql import SparkSession
# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Check Spark Session Information
spark

In [73]:
# Import a Spark function from library
from pyspark.sql.functions import col

In [74]:
from pyspark.sql.functions import col
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from pyspark.sql.functions import desc

import geopandas as gpd
import folium

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.ml.classification import RandomForestClassifier
import time
# from dython import nominal

# Apply the default theme
sns.set_theme()


## (to be updated) Get access to blob storage
- Reference from 261

In [75]:
# Put at the top of any notebooks for storing in blob

# from pyspark.sql.functions import col, max

# blob_container = "team06" # The name of your container created in https://portal.azure.com
# storage_account = "apatel" # The name of your Storage account created in https://portal.azure.com
# secret_scope = "team06" # The name of the scope created in your local computer using the Databricks CLI
# secret_key = "team06" # The name of the secret key created in your local computer using the Databricks CLI 
# blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
# mount_path = "/mnt/mids-w261"

## Import csv
sample csv

In [76]:
monthly_df = pd.read_csv("data_monthly_v1_0_mydrive.csv")
# Copied original file. It is needed to store in each user's mydrive


In [77]:
monthly_df

Unnamed: 0,SITE_ID,year,month,TIMESTAMP,dataset,SITE_IGBP,LOCATION_LAT,LOCATION_LONG,TA_F,VPD_F,...,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_IGBP,MODIS_PFT,koppen_sub,koppen,CO2_concentration
0,AR-SLu,2010,1,201001,FLUXNET,MF,-33.46480,-66.45980,28.493,23.378,...,0.000000,0.49,1.2,313.84,293.58,OSH,SH,BSk,Arid,387.110
1,AR-SLu,2010,2,201002,FLUXNET,MF,-33.46480,-66.45980,26.673,14.369,...,0.000000,0.43,0.9,309.86,292.96,OSH,SH,BSk,Arid,387.675
2,AR-SLu,2010,3,201003,FLUXNET,MF,-33.46480,-66.45980,25.744,15.167,...,0.000000,0.41,0.8,309.18,290.52,OSH,SH,BSk,Arid,388.195
3,AR-SLu,2010,4,201004,FLUXNET,MF,-33.46480,-66.45980,18.450,9.185,...,0.000000,0.36,0.5,303.24,286.34,OSH,SH,BSk,Arid,388.905
4,AR-SLu,2010,5,201005,FLUXNET,MF,-33.46480,-66.45980,13.493,5.823,...,0.000000,0.37,0.5,296.20,277.82,OSH,SH,BSk,Arid,389.320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19010,SE-Lnn,2018,8,201808,ICOS2018,CRO,58.34063,13.10177,16.767,5.483,...,0.000000,0.28,0.5,296.66,282.84,CRO,CRO,Dfb,Cold,406.525
19011,SE-Lnn,2018,9,201809,ICOS2018,CRO,58.34063,13.10177,11.719,2.344,...,0.000000,0.43,0.7,290.18,280.78,CRO,CRO,Dfb,Cold,405.985
19012,SE-Lnn,2018,10,201810,ICOS2018,CRO,58.34063,13.10177,7.827,1.575,...,0.000000,0.54,1.0,284.16,276.10,CRO,CRO,Dfb,Cold,406.280
19013,SE-Lnn,2018,11,201811,ICOS2018,CRO,58.34063,13.10177,3.794,0.563,...,0.800000,,,277.06,270.88,CRO,CRO,Dfb,Cold,407.320


In [78]:
# Original columns
print(len(monthly_df.columns))
monthly_df.columns

62


Index(['SITE_ID', 'year', 'month', 'TIMESTAMP', 'dataset', 'SITE_IGBP',
       'LOCATION_LAT', 'LOCATION_LONG', 'TA_F', 'VPD_F', 'P_F', 'NETRAD',
       'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
       'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
       'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF',
       'RECO_DT_CUT_REF', 'time', 'ET', 'BESS-PAR', 'BESS-PARdiff',
       'BESS-RSDN', 'CSIF-SIFdaily', 'CSIF-SIFinst', 'PET', 'Ts', 'Tmean',
       'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm', 'MODIS_LC', 'b1', 'b2', 'b3',
       'b4', 'b5', 'b6', 'b7', 'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
       'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP',
       'MODIS_PFT', 'koppen_sub', 'koppen', 'CO2_concentration'],
      dtype='object')

## Check how much NA/None exists in each column

In [79]:
monthly_df.isna().sum()

SITE_ID              0
year                 0
month                0
TIMESTAMP            0
dataset              0
                    ..
MODIS_IGBP           0
MODIS_PFT            0
koppen_sub           0
koppen               0
CO2_concentration    0
Length: 62, dtype: int64

In [80]:
monthly_df_countNA = monthly_df.isna().sum()
monthly_df_countNA

SITE_ID              0
year                 0
month                0
TIMESTAMP            0
dataset              0
                    ..
MODIS_IGBP           0
MODIS_PFT            0
koppen_sub           0
koppen               0
CO2_concentration    0
Length: 62, dtype: int64

In [81]:
monthly_df_countNA = pd.DataFrame(monthly_df.isna().sum())

NA0_list = list(monthly_df_countNA[monthly_df_countNA[0] == 0].index)
NA_list = list(monthly_df_countNA[monthly_df_countNA[0] != 0].index)

In [82]:
NA_list

['P_F',
 'NETRAD',
 'RECO_DT_VUT_REF',
 'RECO_DT_CUT_REF',
 'ET',
 'CSIF-SIFdaily',
 'CSIF-SIFinst',
 'PET',
 'Ts',
 'Tmean',
 'prcp',
 'vpd',
 'prcp-lag3',
 'ESACCI-sm',
 'b1',
 'b2',
 'b3',
 'b4',
 'b5',
 'b6',
 'b7',
 'EVI',
 'GCI',
 'NDVI',
 'NDWI',
 'NIRv',
 'kNDVI',
 'Percent_Snow',
 'Fpar',
 'Lai']

In [83]:
len(monthly_df)

19015

In [84]:
monthly_df_countNA[monthly_df_countNA[0] != 0]

Unnamed: 0,0
P_F,353
NETRAD,2693
RECO_DT_VUT_REF,1
RECO_DT_CUT_REF,1
ET,32
CSIF-SIFdaily,121
CSIF-SIFinst,121
PET,101
Ts,101
Tmean,101


# Preprocess features

In [85]:
len(monthly_df.columns)

62

In [86]:
# Features used for this analysis

output_var = 'GPP_NT_VUT_REF'
# other options
output_related_var = ['NEE_VUT_REF', 'NEE_CUT_REF', 'GPP_NT_VUT_REF',
                      'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF','GPP_DT_CUT_REF',
                      'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 
                      'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF']

# predictor variables
pred_var_numeric = ['TA_F', 'VPD_F', 'P_F', 'NETRAD','ET',
                    'BESS-PAR', 'BESS-PARdiff','BESS-RSDN', 
                    'CSIF-SIFdaily', 'CSIF-SIFinst','PET', 'Ts', 'Tmean',
                    'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm',
                    'b1', 'b2', 'b3','b4', 'b5', 'b6', 'b7', 
                    'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
                    'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 
                    'CO2_concentration', 'LOCATION_LAT', 'LOCATION_LONG',
                    'year', 'month']

pred_var_categorical = ['SITE_IGBP','MODIS_LC', 'MODIS_IGBP','MODIS_PFT', 'koppen_sub', 'koppen']

other_var =  ['SITE_ID', 'TIMESTAMP', 'dataset', 'time']

qc_flags = ['NEE_VUT_REF_QC', 'NEE_CUT_REF_QC'] 

NA_list = [] # There is no feature whose values is zero

len(output_related_var + pred_var_numeric + pred_var_categorical 
    + other_var + qc_flags + NA_list)
# output_var is included in output_related_var

62

In [87]:
feature_list = pred_var_numeric + pred_var_categorical

### Predictor variables(candidates)

| feature_name      |                       source                       | definition                                                                                                                                                                              | var_type    |   |
|-------------------|:--------------------------------------------------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---|
| TA_F              | FLUXNET                                            | Air temperature, consolidated from TA_F_MDS and TA_ERA                                                                                                                                  | numeric     |   |
| VPD_F             | FLUXNET                                            | Vapor Pressure Deficit consolidated from VPD_F_MDS and VPD_ERA                                                                                                                          | numeric     |   |
| P_F               | FLUXNET                                            | Precipitation consolidated from P and P_ERA                                                                                                                                             | numeric     |   |
| NETRAD            | FLUXNET                                            | Net radiation                                                                                                                                                                           | numeric     |   |
| BESS-PAR          | BESS PAR                                           | Photosynthetic Active Radiation (PAR)                                                                                                                                                   | numeric     |   |
| BESS-PARdiff      | BESS PAR                                           | Diffuse PAR                                                                                                                                                                             | numeric     |   |
| BESS-RSDN         | BESS PAR                                           | Shortwave downwelling radiation                                                                                                                                                         | numeric     |   |
| CSIF-SIFdaily     | CSIF                                               | All-sky daily average SIF                                                                                                                                                               | numeric     |   |
| CSIF-SIFinst      | NA                                                 | NA                                                                                                                                                                                      | numeric     |   |
| PET               | ERA5-Land                                          | Potential ET                                                                                                                                                                            | numeric     |   |
| Ts                | ERA5-Land                                          | NA                                                                                                                                                                                      | numeric     |   |
| Tmean             | ERA5-Land                                          | Air temperature                                                                                                                                                                         | numeric     |   |
| prcp              | ERA5-Land                                          | Precipitation                                                                                                                                                                           | numeric     |   |
| vpd               | ERA5-Land                                          | Vapor pressure deficit                                                                                                                                                                  | numeric     |   |
| prcp-lag3         | ERA5-Land                                          | Precipitation 3-month lag                                                                                                                                                               | numeric     |   |
| ESACCI-sm         | ERA5-Land                                          | Soil moisture                                                                                                                                                                           | numeric     |   |
| MODIS_LC          | NA                                                 | MODIS land cover (MODIS LC)                                                                                                                                                             | categorical |   |
| b1                | MCD43C4                                            | Surface reflectance Band 1                                                                                                                                                              | numeric     |   |
| b2                | MCD43C4                                            | Surface reflectance Band 2 (nir)                                                                                                                                                        | numeric     |   |
| b3                | MCD43C4                                            | Surface reflectance Band 3 (blue)                                                                                                                                                       | numeric     |   |
| b4                | MCD43C4                                            | Surface reflectance Band 4 (green)                                                                                                                                                      | numeric     |   |
| b5                | MCD43C4                                            | Surface reflectance Band 5 (SWIR1)                                                                                                                                                      | numeric     |   |
| b6                | MCD43C4                                            | Surface reflectance Band 6 (SWIR2)                                                                                                                                                      | numeric     |   |
| b7                | MCD43C4                                            | Surface reflectance Band 7 (SWIR3)                                                                                                                                                      | numeric     |   |
| EVI               | MCD43C4                                            | Enhanced Vegetation Index (EVI)                                                                                                                                                         | numeric     |   |
| GCI               | MCD43C4                                            | CIGreen c                                                                                                                                                                               | numeric     |   |
| NDVI              | MCD43C4                                            | Normalized Difference Vegetation Index (NDVI)                                                                                                                                           | numeric     |   |
| NDWI              | MCD43C4                                            | Normalized Different Water Index (NDWI) b                                                                                                                                               | numeric     |   |
| NIRv              | MCD43C4                                            | NIRv d                                                                                                                                                                                  | numeric     |   |
| kNDVI             | MCD43C4                                            | kNDVI a                                                                                                                                                                                 | numeric     |   |
| Percent_Snow      | MCD43C4                                            | Percentage of snow cover                                                                                                                                                                | numeric     |   |
| Fpar              | MCD15A3H (after 2002/07) MOD15A2H (before 2002/07) | Fraction of photosynthetically active radiation (fPAR)                                                                                                                                  | numeric     |   |
| Lai               | MCD15A3H (after 2002/07) MOD15A2H (before 2002/07) | Surface reflectance Band 7 (SWIR3)                                                                                                                                                      | numeric     |   |
| LST_Day           | MYD11A1 (after 2002/07) MOD11A1 (before 2002/07)   | Daytime land surface temperature                                                                                                                                                        | numeric     |   |
| LST_Night         | MYD11A1 (after 2002/07) MOD11A1 (before 2002/07)   | Nighttime land surface temperature                                                                                                                                                      | numeric     |   |
| MODIS_IGBP        | NA                                                 | NA                                                                                                                                                                                      | categorical |   |
| MODIS_PFT         | NA                                                 | NA                                                                                                                                                                                      | categorical |   |
| koppen_sub        | NA                                                 | NA                                                                                                                                                                                      | categorical |   |
| koppen            | Koppen-Geiger                                      | Climate zone (one-hot encoding)                                                                                                                                                         | categorical |   |
| CO2_concentration | ESLR                                               | Atmospheric CO2 concentration                                                                                                                                                           | numeric     |   |

In [88]:
# Check if all the features are included in one of the four
total = output_related_var + pred_var_numeric + pred_var_categorical + other_var + qc_flags + NA_list
for i in monthly_df.columns:
  if i not in total:
    print(i)

## Add hemisphere to the df

In [89]:
def assign_hemisphere(x):
  if x >= 0:
    return "N"
  elif x < 0:
    return "S"

monthly_df["hemisphere"] = monthly_df["LOCATION_LAT"].apply(lambda x: assign_hemisphere(x))

---
*Following process use only Spark(N0 PANDAS)*

# Use parquet -> Spark df as preparation for global data

## DF to Parquet format
- Pandas is converted to parquet in order to make the following code usable with global data in AWS

In [90]:
type(monthly_df)

pandas.core.frame.DataFrame

### Method 1

In [91]:
!pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas>=1.5.0
  Using cached pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.0
    Uninstalling pandas-1.3.0:
      Successfully uninstalled pandas-1.3.0
Successfully installed pandas-1.5.3


In [92]:
monthly_df.to_parquet('monthly_df_pq', engine='fastparquet')

### Method 2

In [93]:
# Convert df to parquet
import pyarrow as pa
import pyarrow.parquet as pq

# Convert DataFrame to Apache Arrow Table
table = pa.Table.from_pandas(monthly_df)
# Second, write the table into parquet file say file_name.parquet
# Parquet with Brotli compression
pq.write_table(table, 'monthly_df_pq.parquet')

## parquet to Spark df 

In [94]:
# Parquet is read from 
# When running Spark in AWS through the access to Azure, update the location and file name 
monthly_sdf = spark.read.parquet('monthly_df_pq')

In [95]:
monthly_sdf.printSchema()

root
 |-- SITE_ID: string (nullable = true)
 |-- year: long (nullable = true)
 |-- month: long (nullable = true)
 |-- TIMESTAMP: long (nullable = true)
 |-- dataset: string (nullable = true)
 |-- SITE_IGBP: string (nullable = true)
 |-- LOCATION_LAT: double (nullable = true)
 |-- LOCATION_LONG: double (nullable = true)
 |-- TA_F: double (nullable = true)
 |-- VPD_F: double (nullable = true)
 |-- P_F: double (nullable = true)
 |-- NETRAD: double (nullable = true)
 |-- NEE_VUT_REF: double (nullable = true)
 |-- NEE_VUT_REF_QC: double (nullable = true)
 |-- NEE_CUT_REF: double (nullable = true)
 |-- NEE_CUT_REF_QC: double (nullable = true)
 |-- GPP_NT_VUT_REF: double (nullable = true)
 |-- GPP_DT_VUT_REF: double (nullable = true)
 |-- GPP_NT_CUT_REF: double (nullable = true)
 |-- GPP_DT_CUT_REF: double (nullable = true)
 |-- RECO_NT_VUT_REF: double (nullable = true)
 |-- RECO_DT_VUT_REF: double (nullable = true)
 |-- RECO_NT_CUT_REF: double (nullable = true)
 |-- RECO_DT_CUT_REF: double (

In [96]:
monthly_sdf.show(truncate=False)

+-------+----+-----+---------+-------+---------+------------+-------------+------+------+-----+-----------+-----------+--------------+-----------+--------------+--------------+--------------+--------------+--------------+---------------+---------------+---------------+---------------+--------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+--------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+----------+---------+----------+---------+-----------------+----------+
|SITE_ID|year|month|TIMESTAMP|dataset|SITE_IGBP|LOCATION_LAT|LOCATION_LONG|TA_F  |VPD_F |P_F  |NETRAD     |NEE_VUT_REF|NEE_VUT_REF_QC|NEE_CUT_REF|NEE_CUT_REF_QC|GPP_NT_VUT_REF|GPP_DT_VUT_REF|GPP_NT_CUT_REF|GPP_DT_CUT_REF|RECO_NT_VUT_REF|RECO_DT_VUT_REF|RECO_NT_CUT_REF|RECO_DT_CUT_REF|tim

In [97]:
display(monthly_sdf)

DataFrame[SITE_ID: string, year: bigint, month: bigint, TIMESTAMP: bigint, dataset: string, SITE_IGBP: string, LOCATION_LAT: double, LOCATION_LONG: double, TA_F: double, VPD_F: double, P_F: double, NETRAD: double, NEE_VUT_REF: double, NEE_VUT_REF_QC: double, NEE_CUT_REF: double, NEE_CUT_REF_QC: double, GPP_NT_VUT_REF: double, GPP_DT_VUT_REF: double, GPP_NT_CUT_REF: double, GPP_DT_CUT_REF: double, RECO_NT_VUT_REF: double, RECO_DT_VUT_REF: double, RECO_NT_CUT_REF: double, RECO_DT_CUT_REF: double, time: string, ET: double, BESS-PAR: bigint, BESS-PARdiff: bigint, BESS-RSDN: bigint, CSIF-SIFdaily: double, CSIF-SIFinst: double, PET: double, Ts: double, Tmean: double, prcp: double, vpd: double, prcp-lag3: double, ESACCI-sm: double, MODIS_LC: bigint, b1: double, b2: double, b3: double, b4: double, b5: double, b6: double, b7: double, EVI: double, GCI: double, NDVI: double, NDWI: double, NIRv: double, kNDVI: double, Percent_Snow: double, Fpar: double, Lai: double, LST_Day: double, LST_Night: dou

In [98]:
# size of dataset
print(f'monthly_sdf: {monthly_sdf.count()}')

monthly_sdf: 19015


In [99]:
print(type(monthly_sdf))

<class 'pyspark.sql.dataframe.DataFrame'>


In [100]:
# Features used for this analysis

output_var = 'GPP_NT_VUT_REF'
# other options
output_related_var = ['NEE_VUT_REF', 'NEE_CUT_REF', 'GPP_NT_VUT_REF',
                      'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF','GPP_DT_CUT_REF',
                      'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 
                      'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF']

# predictor variables
pred_var_numeric = ['TA_F', 'VPD_F', 'P_F', 'NETRAD','ET',
                    'BESS-PAR', 'BESS-PARdiff','BESS-RSDN', 
                    'CSIF-SIFdaily', 'CSIF-SIFinst','PET', 'Ts', 'Tmean',
                    'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm',
                    'b1', 'b2', 'b3','b4', 'b5', 'b6', 'b7', 
                    'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
                    'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 
                    'CO2_concentration', 'LOCATION_LAT', 'LOCATION_LONG',
                    'year', 'month']

pred_var_categorical = ['SITE_IGBP','MODIS_LC', 'MODIS_IGBP','MODIS_PFT', 
                        'koppen_sub', 'koppen']

other_var =  ['SITE_ID', 'year', 'month', 'TIMESTAMP', 'dataset', 'time']

qc_flags = ['NEE_VUT_REF_QC', 'NEE_CUT_REF_QC'] 

NA_list = [] # There is no feature whose values is zero

len(output_related_var + pred_var_numeric + pred_var_categorical 
    + other_var + qc_flags + NA_list)
# output_var is included in output_related_var

64

In [101]:
# Features for RFRegressor 
feature_list = pred_var_numeric + pred_var_categorical # [output_var] is not included in the list

Make sure to add output variable `GPP_NT_VUT_REF` to the column

In [102]:
# Update Spark dataframe
monthly_rf = monthly_sdf.select([output_var] + feature_list)
monthly_rf.show()

+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|         b7|       EVI|      GCI|      NDVI|        NDWI|       NIRv|     kNDVI|Percent_Snow|Fpar|Lai|LST_Day|LST_Night|CO2_concentration|LOCATION_LAT|LOCATION_LONG|year|month|SITE_IGBP|MODIS_LC|MODIS_IGBP|MODIS_PFT|koppen_sub|   ko

Observe the number of missing values in each feature

In [103]:
# counts the number of None in each feature
from pyspark.sql.functions import isnull, when, count, col
monthly_rf.select([count(when(isnull(c), c)).alias(c) for c in monthly_rf.columns]).show()

+--------------+----+-----+---+------+---+--------+------------+---------+-------------+------------+---+---+-----+----+---+---------+---------+---+---+---+---+---+---+---+----+---+----+----+----+-----+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+------+
|GPP_NT_VUT_REF|TA_F|VPD_F|P_F|NETRAD| ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|PET| Ts|Tmean|prcp|vpd|prcp-lag3|ESACCI-sm| b1| b2| b3| b4| b5| b6| b7| EVI|GCI|NDVI|NDWI|NIRv|kNDVI|Percent_Snow|Fpar|Lai|LST_Day|LST_Night|CO2_concentration|LOCATION_LAT|LOCATION_LONG|year|month|SITE_IGBP|MODIS_LC|MODIS_IGBP|MODIS_PFT|koppen_sub|koppen|
+--------------+----+-----+---+------+---+--------+------------+---------+-------------+------------+---+---+-----+----+---+---------+---------+---+---+---+---+---+---+---+----+---+----+----+----+-----+------------+----+---+-------+---------+-----------------+------------+-------------

In [104]:
# Original number of observations
monthly_rf.count()

19015

In [105]:
# Drop row that contains NA
monthly_rf_NAdrop = monthly_rf.na.drop("any")
monthly_rf_NAdrop.count()

12937

Use `monthly_rf_NAdrop` in the later phase

In [106]:
monthly_rf_NAdrop.show()

+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|         b7|       EVI|      GCI|      NDVI|        NDWI|       NIRv|     kNDVI|Percent_Snow|Fpar|Lai|LST_Day|LST_Night|CO2_concentration|LOCATION_LAT|LOCATION_LONG|year|month|SITE_IGBP|MODIS_LC|MODIS_IGBP|MODIS_PFT|koppen_sub|   ko

# RF feature selection with pyspark
- In modeling, train-test split(+cross validation) needs to be implemented additionally

In [107]:
# !pip install pyspark

In [108]:
from pyspark.mllib.stat import Statistics
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Import fro classifier
# from pyspark.ml.evaluation import BinaryClassificationEvaluator
# from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Documentation for RFRegressor in Spark
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.regression.RandomForestRegressor.html

## Convert categorical variable to numeric
- For monthly dataset, we need to go through the steps because of the several categorical variables

### StringIndexer

- Convert string categorical variables to numeric variables

In [118]:
# Categorical features in monthly dataset
pred_var_categorical

['SITE_IGBP', 'MODIS_LC', 'MODIS_IGBP', 'MODIS_PFT', 'koppen_sub', 'koppen']

In [119]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
 
categoricalCols = pred_var_categorical
 
# The following two lines are estimators. They return functions that we will later apply to transform the dataset.
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=[x + "_Index" for x in categoricalCols]) 
encoder = OneHotEncoder(inputCols=stringIndexer.getOutputCols(), outputCols=[x + "_OHE" for x in categoricalCols]) 
 
# Example
# If the label column ("income") is also a string value - let's say it has two possible values, "<=50K" and ">50K". 
# Convert it to a numeric value using StringIndexer.
# labelToIndex = StringIndexer(inputCol="income", outputCol="label")

For Categorical variables, new column XXX_Index is added and numerical values are assigned to each category.

In [120]:
# This case we run code for feature importance, not modeling /prediction 
# Therefore full dataset is provided as input
stringIndexerModel = stringIndexer.fit(monthly_rf_NAdrop)
monthly_rf_NAdrop_STI = stringIndexerModel.transform(monthly_rf_NAdrop) # STI = StringIndexer

# Typically input data is train set/holdout set
    # stringIndexerModel = stringIndexer.fit(train0)
    # train0_STI = stringIndexerModel.transform(train0) # STI = StringIndexer
    # display(train0_STI)

  # same operation for holdout data
    # stringIndexerModel = stringIndexer.fit(holdout0)
    # holdout0_STI = stringIndexerModel.transform(holdout0) # STI = StringIndexer
    # display(holdout0_STI)

In [121]:
monthly_rf_NAdrop_STI.show()

+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+---------------+--------------+----------------+---------------+----------------+------------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|         b7|       EVI|      GCI|      NDVI|        NDWI|       NIRv|     kNDVI|Percent_Snow|Fpar|Lai|LST_Day|LST_Night|CO2_concentration|

### OneHotEncoder

Transform the string values by `stringIndexer.fit` `stringIndexer.transform` first, and then use OneHotEncoder.

This link is helpful to understand the structure of column generated by `OneHotEncoder`
https://www.skytowner.com/explore/one_hot_encoding_in_pyspark

In [122]:

# 'MODIS_LC_Index' -> 'MODIS_LC_OHE' 
# 'MODIS_IGBP_Index' -> 'MODIS_IGBP_OHE' 
# 'MODIS_PFT_Index' -> 'MODIS_PFT_OHE'
# 'koppen_sub_Index' -> 'koppen_sub_OHE'
# 'koppen_Index' -> 'koppen_OHE'

# OHE = OneHotEncoding
encoder = OneHotEncoder(inputCols=['SITE_IGBP_Index', 'MODIS_LC_Index',
                                   'MODIS_IGBP_Index', 'MODIS_PFT_Index',
                                   'koppen_sub_Index', 'koppen_Index'],
                        outputCols=['SITE_IGBP_OHE', 'MODIS_LC_OHE',
                                    'MODIS_IGBP_OHE', 'MODIS_PFT_OHE',
                                    'koppen_sub_OHE', 'koppen_OHE'])
model = encoder.fit(monthly_rf_NAdrop_STI)
monthly_rf_NAdrop_OHE = model.transform(monthly_rf_NAdrop_STI)
monthly_rf_NAdrop_OHE.show()

# model = encoder.fit(holdout0_STI)
# holdout0_OHE = model.transform(holdout0_STI)
# display(holdout0_OHE)

+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+---------------+--------------+----------------+---------------+----------------+------------+--------------+--------------+--------------+-------------+--------------+-------------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|         b7|       EVI|      GCI|      NDVI|      

## Assemble features

### Update feature list(If categorical vairable exists)

In [127]:
# New feature list with OHE variables

# Keep original list for numeric features
pred_var_numeric = ['TA_F', 'VPD_F', 'P_F', 'NETRAD','ET',
                    'BESS-PAR', 'BESS-PARdiff','BESS-RSDN', 
                    'CSIF-SIFdaily', 'CSIF-SIFinst','PET', 'Ts', 'Tmean',
                    'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm',
                    'b1', 'b2', 'b3','b4', 'b5', 'b6', 'b7', 
                    'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
                    'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 
                    'CO2_concentration', 'LOCATION_LAT', 'LOCATION_LONG',
                    'year', 'month']

# Update variable names of categorical features
# Original: pred_var_categorical = ['SITE_IGBP','MODIS_LC', 'MODIS_IGBP','MODIS_PFT', 'koppen_sub', 'koppen']
pred_var_categorical_OHE = ['SITE_IGBP_OHE','MODIS_LC_OHE', 'MODIS_IGBP_OHE','MODIS_PFT_OHE',
                        'koppen_sub_OHE', 'koppen_OHE']

feature_list = pred_var_numeric + pred_var_categorical_OHE


In [128]:
# Updated feature list
feature_list

['TA_F',
 'VPD_F',
 'P_F',
 'NETRAD',
 'ET',
 'BESS-PAR',
 'BESS-PARdiff',
 'BESS-RSDN',
 'CSIF-SIFdaily',
 'CSIF-SIFinst',
 'PET',
 'Ts',
 'Tmean',
 'prcp',
 'vpd',
 'prcp-lag3',
 'ESACCI-sm',
 'b1',
 'b2',
 'b3',
 'b4',
 'b5',
 'b6',
 'b7',
 'EVI',
 'GCI',
 'NDVI',
 'NDWI',
 'NIRv',
 'kNDVI',
 'Percent_Snow',
 'Fpar',
 'Lai',
 'LST_Day',
 'LST_Night',
 'CO2_concentration',
 'LOCATION_LAT',
 'LOCATION_LONG',
 'year',
 'month',
 'SITE_IGBP_OHE',
 'MODIS_LC_OHE',
 'MODIS_IGBP_OHE',
 'MODIS_PFT_OHE',
 'koppen_sub_OHE',
 'koppen_OHE']

In [129]:
# Assemble features
assembler = VectorAssembler().setInputCols(feature_list)\
                            .setOutputCol('vectorized_features')

In [130]:
# Usually the input is train set/ holdout set
# Full dataset is used here because the goal of this process is to apply feture importance
# Example code
  # assmb_train0 = assembler.transform(train0)#train0_OHE)
  # assmb_holdout0 = assembler.transform(holdout0)#_OHE)

assmb_sdf_NAdrop_OHE = assembler.transform(monthly_rf_NAdrop_OHE)


In [131]:
# vectorized features are added to the lest side of table
assmb_sdf_NAdrop_OHE.show()

+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+---------------+--------------+----------------+---------------+----------------+------------+--------------+--------------+--------------+-------------+--------------+-------------+--------------------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|         b7|       EVI|      

## Normalize features

In [132]:
# normalize dataset
scaler = MinMaxScaler().setInputCol('vectorized_features')\
                            .setOutputCol('normalized_features')

scaler_sdf_NAdrop_model = scaler.fit(assmb_sdf_NAdrop_OHE)
normed_sdf_NAdrop = scaler_sdf_NAdrop_model.transform(assmb_sdf_NAdrop_OHE)

# Same as the last section, usually input is train set/holdout set
# Example
    # normed_train0 = scaler_model0.transform(assmb_train0)
    # normed_holdout0 = scaler_model0.transform(assmb_holdout0)

In [133]:
# Column 'normalized features' is added
normed_sdf_NAdrop.show()


+--------------+------+------+-----+-----------+---------+--------+------------+---------+-------------+------------+------------+---------+---------+-----------+----------+-----------+----------+-----------+----------+-----------+-----------+----------+----------+-----------+----------+---------+----------+------------+-----------+----------+------------+----+---+-------+---------+-----------------+------------+-------------+----+-----+---------+--------+----------+---------+----------+---------+---------------+--------------+----------------+---------------+----------------+------------+--------------+--------------+--------------+-------------+--------------+-------------+--------------------+--------------------+
|GPP_NT_VUT_REF|  TA_F| VPD_F|  P_F|     NETRAD|       ET|BESS-PAR|BESS-PARdiff|BESS-RSDN|CSIF-SIFdaily|CSIF-SIFinst|         PET|       Ts|    Tmean|       prcp|       vpd|  prcp-lag3| ESACCI-sm|         b1|        b2|         b3|         b4|        b5|        b6|        

## Fit model

### Random Forest Regressor

In [134]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(featuresCol = 'normalized_features', labelCol = 'GPP_NT_VUT_REF')
rfModel = rf.fit(normed_sdf_NAdrop)
# rfModel = rf.fit(normed_train0)
# predictions = rfModel.transform(normed_holdout0)

## Feature Importance

In [135]:
feature_importance = rfModel.featureImportances

Feature importance is obtained by each one-hot-encoded feature
Ex. if feature A has four features, it is counted as four independent features(A-1, A-2, A-3, A-4)

In [136]:
# Result of 108 features are obtained
feature_importance

SparseVector(108, {0: 0.001, 2: 0.0003, 3: 0.0331, 4: 0.0191, 5: 0.0029, 6: 0.0177, 7: 0.005, 8: 0.1802, 9: 0.2188, 10: 0.0019, 11: 0.0029, 12: 0.0016, 13: 0.0003, 14: 0.0018, 15: 0.0002, 17: 0.0002, 18: 0.0074, 19: 0.0005, 20: 0.0005, 21: 0.0013, 22: 0.0013, 23: 0.0007, 24: 0.032, 25: 0.0029, 26: 0.0006, 27: 0.0049, 28: 0.0849, 29: 0.057, 31: 0.1227, 32: 0.1539, 33: 0.0025, 34: 0.0023, 35: 0.0003, 36: 0.0045, 37: 0.007, 38: 0.0001, 39: 0.0005, 40: 0.0013, 42: 0.0023, 43: 0.0001, 44: 0.0021, 50: 0.0005, 51: 0.0023, 55: 0.0008, 59: 0.0005, 62: 0.0003, 63: 0.0005, 64: 0.0008, 67: 0.0005, 68: 0.001, 72: 0.0023, 75: 0.0001, 77: 0.0009, 79: 0.0003, 84: 0.0002, 85: 0.0008, 86: 0.0002, 87: 0.0001, 88: 0.005, 90: 0.0002, 104: 0.0007, 105: 0.0001, 106: 0.0001, 107: 0.0009})

### Details of features

In [146]:
pred_var_numeric = ['TA_F', 'VPD_F', 'P_F', 'NETRAD','ET',
                    'BESS-PAR', 'BESS-PARdiff','BESS-RSDN', 
                    'CSIF-SIFdaily', 'CSIF-SIFinst','PET', 'Ts', 'Tmean',
                    'prcp', 'vpd', 'prcp-lag3', 'ESACCI-sm',
                    'b1', 'b2', 'b3','b4', 'b5', 'b6', 'b7', 
                    'EVI', 'GCI', 'NDVI', 'NDWI', 'NIRv', 'kNDVI',
                    'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 
                    'CO2_concentration', 'LOCATION_LAT', 'LOCATION_LONG',
                    'year', 'month']
print(f'Numeric features: {len(pred_var_numeric)}')

# pred_var_categorical_OHE = [`SITE_IGBP`, 'MODIS_LC_OHE', 'MODIS_IGBP_OHE','MODIS_PFT_OHE',
#                         'koppen_sub_OHE', 'koppen_OHE']

# Name one-hot-encoded categorical features as follows
SITE_IGBP = ['SITE_IGBP_' + str(i+1) for i in range(10)]
MODIS_LC = ['MODIS_LC_' + str(i+1) for i in range(13)]
MODIS_IGBP = ['MODIS_IGBP_' + str(i+1) for i in range(13)]
MODIS_PFT = ['MODIS_PFT_' + str(i+1) for i in range(8)]
koppen_sub = ['koppen_sub_' + str(i+1) for i in range(20)]
koppen = ['koppen_' + str(i+1) for i in range(4)]

feature_list_OHE = pred_var_numeric + SITE_IGBP + MODIS_LC + MODIS_IGBP + MODIS_PFT + koppen_sub + koppen

print(f'Numeric + categorical features: {len(feature_list_OHE)}')

Numeric features: 40
Numeric + categorical features: 108


In [147]:
feature_importances_list = [feature_list_OHE]
feature_importances_list += [list(feature_importance)]
feature_importances_df = pd.DataFrame(feature_importances_list).T

In [148]:
feature_importances_df.columns = ["feature_name", "importance"]
feature_importances_df

Unnamed: 0,feature_name,importance
0,TA_F,0.001017
1,VPD_F,0.0
2,P_F,0.00028
3,NETRAD,0.03309
4,ET,0.019118
...,...,...
103,koppen_sub_20,0.0
104,koppen_1,0.000685
105,koppen_2,0.000051
106,koppen_3,0.000144


In [149]:
!pwd

/content/drive/MyDrive


In [150]:
!pip install pandas==1.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [151]:
feature_importances_df.to_csv("feature_importances_df_monthly.csv")