# Notebook Setup

In [1]:
# True when the notebook kernel is a Google Colab runtime, False otherwise.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Colab alternative:
#MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"

In [2]:
# In Colab, mount Google Drive so that paths under /content/drive/ can be read.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
os.chdir(MY_HOME_ABS_PATH) # <------------------ ADDED
import math
import json

import pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO
from sklearn.impute import KNNImputer # <----------- ADDED
from tqdm import tqdm # <----------- ADDED

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
# Make ./code/src/tools importable so the project modules imported below resolve.
import sys
if IN_COLLAB:
  # The relative tools path is resolved against the project home, so switch
  # to it first; in Colab the tools directory is put at the front of sys.path.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  # Outside Colab the tools directory is appended to the end of sys.path.
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook-wide pandas display settings: show up to 500 rows, never truncate
# columns, and render floats with five decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
pd.options.display.float_format = lambda x: '%.5f' % x

In [5]:
# # Import SparkSession
# from pyspark.sql import SparkSession
# # Create a Spark Session
# spark = SparkSession.builder.master("local[*]").config(
#     "spark.jars.packages", 
#     "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6"
#     ).getOrCreate()
# # Check Spark Session Information
# spark

# Define Constants

In [6]:
# Directory layout, all rooted at the configured project home.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw half-hourly files are read from Google Drive in Colab and from the
# local temp directory otherwise.
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
  if IN_COLLAB
  else tmp_dir
)

# Input tables kept in the project's data directory.
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
monthly_data_filename = os.sep.join([data_dir, 'monthly-interpolated.csv'])

In [7]:
# Blob storage naming: container, file extension and dataset version, plus
# the version-stamped base names for the full, train and test datasets.
container = "baseline-data"
ext = "parquet"
ver = "1-i"
blob_name_base = "baseline_all_v_" + ver
train_blob_name_base, test_blob_name_base = (
    "baseline-{}-v-{}".format(split, ver) for split in ("train", "test")
)

In [8]:
# # "Golden" Sites
# tier1_sites = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
# tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

# train_sites = tier1_sites + tier2_sites

# # Selected Test Sites
# test_sites = ["US-GLE", # ENF, Cold
#               "US-AR1", # GRA, Temperate
#               "US-Seg", # GRA, Arid
#               "US-FR2", # WSA, Temperate
#               "ES-LM2", # WSA, Arid
#               "CA-Cbo", # DBF, Cold
#               "FR-Lam", # CRO, Temperate
#               "IT-Cpz", # EBF, Temperate
#               "CN-Cha", # MF Cold
#               "IT-Lsn", # OSH, Temperate
#               ]

# Development subset: the training list was reduced to two sites (by John)
# for dev; the full site lists are kept commented out above.
train_sites = [
    "CN-HaM",
    "AR-SLu",
]
test_sites = [
    "US-Wi5",
    "US-Wi6",
]

In [9]:
# Feature columns read from each half-hourly file, followed by the target
# column and the quality-control column used to filter it.
included_features = (
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']   # *_ERA columns
    + ['datetime', 'year', 'month', 'day', 'hour', 'date']               # timestamp parts
    + ['EVI', 'NDVI', 'NIRv'] + [f'b{band}' for band in range(1, 8)]     # EVI/NDVI/NIRv and b1..b7
    + ['IGBP', 'koppen']                                                 # class labels
)
target_variable_qc, target_variable = 'NEE_VUT_REF_QC', 'GPP_NT_VUT_REF'

# Get Gold Sample Site Data

In [10]:
# Site metadata columns to load ('elevation' is intentionally left out).
included_site_features = [
    'site_id', 'filename', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name',
    'c3c4', 'c4_percent',
]

# Read the metadata, keep only the train/test sites and renumber the rows.
site_metadata_df = (
    pd.read_csv(site_metadata_filename, usecols=included_site_features)
      .loc[lambda meta: meta['site_id'].isin(train_sites + test_sites)]
      .reset_index(drop=True)
)
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(4, 9)


Unnamed: 0,site_id,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,AR-SLu,-33.4648,-66.4598,7,2,BSk,C3,67.08,data_full_half_hourly_raw_v0_1_AR-SLu.csv
1,CN-HaM,37.37,101.18,29,5,ET,C3,1.07,data_full_half_hourly_raw_v0_1_CN-HaM.csv
2,US-Wi5,46.65308,-91.08581,26,4,Dfb,C3,4.14,data_full_half_hourly_raw_v0_1_US-Wi5.csv
3,US-Wi6,46.62489,-91.29822,26,4,Dfb,C3,4.14,data_full_half_hourly_raw_v0_1_US-Wi6.csv


# Get Monthly Data

In [11]:
# Read the monthly table, keep only the train/test sites and renumber the rows.
monthly_df = (
    pd.read_csv(monthly_data_filename)
      .loc[lambda monthly: monthly['SITE_ID'].isin(train_sites + test_sites)]
      .reset_index(drop=True)
)
print(f"size:{monthly_df.shape}")

# Store the calendar keys and the MODIS land-cover code as integers.
monthly_df = monthly_df.astype(
    {col: 'int' for col in ['year', 'month', 'TIMESTAMP', 'MODIS_LC']}
)
monthly_df.head()

size:(50, 20)


Unnamed: 0,date,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,2010-01-01,AR-SLu,2010,1,201001,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH
1,2010-02-01,AR-SLu,2010,2,201002,120.0,46.0,258.0,0.14553,-0.00894,298.78864,0.16656,7,0.0004,0.0,0.43,0.9,309.86,292.96,SH
2,2010-03-01,AR-SLu,2010,3,201003,107.0,31.0,231.0,0.1098,-0.00813,297.54816,0.16408,7,-0.02286,0.0,0.41,0.8,309.18,290.52,SH
3,2010-04-01,AR-SLu,2010,4,201004,81.0,27.0,175.0,0.07673,-0.00676,291.69604,0.12402,7,-0.04202,0.0,0.36,0.5,303.24,286.34,SH
4,2010-05-01,AR-SLu,2010,5,201005,56.0,19.0,122.0,0.06602,-0.00473,287.05652,0.14273,7,-0.01064,0.0,0.37,0.5,296.2,277.82,SH


# Stage 1: Trim and Merge Site Metadata

All available features from the half-hourly data:
```
'TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
'b7', 'IGBP', 'koppen'
``` 

In [12]:
def data_cleanup_new(data_dir, site_id_file_df, target, target_qc, features):
  """Load, trim and stack the half-hourly CSV file of every listed site.

  For each row of `site_id_file_df`, the file `<data_dir>/<filename>` is read
  (only the `target`, `target_qc` and `features` columns) and then:
    * `datetime` and `date` are parsed and a `minute` column is derived,
    * any entry of `features` whose name contains "_QC" is cast to int,
    * a `site_id` column is added,
    * rows with `SW_IN_ERA` <= 0, a missing `target`, or `target_qc` == 3
      are removed, and the `target_qc` column is dropped.
  Rows with missing feature values are deliberately kept (no blanket dropna).

  Args:
    data_dir: directory containing the per-site CSV files.
    site_id_file_df: DataFrame with `site_id` and `filename` columns; rows
      whose filename is empty or not a string are reported and skipped.
    target: name of the target column.
    target_qc: name of the quality flag column used to filter the target.
    features: feature columns to load; the code requires 'datetime', 'date'
      and 'SW_IN_ERA' to be among them.

  Returns:
    The per-site frames stacked into one DataFrame (each site keeps its own
    row labels, so labels can repeat across sites), or None when no site
    could be loaded.
  """
  qc_flags_features = [s for s in features if "_QC" in s]
  site_frames = []

  # Iterate through each site:
  for _, r in site_id_file_df.iterrows():
    if not isinstance(r.filename, str) or not r.filename:
      print(f'\nERROR: {r.site_id} is mssing hourly data.')
      continue

    # Get only `features` (plus target and its QC flag) from file
    local_filename = data_dir + os.sep + r.filename
    site_df = pd.read_csv(local_filename, usecols = [target, target_qc] + features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df['minute'] = site_df['datetime'].dt.minute
    if qc_flags_features:
      site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
    site_df['site_id'] = r.site_id

    # Rows to discard: zero or negative SW (<--- RETHINK LATER), missing
    # target, or a target QC flag of 3. The mask is phrased as "drop when
    # the condition is True" so rows where a comparison is undefined (NaN)
    # are kept.
    drop_mask = (
        (site_df['SW_IN_ERA'] <= 0)
        | site_df[target].isna()
        | (site_df[target_qc] == 3)
    )
    site_df = site_df.loc[~drop_mask].drop(columns=[target_qc])

    print(f"{r.site_id}: {site_df.shape}")
    site_frames.append(site_df)

  # Concatenate once at the end; growing a frame inside the loop re-copies
  # all previously loaded rows for every additional site.
  return pd.concat(site_frames) if site_frames else None

In [39]:
# Stage 1a: load and clean the raw half-hourly data for the selected sites.
data_df = data_cleanup_new(
    raw_data_dir,
    site_metadata_df[['site_id', 'filename']],
    target_variable,
    target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# Stage 1b: attach the site metadata (minus the columns not wanted downstream).
data_df = merge_site_metadata(
    data_df,
    site_metadata_df.drop(columns=['filename', 'koppen_main', 'koppen_name']),
)
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Stage 1c: attach the monthly data on (site, year, month).
# Inner join because the monthly data is not available for every site
# (present for 243 out of ~280), so sites without it are dropped here.
data_df = (
    data_df
    .merge(
        monthly_df.drop(columns=['date', 'TIMESTAMP']),
        how='inner',
        left_on=['site_id', 'year', 'month'],
        right_on=['SITE_ID', 'year', 'month'],
    )
    .drop(columns='SITE_ID')
)
print(f"Data size after after merged with monthly data: {data_df.shape}")

# Rows with NA are no longer dropped here (check_and_drop_na was removed),
# so this size equals the one printed just above.
print(f"Data size after after final drop: {data_df.shape}")

# Put the target first, followed by every other column in its current order.
features = [col for col in data_df.columns if col != target_variable]
data_df = data_df[[target_variable] + features]

display(data_df.head())

AR-SLu: (22099, 27)
CN-HaM: (50613, 27)
US-Wi5: (11374, 27)
US-Wi6: (9748, 27)
Data size after cleanup: (93834, 27)
Data size after after merged with site metadata: (93834, 32)
Data size after after merged with monthly data: (72472, 47)
Data size after after final drop: (72472, 47)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT
0,-0.13057,24.008,0.0,356.231,8.305,0.321,94.956,2010-01-01 00:00:00,2010,1,1,0,2010-01-01,0.28274,0.52839,0.11593,0.0677,0.2194,0.0379,0.0679,0.2836,0.2441,0.1478,MF,Arid,0,AR-SLu,-33.4648,-66.4598,7,C3,67.08,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH
1,-0.58623,23.773,0.0,356.231,7.96,0.321,94.959,2010-01-01 00:30:00,2010,1,1,0,2010-01-01,0.28274,0.52839,0.11593,0.0677,0.2194,0.0379,0.0679,0.2836,0.2441,0.1478,MF,Arid,30,AR-SLu,-33.4648,-66.4598,7,C3,67.08,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH
2,-0.66336,23.539,0.0,356.231,7.616,0.0,94.963,2010-01-01 01:00:00,2010,1,1,1,2010-01-01,0.28274,0.52839,0.11593,0.0677,0.2194,0.0379,0.0679,0.2836,0.2441,0.1478,MF,Arid,0,AR-SLu,-33.4648,-66.4598,7,C3,67.08,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH
3,-0.64809,23.304,0.0,356.231,7.272,0.0,94.966,2010-01-01 01:30:00,2010,1,1,1,2010-01-01,0.28274,0.52839,0.11593,0.0677,0.2194,0.0379,0.0679,0.2836,0.2441,0.1478,MF,Arid,30,AR-SLu,-33.4648,-66.4598,7,C3,67.08,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH
4,0.93336,23.069,0.0,356.231,6.927,0.0,94.97,2010-01-01 02:00:00,2010,1,1,2,2010-01-01,0.28274,0.52839,0.11593,0.0677,0.2194,0.0379,0.0679,0.2836,0.2441,0.1478,MF,Arid,0,AR-SLu,-33.4648,-66.4598,7,C3,67.08,154.0,40.0,336.0,0.20432,-0.01339,302.46967,0.15152,7,0.03542,0.0,0.49,1.2,313.84,293.58,SH


## Impute Values

In [47]:
# using only train_df for example -> would be nice to impute before splitting in the end
#df_init = data_transformer.train_df.copy()

# Columns handed to the KNN imputer: every int/float column of data_df,
# except the date/time helpers, the site key and the site-metadata columns
# (only real-valued inputs are used, as the categorical ones are the same per site).
knn_exclude_cols = ['date', 'datetime', 'day', 'minute', 'site_id']
knn_exclude_cols.extend(site_metadata_df.columns)
knn_imp_real = [
    col
    for col in data_df.select_dtypes(include=['int', 'float']).columns
    if col not in knn_exclude_cols
]
print(knn_imp_real)


def knn_impute(df, imp_cols, group_col='site_id', k=5, weights='uniform'):
    """Fill missing values in `imp_cols` with a KNN imputer fitted per group.

    The rows of `df` are grouped by `group_col`; for each group a separate
    `KNNImputer` is fitted on that group's `imp_cols` and only the cells that
    were missing are filled. Values that were present are never modified.

    Args:
        df: input DataFrame. It is NOT modified; a filled copy is returned, so
            the caller can still compare the data before and after imputation.
            Row labels are used to write results back, so they are assumed to
            be unique -- TODO confirm for frames that were concatenated
            without resetting the index.
        imp_cols: columns used as imputer inputs (and candidates for filling).
        group_col: column whose values define the groups.
        k: number of neighbours passed to `KNNImputer(n_neighbors=...)`.
        weights: weighting passed to `KNNImputer(weights=...)`.

    Returns:
        A copy of `df` with the missing `imp_cols` cells filled. Columns that
        are entirely missing within a group cannot be imputed from that group
        and are left missing for its rows.
    """
    imputed_df = df.copy()

    # Group the untouched input so that writes into `imputed_df` can never
    # influence the groups still to be processed.
    for _, group_data in tqdm(df.groupby(group_col)):
        # A column that is all-NaN within the group is excluded up front:
        # the imputer has nothing to learn from it, and keeping it would make
        # the imputer output narrower than the input column list.
        group_knn_df = group_data[imp_cols].dropna(axis=1, how='all')
        na_cols = [c for c in group_knn_df.columns if group_knn_df[c].isna().any()]
        if not na_cols:
            # Nothing to fill for this group; skip the expensive KNN fit.
            continue

        imputer = KNNImputer(n_neighbors=k, weights=weights)
        imputed_group = pd.DataFrame(
            imputer.fit_transform(group_knn_df),
            columns=group_knn_df.columns,
            index=group_knn_df.index,
        )

        # Only cells that were NaN take the imputed value; restricting the
        # write-back to the affected columns leaves all other columns (and
        # their dtypes) exactly as they were.
        imputed_df.loc[group_data.index, na_cols] = (
            group_data[na_cols].fillna(imputed_group[na_cols])
        )

    return imputed_df


# Impute the merged data, grouped by the default `site_id` column, with k=5
# and uniform neighbour weights; weights='distance' can be used as well.
df_imputed = knn_impute(data_df, knn_imp_real, k=5, weights='uniform')

['GPP_NT_VUT_REF', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night']


100%|██████████| 2/2 [00:00<00:00, 12.18it/s]


### Check Imputation Work

In [48]:
## Compare data_df (taken as the pre-imputation data) to df_imputed (filled NA values)
# NOTE(review): these checks are only meaningful if `data_df` still holds the
# pre-imputation values, i.e. if `knn_impute` did not modify its input in
# place -- confirm; otherwise snapshot `data_df` before imputing.
if df_imputed is data_df:
    print("WARNING: df_imputed is the same object as data_df; the comparisons below are trivially true.")
df_init = data_df.copy()

# confirm no NAs remain in new df
na_rows_post = df_imputed.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Rows that had no NA before must be identical afterwards. `drop_na.index`
# holds row labels, so the matching rows are selected by label (.loc);
# positional .iloc is only equivalent when the index is exactly 0..n-1.
drop_na = df_init.dropna(how='any')
drop_imp = df_imputed.loc[drop_na.index]
drop_na = drop_na.reset_index(drop=True)
drop_imp = drop_imp.reset_index(drop=True)
print(f"Are all rows with no NAs the same as before? {drop_na.equals(drop_imp)}")

# Spot-check the first 50 rows that initially had NA: after dropping the
# cells that were NA, the remaining (originally present) values must match.
na_inds = df_init.index[df_init.isna().any(axis=1)]
errors = 0
for ind in na_inds[:50]:
    check_ind = pd.concat([df_init.loc[ind], df_imputed.loc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)
print(f"Number of non-NA values changed by error: {errors}")

# DF length is the same 
print(f"DF is same length as before: {len(df_init) == len(df_imputed)}")

Number of NA rows post imputation: 0
Are all rows with no NAs the same as before? True
Number of non-NA values changed by error: 0
DF is same length as before: True


In [40]:
# Show the monthly rows available for one of the test sites.
print("ISSUE WHERE MONTHLY DATA DOESN'T HAVE EVERY SITE, SHOULD WE INNER JOIN?")
monthly_df[monthly_df['SITE_ID'].eq('US-Wi5')]

ISSUE WHERE MONTHLY DATA DOESN'T HAVE EVERY SITE, SHOULD WE INNER JOIN?


Unnamed: 0,date,SITE_ID,year,month,TIMESTAMP,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT


# CheckPoint: Upload Data to Azure Storage Blob as Parquet

In [14]:
# # Upload to Azure Storage Blob
# # ref: https://stackoverflow.com/a/54666079
# data_cleanup_checkpoint = True
# tag = "raw"
# blob_name = f"{blob_name_base}_{tag}.{ext}"

# if data_cleanup_checkpoint:

#   parquet_file = BytesIO()
#   data_df.to_parquet(parquet_file, engine='pyarrow')
#   parquet_file.seek(0)

#   azStorageClient = AzStorageClient(az_cred_file)
#   azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [15]:
# # Save no transformed data
# get_non_transform_train_test = True
# if get_non_transform_train_test:
#   data_transformer = TFTDataTransformer(train_sites, test_sites,
#                                         None, data_df)
#   train_df, test_df = data_transformer.get_test_train_raw()
#   print("Train data peak:")
#   display(train_df.head(5))
#   print("Test data peak:")
#   display(test_df.head(5))

#   train_blob_name= f"{train_blob_name_base}-{tag}.{ext}"
#   test_blob_name= f"{test_blob_name_base}-{tag}.{ext}"
#   data_transformer.upload_train_test_to_azure(az_cred_file, container,\
#                                               train_blob_name, test_blob_name)

# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalize data through minmax

In [16]:
# Switches: reload the stage-1 output from the stored checkpoint instead of
# using the in-memory `data_df`, and choose the transformer back-end.
load_data_from_previous_checkpoint = False
useSpark = False

raw_data_file_path = None
if load_data_from_previous_checkpoint:
  data_df = None
  # Name of the raw checkpoint blob. It is built here because the only other
  # definition lives in the (commented-out) upload cell, which would leave
  # `blob_name` undefined on a fresh run; the pattern matches that cell
  # (base name + "raw" tag + extension).
  blob_name = f"{blob_name_base}_raw.{ext}"
  raw_data_file_path = tmp_dir + os.sep + blob_name
  print(f"loading {raw_data_file_path}...")
  if not (os.path.exists(raw_data_file_path)):
      # No local copy yet: download the blob and cache it in the temp dir.
      os.makedirs(tmp_dir, exist_ok=True)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)
  
if useSpark:
  # NOTE(review): `spark` is only created in the commented-out SparkSession
  # cell; that cell must be re-enabled before setting useSpark = True.
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)
else:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                              raw_data_file_path, data_df)

Data size: (95274, 46).


In [17]:
# Column roles shared by both transformer back-ends.
timestamp_col, target_col = 'datetime', 'GPP_NT_VUT_REF'

if not useSpark:
  # TFT Data Transformer: takes categorical, real-valued and backup column
  # lists in addition to the timestamp and target columns.
  categorical_cols = [
      'IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
      'year', 'month', 'day', 'hour', 'minute',
      'MODIS_PFT', 'MODIS_LC',
  ]
  realNum_cols = [
      'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
      'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
      'lat', 'long', 'c4_percent',  # 'elevation' intentionally left out
      'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
      'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
      'LST_Day', 'LST_Night',
  ]
  backup_cols = ['IGBP', 'koppen', 'site_id']
  data_transformer.data_transform(
      categorical_cols, realNum_cols, backup_cols, timestamp_col, target_col
  )

  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  # Blob names for this back-end carry the file extension.
  train_blob_name = train_blob_name_base + "." + ext
  test_blob_name = test_blob_name_base + "." + ext

else:
  # Spark ML Data Transformer: takes only the categorical column list plus
  # the timestamp and target columns.
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'MODIS_PFT', 'MODIS_LC']
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  # Blob names for this back-end are the base names, without an extension.
  train_blob_name = f"{train_blob_name_base}"
  test_blob_name = f"{test_blob_name_base}"

Data size: (95274, 49).
Data size after encoding: (95274, 49)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
0,0.40979,23.235,0.0,333.482,10.7,0.0,95.214,2009-12-21 00:00:00,3,11,20,0,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,2,0,0,0,-33.4648,-66.4598,0,0,67.08,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
1,-0.26229,23.022,0.0,333.482,10.38,0.0,95.211,2009-12-21 00:30:00,3,11,20,0,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,2,0,1,0,-33.4648,-66.4598,0,0,67.08,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
2,-0.59009,22.81,0.0,333.482,10.059,0.0,95.208,2009-12-21 01:00:00,3,11,20,1,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,2,0,0,0,-33.4648,-66.4598,0,0,67.08,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
3,-0.54323,22.598,0.0,333.482,9.739,0.0,95.205,2009-12-21 01:30:00,3,11,20,1,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,2,0,1,0,-33.4648,-66.4598,0,0,67.08,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
4,-2.02626,22.385,0.0,333.482,9.418,0.0,95.202,2009-12-21 02:00:00,3,11,20,2,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,2,0,0,0,-33.4648,-66.4598,0,0,67.08,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu


Features(44): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'day', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'IGBP', 'koppen', 'minute', 'site_id', 'lat', 'long', 'koppen_sub', 'c3c4', 'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night', 'MODIS_PFT']
Train data size: (74152, 49).
Test data size: (21122, 49).
Normalizinf features (32): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'lat', 'long', 'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night']
Train data size: (74152, 49).
Test data size: (21122, 49).
Train data peak:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:,features] = scaler.transform(train_df[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:,features] = scaler.transform(test_df[features])


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
0,0.40979,1.28628,-0.72943,1.09416,0.66773,-0.20653,1.52273,2009-12-21 00:00:00,3,11,20,0,0.71305,0.6391,0.64848,-0.4062,-0.25674,-0.36887,-0.37343,0.05041,0.4373,0.26961,2,0,0,0,-1.53475,-1.53475,0,0,1.53475,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
1,-0.26229,1.27074,-0.72943,1.09416,0.62366,-0.20653,1.52247,2009-12-21 00:30:00,3,11,20,0,0.71305,0.6391,0.64848,-0.4062,-0.25674,-0.36887,-0.37343,0.05041,0.4373,0.26961,2,0,1,0,-1.53475,-1.53475,0,0,1.53475,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
2,-0.59009,1.25528,-0.72943,1.09416,0.57946,-0.20653,1.52222,2009-12-21 01:00:00,3,11,20,1,0.71305,0.6391,0.64848,-0.4062,-0.25674,-0.36887,-0.37343,0.05041,0.4373,0.26961,2,0,0,0,-1.53475,-1.53475,0,0,1.53475,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
3,-0.54323,1.23981,-0.72943,1.09416,0.5354,-0.20653,1.52197,2009-12-21 01:30:00,3,11,20,1,0.71305,0.6391,0.64848,-0.4062,-0.25674,-0.36887,-0.37343,0.05041,0.4373,0.26961,2,0,1,0,-1.53475,-1.53475,0,0,1.53475,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu
4,-2.02626,1.22427,-0.72943,1.09416,0.4912,-0.20653,1.52172,2009-12-21 02:00:00,3,11,20,2,0.71305,0.6391,0.64848,-0.4062,-0.25674,-0.36887,-0.37343,0.05041,0.4373,0.26961,2,0,0,0,-1.53475,-1.53475,0,0,1.53475,,,,,,,,2,,,,,,,2,MF,Arid,AR-SLu


Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,lat,long,koppen_sub,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_PFT,IGBP_name,koppen_name,site_id_name
74152,-0.39329,-0.3808,-0.72943,-0.12253,-0.71669,-0.20653,1.74701,2004-04-20 00:00:00,2,3,19,0,0.51689,0.71509,0.53316,-0.45146,-0.36605,-0.43805,-0.47622,-0.10967,-0.36103,0.09368,0,1,0,2,0.9381,-1.85591,1,0,-0.54989,,,,,,,,2,,,,,,,2,ENF,Cold,US-Wi5
74153,0.81492,-0.38598,-0.72943,-0.12253,-0.71889,-0.20653,1.74491,2004-04-20 00:30:00,2,3,19,0,0.51689,0.71509,0.53316,-0.45146,-0.36605,-0.43805,-0.47622,-0.10967,-0.36103,0.09368,0,1,1,2,0.9381,-1.85591,1,0,-0.54989,,,,,,,,2,,,,,,,2,ENF,Cold,US-Wi5
74154,0.91835,-0.39116,-0.72943,-0.12253,-0.72109,-0.20653,1.74282,2004-04-20 01:00:00,2,3,19,1,0.51689,0.71509,0.53316,-0.45146,-0.36605,-0.43805,-0.47622,-0.10967,-0.36103,0.09368,0,1,0,2,0.9381,-1.85591,1,0,-0.54989,,,,,,,,2,,,,,,,2,ENF,Cold,US-Wi5
74155,-0.39329,-0.39634,-0.72943,-0.12253,-0.7233,-0.20653,1.74073,2004-04-20 01:30:00,2,3,19,1,0.51689,0.71509,0.53316,-0.45146,-0.36605,-0.43805,-0.47622,-0.10967,-0.36103,0.09368,0,1,1,2,0.9381,-1.85591,1,0,-0.54989,,,,,,,,2,,,,,,,2,ENF,Cold,US-Wi5
74156,-0.25732,-0.40152,-0.72943,-0.12253,-0.7255,-0.20653,1.73863,2004-04-20 02:00:00,2,3,19,2,0.51689,0.71509,0.53316,-0.45146,-0.36605,-0.43805,-0.47622,-0.10967,-0.36103,0.09368,0,1,0,2,0.9381,-1.85591,1,0,-0.54989,,,,,,,,2,,,,,,,2,ENF,Cold,US-Wi5


## Impute Missing Values

In [26]:
# using only train_df for example -> would be nice to impute before splitting in the end
#df_init = data_transformer.train_df.copy()

# Columns handed to the KNN imputer: every int/float column of data_df,
# except the date/time helpers, the site key and the site-metadata columns
# (only real-valued inputs are used, as the categorical ones are the same per site).
knn_exclude_cols = ['date', 'datetime', 'day', 'minute', 'site_id']
knn_exclude_cols.extend(site_metadata_df.columns)
knn_imp_real = [
    col
    for col in data_df.select_dtypes(include=['int', 'float']).columns
    if col not in knn_exclude_cols
]
print(knn_imp_real)

['GPP_NT_VUT_REF', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day', 'LST_Night']


In [68]:
from sklearn.impute import KNNImputer # <----------- ADDED
from tqdm import tqdm # <----------- ADDED

def knn_impute(df, imp_cols, group_col='site_id', k=5, weights='uniform'):
    """Fill missing values in `imp_cols` with a KNN imputer fitted per group.

    The rows of `df` are grouped by `group_col`; for each group a separate
    `KNNImputer` is fitted on that group's `imp_cols` and only the cells that
    were missing are filled. Values that were present are never modified.

    Args:
        df: input DataFrame. It is NOT modified; a filled copy is returned, so
            the caller can still compare the data before and after imputation.
            Row labels are used to write results back, so they are assumed to
            be unique -- TODO confirm for frames that were concatenated
            without resetting the index.
        imp_cols: columns used as imputer inputs (and candidates for filling).
        group_col: column whose values define the groups.
        k: number of neighbours passed to `KNNImputer(n_neighbors=...)`.
        weights: weighting passed to `KNNImputer(weights=...)`.

    Returns:
        A copy of `df` with the missing `imp_cols` cells filled. Columns that
        are entirely missing within a group cannot be imputed from that group
        and are left missing for its rows.
    """
    imputed_df = df.copy()

    # Group the untouched input so that writes into `imputed_df` can never
    # influence the groups still to be processed.
    for _, group_data in tqdm(df.groupby(group_col)):
        # A column that is all-NaN within the group is excluded up front:
        # the imputer has nothing to learn from it, and keeping it would make
        # the imputer output narrower than the input column list (which then
        # fails when the output is wrapped back into a DataFrame).
        group_knn_df = group_data[imp_cols].dropna(axis=1, how='all')
        na_cols = [c for c in group_knn_df.columns if group_knn_df[c].isna().any()]
        if not na_cols:
            # Nothing to fill for this group; skip the expensive KNN fit.
            continue

        imputer = KNNImputer(n_neighbors=k, weights=weights)
        imputed_group = pd.DataFrame(
            imputer.fit_transform(group_knn_df),
            columns=group_knn_df.columns,
            index=group_knn_df.index,
        )

        # Only cells that were NaN take the imputed value; restricting the
        # write-back to the affected columns leaves all other columns (and
        # their dtypes) exactly as they were.
        imputed_df.loc[group_data.index, na_cols] = (
            group_data[na_cols].fillna(imputed_group[na_cols])
        )

    return imputed_df


# Impute the transformed training split only, grouped by the default
# `site_id` column, with k=5 and uniform neighbour weights.
df_imputed = knn_impute(data_transformer.train_df, knn_imp_real, k=5, weights='uniform')

100%|██████████| 2/2 [02:24<00:00, 72.08s/it]


### Check Imputing Work

In [69]:
## Compare train_df (taken as the pre-imputation data) to df_imputed (filled NA values)
# NOTE(review): these checks are only meaningful if `train_df` still holds the
# pre-imputation values, i.e. if `knn_impute` did not modify its input in
# place -- confirm; otherwise snapshot `train_df` before imputing.
if df_imputed is data_transformer.train_df:
    print("WARNING: df_imputed is the same object as train_df; the comparisons below are trivially true.")
df_init = data_transformer.train_df.copy()

# confirm no NAs remain in new df
na_rows_post = df_imputed.isna().any(axis=1).sum()
print(f"Number of NA rows post imputation: {na_rows_post}")

# Rows that had no NA before must be identical afterwards. `drop_na.index`
# holds row labels, so the matching rows are selected by label (.loc);
# positional .iloc is only equivalent when the index is exactly 0..n-1.
drop_na = df_init.dropna(how='any')
drop_imp = df_imputed.loc[drop_na.index]
drop_na = drop_na.reset_index(drop=True)
drop_imp = drop_imp.reset_index(drop=True)
print(f"Are all rows with no NAs the same as before? {drop_na.equals(drop_imp)}")

# Spot-check the first 50 rows that initially had NA: after dropping the
# cells that were NA, the remaining (originally present) values must match.
na_inds = df_init.index[df_init.isna().any(axis=1)]
errors = 0
for ind in na_inds[:50]:
    check_ind = pd.concat([df_init.loc[ind], df_imputed.loc[ind]], axis=1).dropna()
    check_ind.columns = ['initial', 'post_imp']
    if not check_ind['initial'].equals(check_ind['post_imp']):
        errors += 1
        print(ind)
print(f"Number of non-NA values changed by error: {errors}")

# DF length is the same 
print(f"DF is same length as before: {len(df_init) == len(df_imputed)}")

Number of NA rows post imputation: 0
Are all rows with no NAs the same as before? True
Number of non-NA values changed by error: 0
DF is same length as before: True


# Checkpoint: Upload train and test to Azure Blob Storage

In [None]:
# final_checkpoint = True

# if final_checkpoint:
#   data_transformer.upload_train_test_to_azure(az_cred_file, container, \
#                                             train_blob_name, test_blob_name)