In [4]:
# TODO: Change this to the absolute path of your own checkout of the repository.
# Later cells chdir into this directory and build the project paths from it.
# Example of another local setup:
#MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling"

In [5]:
# Detect whether the kernel is a Google Colab runtime; when it is, mount
# Google Drive at /content/drive/.
IN_COLLAB = 'google.colab' in str(get_ipython())
if IN_COLLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

In [6]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules: work from the repository root and make the tools
# directory importable (first on sys.path under Colab, last otherwise).
os.chdir(MY_HOME_ABS_PATH)
sys.path.insert(0 if IN_COLLAB else len(sys.path), os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: never truncate columns, render floats with five decimals
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.5f' % value

## Definitions

In [7]:
# Project directories, all derived from the repository root
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')
preproc_objects_dir = os.path.join(root_dir, 'code', 'src', 'preprocessing', 'preproc_objects')

# Raw data location: the local scratch directory by default, the mounted
# Drive folder when running on Colab.
if IN_COLLAB:
    raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data"
else:
    raw_data_dir = tmp_dir

# input files
site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')
monthly_data_filename = os.path.join(data_dir, "monthly-mvp.csv")
split_dict_filename = os.path.join(preproc_objects_dir, "stratified_splits_k5.joblib")

# Blob naming components
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
model = "rfr"
tag = "raw"
# NOTE(review): the base/raw blob name joins the version with underscores ("_v_")
# while the train/val/test names use hyphens ("-v-") - confirm this is intentional.
blob_name_base = f"{model}-full_2010_2015_v_{ver}"
blob_name = f"{blob_name_base}_{tag}.{ext}"
train_blob_name = f"{model}-full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}-full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}-full_2010_2015-test-v-{ver}.{ext}"

In [8]:
# Target and feature columns used by the data pipelines
target_variable = 'GPP_NT_VUT_REF'
hourly_features = [
    # *_ERA columns
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    # time columns
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    # vegetation indices and band columns
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
]
metadata_features = [
    'site_id', 'filename', 'koppen_sub', 'koppen_main',
    'c3c4', 'c4_percent', 'monthly_data_available',
]

# Columns fed to the KNN imputer: the hourly features plus the target, minus the
# time/identifier/categorical columns listed below (categoricals are constant per site).
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [
    col
    for col in [*hourly_features, target_variable]
    if col not in imp_exclude_cols
]

### Load One Site

In [9]:
# Prepare hourly site df
site_id = 'AU-ASM'
filename = f'data_full_half_hourly_raw_v0_1_{site_id}.csv'
# Read from raw_data_dir: it equals tmp_dir locally and points at the mounted
# Drive folder on Colab, so the same cell works in both environments.
local_filename = os.path.join(raw_data_dir, filename)
site_df = pd.read_csv(local_filename)

# Format columns
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df['date'] = pd.to_datetime(site_df['date'])
site_df['site_id'] = site_id

# Move from HH to H level: keep only the records stamped exactly on the hour.
# 'minute' is a temporary helper column and is removed again after filtering.
site_df['minute'] = site_df['datetime'].dt.minute
site_df = site_df.loc[site_df['minute'] == 0].drop(columns='minute')

In [10]:
site_df.head(2)

Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,site_id
0,201009030000,201009030030,17.05,0,20.063,0.0,0.0,0,0.0,344.0,0.0,369.077,1.361,0,5.681,0.0,0.0,0.0,94.25,0.0,94.277,-54.0,,-40.0,0.0,2.0,0.0,2.67961,-7.0,0.0,-9.35371,1.92074,0.0,1.92074,0.0,-0.88585,0.0,-0.88585,0.0,1.03489,0.75614,1.03198,0.78076,2010-09-03 00:00:00,2010,9,3,0,AU-ASM,2010-09-03,1.92074,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM
2,201009030100,201009030130,16.68,0,20.1,0.0,0.0,0,0.0,353.0,0.0,370.182,1.33,0,5.622,0.0,0.0,0.0,94.29,0.0,94.226,-43.0,,-35.0,0.0,4.0,1.0,5.35922,-11.0476,1.0,-14.7623,1.5979,1.0,1.33708,1.0,-0.58022,0.0,-0.58022,0.0,1.01768,0.74104,1.01503,0.76282,2010-09-03 01:00:00,2010,9,3,1,AU-ASM,2010-09-03,1.5979,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM


## Engineer Features

#### MSC (Mean Seasonal Cycle) Features

In [17]:
# MSC Features
def engineer_msc_features(df, input_features):
    """Add seasonal-mean summary columns for each feature in ``input_features``.

    Each row is assigned a season from its ``month`` value (12/1/2 "Winter",
    3/4/5 "Spring", 6/7/8 "Summer", 9/10/11 "Fall"; the labels are purely
    calendar based and are not adjusted for hemisphere). For every feature,
    three columns are written onto ``df``:

    * ``{feature}_szn_mean`` - mean of the feature over all rows that share the
      row's season (NaN for rows whose month maps to no season)
    * ``{feature}_amp_msc``  - largest seasonal mean minus smallest seasonal mean
    * ``{feature}_min_msc``  - smallest seasonal mean

    Seasons without usable data (no rows, or only NaN values) are skipped when
    taking the min/max, so the result does not depend on which season is
    missing. If no season has data, amplitude and min are NaN.

    Args:
        df: DataFrame with a ``month`` column and one column per input feature.
            It is modified in place; no helper column is left behind.
        input_features: iterable of column names to summarise.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    seasons = {
        "Winter": [12, 1, 2],
        "Spring": [3, 4, 5],
        "Summer": [6, 7, 8],
        "Fall": [9, 10, 11],
    }
    season_of_month = {
        month: season for season, months in seasons.items() for month in months
    }

    # Season label per row, kept in a local Series instead of a temporary
    # column on df; months that match no season map to None.
    season_labels = df['month'].map(lambda month: season_of_month.get(month))

    for feature in input_features:
        # Mean of the feature within each season (NaN when a season has no data)
        seasonal_means = pd.Series(
            {
                season: df.loc[season_labels == season, feature].mean()
                for season in seasons
            }
        )

        # Series.min/max skip NaN; the builtin min/max return NaN or not
        # depending on where the NaN sits in the sequence.
        min_msc = seasonal_means.min()
        amplitude_msc = seasonal_means.max() - min_msc

        # Broadcast the per-season mean back to the rows of that season
        df[f"{feature}_szn_mean"] = season_labels.map(seasonal_means)
        df[f"{feature}_amp_msc"] = amplitude_msc
        df[f"{feature}_min_msc"] = min_msc

    return df

# Columns for which the seasonal-mean summary features are generated
# NOTE: engineer_msc_features writes its columns onto the frame it receives and
# returns that same frame, so site_df itself gains the new columns here and
# `test` is just another name for it.
msc_features = ['TA_ERA', 'SW_IN_ERA', 'P_ERA', 'EVI', 'NDVI', 'NIRv', 'b4']
test = engineer_msc_features(site_df, msc_features)
test.head(2)

Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,site_id,TA_ERA_szn_mean,TA_ERA_amp_msc,TA_ERA_min_msc,SW_IN_ERA_szn_mean,SW_IN_ERA_amp_msc,SW_IN_ERA_min_msc,P_ERA_szn_mean,P_ERA_amp_msc,P_ERA_min_msc,EVI_szn_mean,EVI_amp_msc,EVI_min_msc,NDVI_szn_mean,NDVI_amp_msc,NDVI_min_msc,NIRv_szn_mean,NIRv_amp_msc,NIRv_min_msc,b4_szn_mean,b4_amp_msc,b4_min_msc
0,201009030000,201009030030,17.05,0,20.063,0.0,0.0,0,0.0,344.0,0.0,369.077,1.361,0,5.681,0.0,0.0,0.0,94.25,0.0,94.277,-54.0,,-40.0,0.0,2.0,0.0,2.67961,-7.0,0.0,-9.35371,1.92074,0.0,1.92074,0.0,-0.88585,0.0,-0.88585,0.0,1.03489,0.75614,1.03198,0.78076,2010-09-03 00:00:00,2010,9,3,0,AU-ASM,2010-09-03,1.92074,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468
2,201009030100,201009030130,16.68,0,20.1,0.0,0.0,0,0.0,353.0,0.0,370.182,1.33,0,5.622,0.0,0.0,0.0,94.29,0.0,94.226,-43.0,,-35.0,0.0,4.0,1.0,5.35922,-11.0476,1.0,-14.7623,1.5979,1.0,1.33708,1.0,-0.58022,0.0,-0.58022,0.0,1.01768,0.74104,1.01503,0.76282,2010-09-03 01:00:00,2010,9,3,1,AU-ASM,2010-09-03,1.5979,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468


#### Precipitation Rolling Sums

In [18]:
def engineer_prcp_running_sums(df, precip_col='P_ERA', week_hours=168, month_hours=720):
    """Add trailing precipitation sums over a "week" and a "month" window.

    The frame is indexed by its ``datetime`` column and two time-based rolling
    sums of ``precip_col`` are computed. Each row's sum covers the records in
    the preceding window, the row itself included:

    * ``prcp_week_sum``  - window of ``week_hours`` hours (default 168 = 7 days)
    * ``prcp_month_sum`` - window of ``month_hours`` hours (default 720 = 30 days)

    Args:
        df: DataFrame with a ``datetime`` column (datetime dtype) and the
            precipitation column. pandas requires the datetimes to be
            monotonic for time-based windows. ``df`` is not modified.
        precip_col: name of the precipitation column to sum.
        week_hours: length of the short window, in hours.
        month_hours: length of the long window, in hours.

    Returns:
        A new DataFrame with ``datetime`` as the first column, a fresh integer
        index, and the two sum columns appended.
    """
    # set_index returns a new frame, so the caller's df stays untouched
    indexed = df.set_index('datetime')
    precip = indexed[precip_col]

    # Lowercase 'h' is the supported hour alias; uppercase 'H' is deprecated in
    # recent pandas releases.
    indexed["prcp_week_sum"] = precip.rolling(window=f"{week_hours}h").sum()
    indexed["prcp_month_sum"] = precip.rolling(window=f"{month_hours}h").sum()

    # Turn 'datetime' back into a regular (leading) column
    return indexed.reset_index()

# site_df is passed in as left by the previous cell; the rolling sums come back
# on a new frame bound to `test`, and site_df itself is not reassigned here.
test = engineer_prcp_running_sums(site_df, 'P_ERA')
test.head(20)

Unnamed: 0,datetime,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,site_id,TA_ERA_szn_mean,TA_ERA_amp_msc,TA_ERA_min_msc,SW_IN_ERA_szn_mean,SW_IN_ERA_amp_msc,SW_IN_ERA_min_msc,P_ERA_szn_mean,P_ERA_amp_msc,P_ERA_min_msc,EVI_szn_mean,EVI_amp_msc,EVI_min_msc,NDVI_szn_mean,NDVI_amp_msc,NDVI_min_msc,NIRv_szn_mean,NIRv_amp_msc,NIRv_min_msc,b4_szn_mean,b4_amp_msc,b4_min_msc,prcp_week_sum,prcp_month_sum
0,2010-09-03 00:00:00,201009030000,201009030030,17.05,0,20.063,0.0,0.0,0,0.0,344.0,0.0,369.077,1.361,0,5.681,0.0,0.0,0.0,94.25,0.0,94.277,-54.0,,-40.0,0.0,2.0,0.0,2.67961,-7.0,0.0,-9.35371,1.92074,0.0,1.92074,0.0,-0.88585,0.0,-0.88585,0.0,1.03489,0.75614,1.03198,0.78076,2010,9,3,0,AU-ASM,2010-09-03,1.92074,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
1,2010-09-03 01:00:00,201009030100,201009030130,16.68,0,20.1,0.0,0.0,0,0.0,353.0,0.0,370.182,1.33,0,5.622,0.0,0.0,0.0,94.29,0.0,94.226,-43.0,,-35.0,0.0,4.0,1.0,5.35922,-11.0476,1.0,-14.7623,1.5979,1.0,1.33708,1.0,-0.58022,0.0,-0.58022,0.0,1.01768,0.74104,1.01503,0.76282,2010,9,3,1,AU-ASM,2010-09-03,1.5979,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
2,2010-09-03 02:00:00,201009030200,201009030230,18.17,0,20.051,0.0,0.0,0,0.0,386.0,0.0,370.182,2.921,0,5.563,0.0,0.0,0.0,93.77,0.0,94.197,-17.0,,-23.0,0.0,7.25,1.0,9.71359,-12.875,1.0,-17.2041,0.93635,1.0,-0.73019,1.0,0.15116,0.0,0.15116,0.0,1.08751,0.80262,1.08376,0.8363,2010,9,3,2,AU-ASM,2010-09-03,0.93635,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
3,2010-09-03 03:00:00,201009030300,201009030330,18.03,0,20.002,0.0,0.0,0,0.0,390.0,0.0,370.182,2.689,0,5.503,0.0,0.0,0.0,93.58,0.0,94.168,-12.0,,-12.0,0.0,6.84211,1.0,9.16709,-13.0526,1.0,-17.4415,0.93635,1.0,-0.73019,1.0,0.14455,0.0,0.14455,0.0,1.08089,0.79675,1.07725,0.82926,2010,9,3,3,AU-ASM,2010-09-03,0.93635,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
4,2010-09-03 04:00:00,201009030400,201009030430,17.92,0,19.839,0.0,0.0,0,0.0,387.0,0.0,374.892,2.67,0,5.234,0.0,0.0,0.0,93.88,0.0,94.152,-15.0,,-15.0,0.0,2.0,0.0,2.67961,-8.0,0.0,-10.69,1.03457,1.0,-0.47977,1.0,0.04113,0.0,0.04113,0.0,1.0757,0.79215,1.07214,0.82374,2010,9,3,4,AU-ASM,2010-09-03,1.03457,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
5,2010-09-03 05:00:00,201009030500,201009030530,17.35,0,19.56,0.0,0.0,0,0.0,396.0,0.0,374.892,1.784,0,4.756,0.8,0.0,0.0,93.76,0.0,94.149,-5.0,,-14.0,0.0,7.6,1.0,10.1825,-14.32,1.0,-19.135,1.72215,1.0,1.34794,1.0,-0.67324,0.0,-0.67324,0.0,1.04891,0.76848,1.04578,0.79545,2010,9,3,5,AU-ASM,2010-09-03,1.72215,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.0,0.0
6,2010-09-03 06:00:00,201009030600,201009030630,17.32,0,19.282,0.0,0.0,0,0.0,395.0,0.0,374.892,1.187,0,4.277,0.2,0.0,0.937,93.28,0.0,94.147,-6.0,,-13.0,0.0,8.64,1.0,11.5759,-14.8,1.0,-19.7764,1.3739,1.0,0.82882,1.0,-0.32639,0.0,-0.32639,0.0,1.04751,0.76724,1.0444,0.79398,2010,9,3,6,AU-ASM,2010-09-03,1.3739,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.937,0.937
7,2010-09-03 07:00:00,201009030700,201009030730,17.8,0,19.338,137.062,0.0,0,46.192,395.0,0.0,383.387,1.019,0,4.05,1.2,0.0,0.0,94.07,0.0,94.167,-8.0,,-10.0,0.0,11.1429,1.0,14.9293,-17.2857,1.0,-23.0979,0.89316,1.0,0.04761,1.0,0.17688,0.0,0.17688,0.0,1.07005,0.78714,1.06658,0.81775,2010,9,3,7,AU-ASM,2010-09-03,0.89316,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.937,0.937
8,2010-09-03 08:00:00,201009030800,201009030830,18.25,0,19.729,445.977,0.0,0,151.625,401.0,0.0,383.387,1.468,0,4.075,5.0,0.0,0.0,93.32,0.0,94.209,7.0,,-9.0,0.0,11.1579,1.0,14.9494,-15.8947,1.0,-21.2393,0.88515,1.0,-0.08691,1.0,0.20615,0.0,0.20615,0.0,1.0913,0.80598,1.08749,0.84034,2010,9,3,8,AU-ASM,2010-09-03,0.88515,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,0.937,0.937
9,2010-09-03 09:00:00,201009030900,201009030930,18.6,0,20.12,719.73,45.0,0,245.099,402.0,0.0,383.387,1.286,0,4.099,0.0,0.0,1.392,94.15,0.0,94.251,36.0,,15.0,0.0,26.5,2.0,35.5049,1.0,2.0,1.33981,0.21269,1.0,0.21269,1.0,0.89522,0.66353,0.89522,0.64097,1.10791,0.82076,1.10383,0.85812,2010,9,3,9,AU-ASM,2010-09-03,0.21269,,0.1584,0.33739,0.06299,0.0925,0.1867,0.034,0.0572,0.2494,0.2447,0.1949,SAV,Arid,AU-ASM,24.78207,13.98399,14.70853,293.90267,110.90708,195.09566,0.01158,0.04404,0.00093,0.12416,0.0232,0.12303,0.25139,0.06207,0.25139,0.05097,0.0115,0.04723,0.06689,0.01601,0.05468,2.329,2.329


#### Hemisphere Feature (on meta df)

In [20]:
# Read the site metadata table and report how many rows it holds
site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')
site_meta = pd.read_csv(site_metadata_filename)
print(site_meta.shape[0])
site_meta.head(2)

286


Unnamed: 0,site_id,dataset,start_year,end_year,file,is_dup,IGBP,elevation,lat,long,site_name,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename,size,country,record_count,site_IGBP,site_koppen,start_time,end_time,recorded_day_count,recorded_day_percentage,monthly_data_available
0,AR-SLu,FLUXNET,2009,2011,FLX_AR-SLu_FLUXNET2015_FULLSET_MM_2009-2011_1-...,False,MF,,-33.4648,-66.4598,San Luis,7,2,BSk,Arid,C3,67.08,data_full_half_hourly_raw_v0_1_AR-SLu.csv,9785735.0,AR,22128.0,MF,Arid,2009-12-21 00:00:00,2011-03-26 23:30:00,461.0,1.0,Yes
1,AR-Vir,FLUXNET,2009,2012,FLX_AR-Vir_FLUXNET2015_FULLSET_MM_2009-2012_1-...,False,ENF,,-28.2395,-56.1886,Virasoro,14,3,Cfa,Temperate,C3,8.75,data_full_half_hourly_raw_v0_1_AR-Vir.csv,14564015.0,AR,33984.0,ENF,Temperate,2010-02-13 00:00:00,2012-06-13 23:30:00,708.0,0.83099,Yes


In [21]:
# Simple North vs. South hemisphere
def get_hemisphere_NS(latitude):
    """Classify a latitude as 'North' or 'South'.

    Args:
        latitude: latitude in degrees; 0 counts as 'North'.

    Returns:
        'North' for 0 <= latitude <= 90, 'South' for -90 <= latitude < 0, and
        'Invalid latitude' for NaN or values outside [-90, 90] (the same
        sentinel get_latitude_band uses), so a missing latitude is not
        silently labelled 'South'.
    """
    # Comparisons with NaN are always False, so NaN fails this range check too
    if not (-90 <= latitude <= 90):
        return 'Invalid latitude'
    return 'North' if latitude >= 0 else 'South'

# Add a 'hemisphere_NS' column by applying get_hemisphere_NS to each site's 'lat' value
site_meta['hemisphere_NS'] = site_meta['lat'].apply(get_hemisphere_NS)

In [23]:
# 5 latitude bands
def get_latitude_band(latitude):
    """Map a latitude in degrees to one of five 36-degree-wide bands.

    Bands run south to north: 'lat_band_1' is [-90, -54), 'lat_band_2' is
    [-54, -18), 'lat_band_3' is [-18, 18), 'lat_band_4' is [18, 54) and
    'lat_band_5' is [54, 90]. Anything else (including NaN) yields
    'Invalid latitude'.
    """
    # Reject out-of-range values first; NaN fails every comparison and lands here
    if not (-90 <= latitude <= 90):
        return 'Invalid latitude'

    # Exclusive upper edge of each band, ordered south to north
    band_edges = (
        (-54, 'lat_band_1'),
        (-18, 'lat_band_2'),
        (18, 'lat_band_3'),
        (54, 'lat_band_4'),
    )
    for upper_edge, band_label in band_edges:
        if latitude < upper_edge:
            return band_label
    return 'lat_band_5'

# Add a 'lat_band' column to site_meta by applying get_latitude_band to each 'lat' value
site_meta['lat_band'] = site_meta['lat'].apply(get_latitude_band)

# Print the row count of site_meta after the new columns were added
print(len(site_meta))

286


In [24]:
# Save out to overwrite site meta with new features
#site_meta.to_csv(site_metadata_filename, index=False)