In [65]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import zscore

# Project directory layout: every location below is derived from the root `wd`.
# Each value keeps its trailing '/' because later cells build file paths by
# plain string concatenation (e.g. hh_data + filename).
wd = '/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling/'
data = os.path.join(wd, 'data/datasets/')
hh_data = os.path.join(data, 'half_hourly/')
code = os.path.join(wd, 'code/src/')

## Extreme Events: One Site Example

#### Load Site Data

In [67]:
# os.listdir returns entries in arbitrary order and may include non-dataset
# files, so keep only the half-hourly site files and sort them to make the
# positional pick below reproducible.
site_files = sorted(f for f in os.listdir(hh_data) if 'data_full_half_hourly' in f)
site = site_files[1]  # second site file, used as the worked example
site_df = pd.read_csv(hh_data + site)

# Site code = last '_'-separated token of the file name, extension removed
site_name = os.path.splitext(site)[0].split('_')[-1]
print(f"Site: {site_name}")
site_df.head()

Site: CN-HaM


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,...,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
0,200201150000,200201150030,-8.275,0,-14.03,0.0,0.0,0,0.0,238.465,...,-0.006405,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar
1,200201150030,200201150100,-8.926,0,-14.124,0.0,0.0,0,0.0,238.465,...,-0.006405,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar
2,200201150100,200201150130,-9.717,0,-14.218,0.0,0.0,0,0.0,238.465,...,-0.006405,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar
3,200201150130,200201150200,-10.092,0,-14.311,0.0,0.0,0,0.0,238.465,...,-0.006405,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar
4,200201150200,200201150230,-10.173,0,-14.405,0.0,0.0,0,0.0,228.64,...,-0.006405,0.9252,0.9123,0.8571,0.9113,0.5601,0.1609,0.0708,GRA,Polar


#### Find extreme events (|z-score| > 3) in each feature's distribution

In [68]:
def identify_outliers(arr, feat, df, threshold=3, site_id=None, out_df=None):
    """Record every value of one feature whose |z-score| exceeds ``threshold``.

    One row per outlier is appended, in place, to the results table in the
    column order ``[site, feature, outlier_ind, index, time, value, z_val]``.
    Nothing is recorded when the feature has no outliers.

    Parameters
    ----------
    arr : array-like
        1-D feature values, positionally aligned with the rows of ``df``.
    feat : str
        Feature name stored in the ``feature`` column.
    df : pandas.DataFrame
        Frame the values came from; must contain a ``TIMESTAMP_START`` column.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    site_id : str, optional
        Label stored in the ``site`` column. Defaults to the notebook-level
        global ``site_name``.
    out_df : pandas.DataFrame, optional
        Table to append to. Defaults to the notebook-level global ``sev_df``.
        New rows are labelled ``len(out_df)``, so the table is expected to
        carry a default 0..n-1 index.

    Returns
    -------
    None
        The results table is modified in place.
    """
    # Fall back to the notebook globals so existing three-argument calls
    # keep working unchanged.
    if site_id is None:
        site_id = site_name
    if out_df is None:
        out_df = sev_df

    # nan_policy='omit' computes mean/std over the non-NaN values only; with
    # the default policy a single NaN makes every z-score NaN and the feature
    # would silently report no outliers. NaN entries keep a NaN z-score and
    # never pass the comparison below.
    z = zscore(arr, nan_policy='omit')
    outlier_pos = np.flatnonzero(np.abs(z) > threshold)

    # `outlier_pos` holds positions into `arr`, so timestamps are fetched by
    # position (.iloc) rather than by index label.
    start_times = df['TIMESTAMP_START']
    for i in outlier_pos:
        out_df.loc[len(out_df)] = [site_id, feat, 1, i, str(start_times.iloc[i]), arr[i], z[i]]

# Feature columns screened for outliers: seven named columns followed by the
# band columns b1..b7.
ex_feats = [
    'TA_ERA', 'P_ERA', 'VPD_ERA', 'SW_IN_ERA', 'NDVI', 'EVI', 'NIRv',
    *(f"b{i}" for i in range(1, 8)),
]

# Empty results table; identify_outliers appends one row per detected outlier.
sev_df = pd.DataFrame(
    columns=[
        'site',
        'feature',
        'outlier_ind',
        'index',
        'time',
        'value',
        'z_val',
    ]
)

In [70]:
# Screen each feature of interest for the loaded site; identify_outliers
# appends any hits to sev_df.
for feat in ex_feats:
    identify_outliers(site_df[feat].values, feat, site_df)

# Preview the first recorded outliers
sev_df.head()

Unnamed: 0,site,feature,outlier_ind,index,time,value,z_val
0,CN-HaM,P_ERA,1,3408,200203270000,0.542,3.449279
1,CN-HaM,P_ERA,1,3409,200203270030,0.542,3.449279
2,CN-HaM,P_ERA,1,3410,200203270100,0.542,3.449279
3,CN-HaM,P_ERA,1,3720,200204021200,0.604,3.870795
4,CN-HaM,P_ERA,1,3721,200204021230,0.604,3.870795


## Extreme Events: All Sites

In [72]:
# Feature columns screened for outliers: seven named columns followed by the
# band columns b1..b7.
ex_feats = [
    'TA_ERA', 'P_ERA', 'VPD_ERA', 'SW_IN_ERA', 'NDVI', 'EVI', 'NIRv',
    *(f"b{i}" for i in range(1, 8)),
]

# Start from an empty results table so this section does not carry over rows
# from the single-site example.
sev_df = pd.DataFrame(
    columns=['site', 'feature', 'outlier_ind', 'index', 'time', 'value', 'z_val']
)

# Only the per-site half-hourly files are processed
sites = [fname for fname in os.listdir(hh_data) if 'data_full_half_hourly' in fname]

# For every site file: load it, derive the site code from the file name, then
# screen each feature. `site_name` is read as a global by identify_outliers.
for site in sites:
    site_df = pd.read_csv(hh_data + site)
    site_name = site.split('_')[-1][:-4]  # last '_' token minus 4-char extension

    for feat in ex_feats:
        identify_outliers(site_df[feat].values, feat, site_df)

# Preview the first recorded outliers
sev_df.head()

Unnamed: 0,site,feature,outlier_ind,index,time,value,z_val
0,CN-HaM,P_ERA,1,3408,200203270000,0.542,3.449279
1,CN-HaM,P_ERA,1,3409,200203270030,0.542,3.449279
2,CN-HaM,P_ERA,1,3410,200203270100,0.542,3.449279
3,CN-HaM,P_ERA,1,3720,200204021200,0.604,3.870795
4,CN-HaM,P_ERA,1,3721,200204021230,0.604,3.870795


In [76]:
sev_df.groupby(['site', 'feature']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,outlier_ind,index,time,value,z_val
site,feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AU-Emr,P_ERA,767,767,767,767,767
AU-Emr,TA_ERA,26,26,26,26,26
AU-Emr,VPD_ERA,509,509,509,509,509
CN-HaM,P_ERA,927,927,927,927,927
CN-HaM,VPD_ERA,840,840,840,840,840
DK-Gds,P_ERA,129,129,129,129,129
DK-Gds,SW_IN_ERA,159,159,159,159,159
DK-Gds,TA_ERA,20,20,20,20,20
DK-Gds,VPD_ERA,195,195,195,195,195
US-UMd,P_ERA,3474,3474,3474,3474,3474
