In [1]:
from calendar import monthrange
from datetime import datetime
from datetime import timedelta
import pandas as pd
from pandas.api.types import CategoricalDtype
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
sys.path.append(os.path.abspath("../../tools"))
from edahelpers import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
root_dir =  os.path.abspath(os.getcwd()+"..\\..\\..\\..\\..\\")
tmp_dir = root_dir + "\\.tmp"
data_dir = root_dir + "\\data\\"

site_metadata_filename = data_dir + "site-metadata.csv"

In [3]:
# "Golden" Sites
tier1_sites = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
tier2_sites = ["US-UMB", "US-Me2", "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]

target_sites = tier1_sites + tier2_sites
target_sites

['US-MMS',
 'US-Vcp',
 'FR-Pue',
 'CH-Lae',
 'US-Var',
 'US-Ne2',
 'ES-LJu',
 'US-Ton',
 'US-UMB',
 'US-Me2',
 'FI-Hyy',
 'US-NR1',
 'IT-Lav',
 'US-Wkg',
 'US-ARM',
 'US-SRM']

In [4]:
# Load Site data
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=['site_id','filename'])

# only focus on target sites
site_metadata_df= site_metadata_df.loc[site_metadata_df['site_id'].isin(target_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(16, 2)


Unnamed: 0,site_id,filename
67,FR-Pue,data_full_half_hourly_raw_v0_1_FR-Pue.csv
117,US-NR1,data_full_half_hourly_raw_v0_1_US-NR1.csv
119,US-Ne2,
124,US-SRM,data_full_half_hourly_raw_v0_1_US-SRM.csv
127,US-Ton,data_full_half_hourly_raw_v0_1_US-Ton.csv
130,US-Var,data_full_half_hourly_raw_v0_1_US-Var.csv
144,US-Wkg,data_full_half_hourly_raw_v0_1_US-Wkg.csv
166,US-ARM,data_full_half_hourly_raw_v0_1_US-ARM.csv
181,US-MMS,
182,US-Me2,data_full_half_hourly_raw_v0_1_US-Me2.csv


In [5]:
def get_min_max(df):
    return (df.min(), df.max())

def get_min_max_datetime(df):
    return (pd.to_datetime(df).min(), pd.to_datetime(df).max())

def is_leap_year(year):
    return year%4 == 0 ;

# Process a sample file

In [6]:
r = site_metadata_df[['site_id','filename']].iloc[0]
print(r.filename)
local_filename = tmp_dir + "\\" + r.filename

data_full_half_hourly_raw_v0_1_FR-Pue.csv


In [7]:
features = ['SITE_ID','TIMESTAMP_START', 'TIMESTAMP_END', 'datetime', 'date', 'year', 'month', 'day', 'hour',"GPP_NT_VUT_REF"]
site_df = pd.read_csv(local_filename, usecols=features)
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df['date'] = pd.to_datetime(site_df['date'])
print(f"size:{site_df.shape}")
site_df.head()

size:(245760, 10)


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,GPP_NT_VUT_REF,datetime,year,month,day,hour,SITE_ID,date
0,200007260000,200007260030,0.260122,2000-07-26 00:00:00,2000,7,26,0,FR-Pue,2000-07-26
1,200007260030,200007260100,0.221846,2000-07-26 00:30:00,2000,7,26,0,FR-Pue,2000-07-26
2,200007260100,200007260130,0.315191,2000-07-26 01:00:00,2000,7,26,1,FR-Pue,2000-07-26
3,200007260130,200007260200,-0.813255,2000-07-26 01:30:00,2000,7,26,1,FR-Pue,2000-07-26
4,200007260200,200007260230,-0.830025,2000-07-26 02:00:00,2000,7,26,2,FR-Pue,2000-07-26


In [8]:
# is there NA
if site_df.isna().sum().sum() != 0:
    print(f'{site_df.SITE_ID.iloc[0]}\n{site_df.isna().sum()}\n')

In [9]:
def has_full_month_record(year, month, actual_record_count):
    if actual_record_count == 0:
#         print(f"  {month:2}: none")
        return False
    
    expected_days = monthrange(year, month)
    expected_record_count = expected_days[1]*48
    if(expected_record_count != actual_record_count):
#         print(f"  {month:2}: expected-[{expected_record_count}], actual-[{actual_record_count}]")
        return False
    return True

def has_full_day_record(year, month, day, actual_record_count):
    if actual_record_count == 0:
#         print(f"    {month}/{day}: none")
        return False
    
    expected_record_count = 48
    if(expected_record_count != actual_record_count):
        print(f"    {month}/{day}: expected-[{expected_record_count}], actual-[{actual_record_count}]")
        return False
    return True

def plot_month_records(site_id, year, month, df):
    df.plot.bar(x='datetime', y='GPP_NT_VUT_REF', 
                    title=f'{site_id}:{year}.{month}',
                    figsize= (16,4))

In [10]:
FULL_YEAR_DATA_COUNT = 2*24*365 #17520
LEAP_YEAR_DATA_COUNT = 2*24*366 #17568

def get_gaps(site_id, site_df):
    gaps_intervals_df = pd.DataFrame(columns=['start', 'end', 'site_id'])
    start_year, end_year = get_min_max(site_df['year'])
    gap_start, gap_end = None, None
    for y in range(start_year, end_year+1):
        year_df = site_df.loc[site_df['year'] == y]

        start_date, end_date = get_min_max_datetime(year_df['datetime'])
        date_delta = (end_date - start_date)
        
        if (not start_date.is_leap_year and year_df.shape[0] != FULL_YEAR_DATA_COUNT) or\
           (start_date.is_leap_year and year_df.shape[0] != LEAP_YEAR_DATA_COUNT):
            #print(f"{y}: {year_df.shape[0]} ({start_date.date()}~{end_date.date()} {date_delta}) {'Leap' if start_date.is_leap_year else ''}")
            for m in range(1,13):
                if not (y == start_year and m < start_date.month):
                    month_df = year_df.loc[year_df['month'] == m]
                    if month_df.shape[0] == 0:
                        #print(f"  {m:2}: none")
                        if gap_start == None:
                            gap_start = datetime(y, m, 1)
                    elif has_full_month_record(y, m, month_df.shape[0]):
                        if gap_start != None:
                            gap_end = datetime(y, m, 1, 23, 30) - timedelta(days=1)
                            gap_data = {'start':gap_start,'end':gap_end,'site_id':site_id}
                            gaps_intervals_df=gaps_intervals_df.append(gap_data, ignore_index=True) 
                            #print(f"    gap:{gap_start} - {gap_end}")
                            gap_start = None
                    else:
                        for d in range(1, monthrange(y, m)[1]+1):
                            if not (y == start_year and m <= start_date.month and d < start_date.day):
                                day_df = month_df.loc[month_df['day'] == d]
                                if has_full_day_record(y, m, d, day_df.shape[0]):
                                    if gap_start != None:
                                        gap_end = day_df['datetime'].max() - timedelta(days=1)
                                        gap_data = {'start':gap_start,'end':gap_end,'site_id':site_id}
                                        gaps_intervals_df=gaps_intervals_df.append(gap_data, ignore_index=True) 
                                        #print(f"    gap:{gap_start} - {gap_end}")
                                        gap_start = None
                                elif gap_start == None:
                                    gap_start = datetime(y, m, d)
        elif gap_start != None:
            gap_end = year_df['datetime'].max() - timedelta(days=1)
            gap_data = {'start':gap_start,'end':gap_end,'site_id':site_id}
            gaps_intervals_df=gaps_intervals_df.append(gap_data, ignore_index=True)
            #print(f"    gap:{gap_start} - {gap_end}")
            gap_start = None
            
    if gap_start != None:
        gap_end = site_df['date'].max()
        gap_data = {'start':gap_start,'end':gap_end,'site_id':site_id}
        gaps_intervals_df=gaps_intervals_df.append(gap_data, ignore_index=True)
        #print(f"    gap:{gap_start} - {gap_end}")
        gap_start = None
    
    return gaps_intervals_df

get_gaps("ES-LJu", site_df)

Unnamed: 0,start,end,site_id
0,2000-09-12,2000-09-23 23:30:00,ES-LJu
1,2000-11-16,2000-11-28 23:30:00,ES-LJu
2,2009-08-18,2009-09-01 23:30:00,ES-LJu
3,2010-06-29,2010-07-08 23:30:00,ES-LJu
4,2010-12-06,2010-12-18 23:30:00,ES-LJu
5,2011-08-05,2011-08-31 23:30:00,ES-LJu
6,2012-01-09,2012-03-10 23:30:00,ES-LJu


In [71]:
def get_intervals(site_id, site_df):
    intervals_df = pd.DataFrame(columns=['start', 'end', 'site_id'])
    start_year, end_year = get_min_max(site_df['year'])
    interval_start, interval_end = None, None
    
    for y in range(start_year, end_year+1):
        year_df = site_df.loc[site_df['year'] == y]
        start_date, end_date = get_min_max_datetime(year_df['datetime'])
        date_delta = (end_date - start_date)
        
        if interval_start == None:
            interval_start = year_df['datetime'].min()
            interval_end = None
        
        if (not start_date.is_leap_year and year_df.shape[0] != FULL_YEAR_DATA_COUNT) or\
           (start_date.is_leap_year and year_df.shape[0] != LEAP_YEAR_DATA_COUNT):
#             print(f"{y}: {year_df.shape[0]} ({start_date.date()}~{end_date.date()} {date_delta}) {'Leap' if start_date.is_leap_year else ''}")
            
            for m in range(1,13):
                if not (y == start_year and m < start_date.month):
                    month_df = year_df.loc[year_df['month'] == m]
                    if not has_full_month_record(y, m, month_df.shape[0]):                           
                        for d in range(1, monthrange(y, m)[1]+1):
                            if not (y == start_year and m <= start_date.month and d < start_date.day):
                                day_df = month_df.loc[month_df['day'] == d]
                                if not has_full_day_record(y, m, d, day_df.shape[0]):
                                    if interval_start != None:
                                        interval_end = datetime(y, m, d, 23, 30, 0) - timedelta(days=1)
                                        if interval_end >=  interval_start:
                                            interval_data = {'start':interval_start,'end':interval_end,'site_id':site_id}
                                            intervals_df = intervals_df.append(interval_data, ignore_index=True)     
                                            #print(f"    int: {interval_start} - {interval_end}")
                                            interval_start = None
                                            interval_end = None
                                elif interval_start == None:
                                    interval_start = datetime(y, m, d, 0, 0, 0)
                                    interval_end = None
                    elif interval_start == None:
                        interval_start = month_df['datetime'].min()
                        interval_end = None
                    
    if interval_start != None:
        interval_end = year_df['datetime'].max()
        interval_data = {'start':interval_start,'end':interval_end,'site_id':site_id}
        intervals_df = intervals_df.append(interval_data, ignore_index=True) 
#         print(f"    int: {interval_start} - {interval_end}")
        interval_start = None
    return intervals_df

get_intervals("FR-Pue", site_df)

Unnamed: 0,start,end,site_id
0,2003-01-01,2005-01-23 23:30:00,FR-Pue
1,2005-02-05,2011-09-15 23:30:00,FR-Pue
2,2011-10-04,2011-10-13 23:30:00,FR-Pue
3,2011-11-17,2012-03-22 23:30:00,FR-Pue
4,2012-04-02,2012-05-15 23:30:00,FR-Pue
5,2012-05-29,2012-07-26 23:30:00,FR-Pue
6,2012-08-22,2012-09-01 23:30:00,FR-Pue
7,2012-09-14,2012-09-22 23:30:00,FR-Pue
8,2012-10-05,2012-10-07 23:30:00,FR-Pue
9,2012-10-19,2012-12-24 23:30:00,FR-Pue


# Process all "golden files"

In [12]:
# Iterate through sites
intervals_df = None
features = ['SITE_ID','TIMESTAMP_START', 'TIMESTAMP_END', 'datetime', 'date', 'year', 'month', 'day', 'hour', 'GPP_NT_VUT_REF']
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    if not r.filename or type(r.filename) != type(""):
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue

    local_filename = tmp_dir + "\\" + r.filename
    site_df = pd.read_csv(local_filename, usecols=features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    
    if site_df.isna().sum().sum() != 0:
        print(f"\n{r.site_id}")
        print(f'{site_df.isna().sum()}\n')
        site_df.dropna(inplace=True)
    
    site_interval_df = get_intervals(r.site_id, site_df)
    if type(intervals_df) == type(None):
        intervals_df = site_interval_df
    else:
        intervals_df = pd.concat([intervals_df, site_interval_df])

# Save site_data checkpoint
intervals_df.to_csv(data_dir + "gold-site-intervals.csv");


ERROR: US-Ne2 is mssing hourly data.

ERROR: US-MMS is mssing hourly data.

US-Me2
TIMESTAMP_START        0
TIMESTAMP_END          0
GPP_NT_VUT_REF     16896
datetime               0
year                   0
month                  0
day                    0
hour                   0
SITE_ID                0
date                   0
dtype: int64


ES-LJu
TIMESTAMP_START       0
TIMESTAMP_END         0
GPP_NT_VUT_REF     8880
datetime              0
year                  0
month                 0
day                   0
hour                  0
SITE_ID               0
date                  0
dtype: int64



In [13]:
# Iterate through sites
gap_df = None
features = ['SITE_ID','TIMESTAMP_START', 'TIMESTAMP_END', 'datetime', 'date', 'year', 'month', 'day', 'hour', 'GPP_NT_VUT_REF']
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    if not r.filename or type(r.filename) != type(""):
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue

    local_filename = tmp_dir + "\\" + r.filename
    site_df = pd.read_csv(local_filename, usecols=features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    
    if site_df.isna().sum().sum() != 0:
        print(f"\n{r.site_id}")
        print(f'{site_df.isna().sum()}\n')
        site_df.dropna(inplace=True)
    
    site_gap_df = get_gaps(r.site_id, site_df)
    if type(gap_df) == type(None):
        gap_df = site_gap_df
    else:
        gap_df = pd.concat([gap_df, site_gap_df])

# Save site_data checkpoint
gap_df.to_csv(data_dir + "gold-site-gap-intervals.csv");


ERROR: US-Ne2 is mssing hourly data.

ERROR: US-MMS is mssing hourly data.

US-Me2
TIMESTAMP_START        0
TIMESTAMP_END          0
GPP_NT_VUT_REF     16896
datetime               0
year                   0
month                  0
day                    0
hour                   0
SITE_ID                0
date                   0
dtype: int64


ES-LJu
TIMESTAMP_START       0
TIMESTAMP_END         0
GPP_NT_VUT_REF     8880
datetime              0
year                  0
month                 0
day                   0
hour                  0
SITE_ID               0
date                  0
dtype: int64



# Plot Gaps

In [14]:
site_metadata_df = pd.read_csv(site_metadata_filename, usecols = ['start_time', 'end_time', 'site_id', 'IGBP'])
site_metadata_df['start_time'] = pd.to_datetime(site_metadata_df['start_time'])
site_metadata_df['end_time'] = pd.to_datetime(site_metadata_df['end_time'])
site_metadata_df= site_metadata_df.loc[site_metadata_df['site_id'].isin(target_sites)]

In [67]:
fig = px.timeline(site_metadata_df, x_start="start_time", x_end="end_time", y="site_id", color="IGBP")
fig.update_yaxes(title_text = "", title_standoff = 0,
                 showticklabels=True,
                 autorange="reversed") # otherwise tasks are listed from the bottom up
fig.update_layout(title={'text': "Record Timeline per Site",
                         'y':0.95,'x':0.5},
                  height = 500, width = 800,
                 template='plotly_white')
newnames = {
    'CRO': 'Croplands',
    'DBF': 'Deciduous Needleleaf Forests',
    'EBF': 'Evergreen Broadleaf Forests', 
    'ENF': 'Evergreen Needleleaf Forests',
    'GRA': 'Grasslands',
    'MF' : 'Mixed Forests',
    'OSH': 'Open Shrublands',
    'WSA': 'Woody Savannas'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )

fig.show()

In [24]:
sub_df = site_metadata_df[['site_id', 'IGBP']]
sub_df.columns

Index(['site_id', 'IGBP'], dtype='object')

In [20]:
intervals_df.columns

Index(['start', 'end', 'site_id'], dtype='object')

In [70]:
plot_df = intervals_df.merge(sub_df, left_on = 'site_id', right_on = 'site_id')
plot_df[plot_df['site_id'] == 'ES-LJu']

Unnamed: 0,start,end,site_id,IGBP
102,2005-01-01,2005-01-06 23:30:00,ES-LJu,OSH
103,2005-06-18,2005-07-30 23:30:00,ES-LJu,OSH
104,2005-08-11,2006-03-24 23:30:00,ES-LJu,OSH
105,2006-04-06,2006-07-15 23:30:00,ES-LJu,OSH
106,2006-07-28,2006-11-05 23:30:00,ES-LJu,OSH
107,2006-12-21,2007-01-11 23:30:00,ES-LJu,OSH
108,2007-02-16,2007-10-30 23:30:00,ES-LJu,OSH
109,2007-11-22,2007-11-29 23:30:00,ES-LJu,OSH
110,2007-12-24,2008-01-15 23:30:00,ES-LJu,OSH
111,2008-02-18,2008-11-20 23:30:00,ES-LJu,OSH


In [66]:
plot_df = intervals_df.merge(sub_df, left_on = 'site_id', right_on = 'site_id')
fig = px.timeline(plot_df, x_start="start", x_end="end", y="site_id", color="IGBP")
fig.update_yaxes(title_text = "", title_standoff = 0,
                 showticklabels=True,
                 autorange="reversed") # otherwise tasks are listed from the bottom up
fig.update_layout(title={'text': "Recorded Timeline per Site",
                         'y':0.95,'x':0.5},
                  height = 500, width = 800,
                  legend_title_text= "IGBP<br>Classification",
                  legend_orientation="v",
                  template='plotly_white')

newnames = {
    'CRO': 'Croplands',
    'DBF': 'Deciduous Needleleaf Forests',
    'EBF': 'Evergreen Broadleaf Forests', 
    'ENF': 'Evergreen Needleleaf Forests',
    'GRA': 'Grasslands',
    'MF' : 'Mixed Forests',
    'OSH': 'Open Shrublands',
    'WSA': 'Woody Savannas'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )

fig.show()