In [1]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Raise pandas' 'display.max_rows' option to 500 for this notebook session.
pd.set_option('display.max_rows', 500)

In [2]:
# Project root: four directory levels above the notebook's working directory.
# The previous version glued "..\\" straight onto os.getcwd() with no separator,
# so its first ".." only cancelled a bogus "<cwd>.." component, and its
# hard-coded backslashes are not path separators outside Windows. The location
# it resolved to on Windows is the one kept here.
root_dir = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 4)))
tmp_dir = os.path.join(root_dir, ".tmp")
# Keep the trailing separator: the file name below is appended to data_dir by
# plain string concatenation.
data_dir = os.path.join(root_dir, "data") + os.sep

site_metadata_filename = data_dir + "site-metadata.csv"

In [3]:
# "Golden" Sites
target_site = [
    "US-MMS",
    "US-Vcp",
    "FR-Pue",
    "CH-Lae",
    "US-Var",
    "US-Ne2",
    "ES-LJu",
    "US-Ton",
]

In [4]:
# Read the site metadata table and keep only the rows of the target sites
site_metadata_df = (
    pd.read_csv(site_metadata_filename)
    .loc[lambda df: df['site_id'].isin(target_site)]
)
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(8, 26)


Unnamed: 0,site_id,dataset,start_year,end_year,file,is_dup,IGBP,elevation,lat,long,...,c4_percent,filename,size,country,record_count,site_IGBP,site_koppen,start_time,end_time,recorded_day_count
67,FR-Pue,FLUXNET,2000,2014,FLX_FR-Pue_FLUXNET2015_FULLSET_MM_2000-2014_2-...,False,EBF,270.0,43.7413,3.5957,...,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv,109116169.0,FR,245760.0,EBF,Temperate,2000-07-26 00:00:00,2014-12-31 23:30:00,5120.0
119,US-Ne2,FLUXNET,2001,2013,FLX_US-Ne2_FLUXNET2015_FULLSET_MM_2001-2013_1-...,False,CRO,362.0,41.16487,-96.4701,...,48.91,,,,,,,,,
127,US-Ton,FLUXNET,2001,2014,FLX_US-Ton_FLUXNET2015_FULLSET_MM_2001-2014_1-...,False,WSA,177.0,38.4316,-120.96598,...,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv,103999932.0,US,230928.0,WSA,Temperate,2001-05-24 00:00:00,2014-12-31 23:30:00,4811.0
130,US-Var,FLUXNET,2000,2014,FLX_US-Var_FLUXNET2015_FULLSET_MM_2000-2014_1-...,False,GRA,129.0,38.4133,-120.9507,...,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv,110098318.0,US,245712.0,GRA,Temperate,2000-11-01 00:00:00,2014-12-31 23:30:00,5119.0
181,US-MMS,AmeriFlux,1999,2017,FLX_US-MMS_FLUXNET2015_FULLSET_MM_1999-2017_be...,True,DBF,275.0,39.3232,-86.4131,...,42.28,,,,,,,,,
206,US-Vcp,AmeriFlux,2007,2017,FLX_US-Vcp_FLUXNET2015_FULLSET_MM_2007-2017_be...,False,ENF,2542.0,35.8624,-106.5974,...,0.04,data_full_half_hourly_raw_v0_1_US-Vcp.csv,72934242.0,US,174528.0,ENF,Cold,2007-01-01 00:00:00,2017-12-31 23:30:00,3636.0
219,CH-Lae,ICOS2020,2004,2020,FLX_CH-Lae_FLUXNET2015_FULLSET_MM_2004-2020_be...,True,MF,689.0,47.47833,8.36439,...,0.0,data_full_half_hourly_raw_v0_1_CH-Lae.csv,116210397.0,CH,288384.0,MF,Cold,2004-04-09 00:00:00,2020-12-17 23:30:00,6008.0
244,ES-LJu,ICOS2020,2004,2020,FLX_ES-LJu_FLUXNET2015_FULLSET_MM_2004-2020_be...,True,OSH,1600.0,36.92659,-2.75212,...,0.0,data_full_half_hourly_raw_v0_1_ES-LJu.csv,111661016.0,ES,239616.0,OSH,Temperate,2004-05-26 00:00:00,2020-12-31 23:30:00,4992.0


In [5]:
def get_min_max(df):
    """Return the smallest and largest value of `df` as a (min, max) tuple.

    `df` is any object exposing `.min()` and `.max()`, e.g. a pandas Series.
    """
    lowest = df.min()
    highest = df.max()
    return lowest, highest

def get_min_max_datetime(df):
    """Return the earliest and latest timestamp of `df` as a (min, max) tuple.

    The values are passed through `pd.to_datetime` before being compared.
    """
    stamps = pd.to_datetime(df)
    return stamps.min(), stamps.max()

def is_leap_year(year):
    """Return True when `year` is a leap year in the Gregorian calendar.

    A year divisible by 4 is a leap year, except century years, which are
    leap years only when divisible by 400 (2000 is, 1900 and 2100 are not).
    The plain `year % 4` test used before got those century years wrong.
    """
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

# Process a sample file

In [6]:
# Use the last row of the target-site metadata as the sample site
r = site_metadata_df[['site_id','filename']].iloc[-1]
print(r.filename)
# os.path.join uses the platform's separator instead of a hard-coded backslash
local_filename = os.path.join(tmp_dir, r.filename)

data_full_half_hourly_raw_v0_1_ES-LJu.csv


In [7]:
# Columns to load from the site file
features = [
    'SITE_ID',
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'datetime', 'date', 'year', 'month', 'day', 'hour',
    'GPP_NT_VUT_REF',
]
# Load only those columns and parse the 'datetime' column into timestamps
site_df = pd.read_csv(local_filename, usecols=features).assign(
    datetime=lambda df: pd.to_datetime(df['datetime'])
)
print(f"size:{site_df.shape}")
site_df.head()

size:(239616, 10)


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,GPP_NT_VUT_REF,datetime,year,month,day,hour,SITE_ID,date
0,200405260000,200405260030,,2004-05-26 00:00:00,2004,5,26,0,ES-LJu,2004-05-26
1,200405260030,200405260100,,2004-05-26 00:30:00,2004,5,26,0,ES-LJu,2004-05-26
2,200405260100,200405260130,,2004-05-26 01:00:00,2004,5,26,1,ES-LJu,2004-05-26
3,200405260130,200405260200,,2004-05-26 01:30:00,2004,5,26,1,ES-LJu,2004-05-26
4,200405260200,200405260230,,2004-05-26 02:00:00,2004,5,26,2,ES-LJu,2004-05-26


In [8]:
# When any cell is missing, print the site id and the per-column NaN counts
if site_df.isna().values.any():
    print(f'{site_df.SITE_ID.iloc[0]}\n{site_df.isna().sum()}\n')

ES-LJu
TIMESTAMP_START       0
TIMESTAMP_END         0
GPP_NT_VUT_REF     8880
datetime              0
year                  0
month                 0
day                   0
hour                  0
SITE_ID               0
date                  0
dtype: int64



In [9]:
def has_full_month_record(year, month, actual_record_count):
    """Tell whether a month holds exactly 48 records for each of its days.

    Args:
        year, month: calendar month to check; only looked up when
            `actual_record_count` is non-zero.
        actual_record_count: number of records found for that month.

    Returns:
        True when the count equals days-in-month * 48, otherwise False
        (including when the count is 0).
    """
    if actual_record_count == 0:
        return False

    _, days_in_month = monthrange(year, month)
    return bool(actual_record_count == days_in_month * 48)

def has_full_day_record(year, month, day, actual_record_count):
    """Tell whether a day holds exactly 48 records.

    A day with some, but not 48, records is reported on stdout; a day with no
    records at all is rejected silently. `year` is accepted but not used.

    Returns:
        True when `actual_record_count` is 48, otherwise False.
    """
    expected_record_count = 48
    if actual_record_count == expected_record_count:
        return True

    # Only partially recorded days are worth a message
    if actual_record_count != 0:
        print(f"    {month}/{day}: expected-[{expected_record_count}], actual-[{actual_record_count}]")
    return False

def plot_month_records(site_id, year, month, df):
    """Draw a bar chart of 'GPP_NT_VUT_REF' against 'datetime' from `df`.

    The chart is titled '<site_id>:<year>.<month>' and sized 16x4.
    Nothing is returned.
    """
    chart_title = f'{site_id}:{year}.{month}'
    df.plot.bar(
        x='datetime',
        y='GPP_NT_VUT_REF',
        title=chart_title,
        figsize=(16, 4),
    )

In [10]:
# Expected rows per year: 2 records per hour, 24 hours per day
FULL_YEAR_DATA_COUNT = 365 * 24 * 2  # 17520 in a 365-day year
LEAP_YEAR_DATA_COUNT = 366 * 24 * 2  # 17568 in a 366-day year

def _day_before(year, month, day):
    """Return the calendar day preceding year/month/day as a 'Y/M/D' string."""
    if day > 1:
        return f"{year}/{month}/{day-1}"
    if month > 1:
        # First of a month: step back to the last day of the previous month
        return f"{year}/{month-1}/{monthrange(year, month-1)[1]}"
    # January 1st: step back to the end of the previous year
    return f"{year-1}/12/31"


def get_gaps(site_df):
    """Print, year by year, the date ranges for which a site has no full data.

    Every year between the smallest and largest value of the 'year' column is
    visited. A year whose row count equals the expected half-hourly count
    (FULL_YEAR_DATA_COUNT or LEAP_YEAR_DATA_COUNT) is skipped; any other year
    gets a summary line and is scanned month by month:

    * a month with no rows opens a gap (if none is open) on its 1st day;
    * a fully recorded month closes an open gap on the day before its 1st day;
    * a partially recorded month is scanned day by day, a day with 48 rows
      closing an open gap on the day before it and any other day opening one.

    An open gap is carried over into the following years until it is closed.

    Args:
        site_df: DataFrame providing the 'year', 'month', 'day', 'datetime'
            and 'date' columns.

    Returns:
        None. The report is written to stdout.
    """
    start_year, end_year = get_min_max(site_df['year'])
    gap_start = None
    for y in range(start_year, end_year+1):
        year_df = site_df.loc[site_df['year'] == y]

        start_date, end_date = get_min_max_datetime(year_df['datetime'])
        date_delta = (end_date - start_date)

        expected_count = LEAP_YEAR_DATA_COUNT if is_leap_year(y) else FULL_YEAR_DATA_COUNT
        if year_df.shape[0] != expected_count:
            print(f"{y}: {year_df.shape[0]} ({date_delta}) {'Leap' if is_leap_year(y) else ''}")
            for m in range(1,13):
                month_df = year_df.loc[year_df['month'] == m]
                if month_df.shape[0] == 0:
                    #print(f"  {m:2}: none")
                    if gap_start is None:
                        gap_start = f"{y}/{m}/1"
                elif has_full_month_record(y, m, month_df.shape[0]):
                    if gap_start is not None:
                        # The gap ends on the last day of the previous month.
                        # (The former `m-1 > 1` test sent February to the
                        # "previous year" branch and printed {y-1}/12/31.)
                        print(f"    gap:{gap_start} - {_day_before(y, m, 1)}")
                        gap_start = None
                else:
                    for d in range(1, monthrange(y, m)[1]+1):
                        day_df = month_df.loc[month_df['day'] == d]
                        if has_full_day_record(y, m, d, day_df.shape[0]):
                            if gap_start is not None:
                                # The gap ends on the previous calendar day,
                                # which for d == 1 lies in the previous month
                                # (the former `{d-1}` printed day 0 there).
                                print(f"    gap: {gap_start} - {_day_before(y, m, d)}")
                                gap_start = None
                        elif gap_start is None:
                            gap_start = f"{y}/{m}/{d}"
        elif gap_start is not None:
            # A complete year closes a gap carried over from the year before
            print(f"    gap:{gap_start} - {y-1}/12/31")
            gap_start = None

    if gap_start is not None:
        # NOTE(review): the end printed here is the largest value of the 'date'
        # column, in that column's own format; confirm this is the intended
        # end for a gap that is still open after the last year.
        print(f"    gap:{gap_start} - {site_df['date'].max()}")

In [11]:
# Print the gap report for the frame currently bound to site_df.
get_gaps(site_df)

2004: 8880 (219 days 23:30:00) Leap
    gap: 2004/1/1 - 2004/5/25
    gap: 2004/7/15 - 2004/7/29
    gap: 2004/12/5 - 2004/12/24
2005: 9216 (364 days 23:30:00) 
    gap: 2005/1/7 - 2005/6/17
    gap: 2005/7/31 - 2005/8/10
2006: 14208 (364 days 23:30:00) 
    gap: 2006/3/25 - 2006/4/5
    gap: 2006/7/16 - 2006/7/27
    gap: 2006/11/6 - 2006/12/20
2007: 13632 (364 days 23:30:00) 
    gap: 2007/1/12 - 2007/2/15
    gap: 2007/10/31 - 2007/11/21
    gap: 2007/11/30 - 2007/12/23
2008: 14976 (365 days 23:30:00) Leap
    gap: 2008/1/16 - 2008/2/17
    gap: 2008/11/21 - 2008/12/11
2009: 17040 (364 days 23:30:00) 
    gap: 2009/11/26 - 2009/12/5
2010: 15216 (364 days 23:30:00) 
    gap: 2010/1/29 - 2010/3/17
2012: 17520 (364 days 23:30:00) Leap
2013: 17088 (355 days 23:30:00) 
    gap: 2012/12/31 - 2013/1/9
2015: 15072 (364 days 23:30:00) 
    gap: 2015/2/3 - 2015/3/25
2016: 16128 (335 days 23:30:00) Leap
2017: 9504 (277 days 23:30:00) 
    gap: 2016/12/2 - 2017/3/28
    gap: 2017/5/9 - 2017/5/2

# Process all "golden files"

In [12]:
# Iterate through sites
# NOTE: this loop rebinds the notebook-level names `local_filename` and
# `site_df` on every iteration.
features = ['SITE_ID','TIMESTAMP_START', 'TIMESTAMP_END', 'datetime', 'date', 'year', 'month', 'day', 'hour', 'GPP_NT_VUT_REF']
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    # An empty `filename` cell is not a string once loaded (the metadata
    # table shows sites with no file name), so only non-empty strings are
    # treated as loadable files.
    if not isinstance(r.filename, str) or not r.filename:
        print(f'\nERROR: {r.site_id} is missing hourly data.')
        continue

    # os.path.join uses the platform's separator instead of a hard-coded backslash
    local_filename = os.path.join(tmp_dir, r.filename)
    site_df = pd.read_csv(local_filename, usecols=features)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])

    print(f"\n{r.site_id}")
    na_counts = site_df.isna().sum()
    if na_counts.sum() != 0:
        print(f'{na_counts}\n')
        # Rows with any missing value are dropped, so they are absent from
        # the frame handed to get_gaps below.
        site_df = site_df.dropna()

    get_gaps(site_df)


FR-Pue
2000: 6432 (158 days 23:30:00) Leap
    gap: 2000/1/1 - 2000/7/25
    gap: 2000/9/12 - 2000/9/23
    gap: 2000/11/16 - 2000/11/28
2009: 16800 (364 days 23:30:00) 
    gap: 2009/8/18 - 2009/9/1
2010: 16416 (364 days 23:30:00) 
    gap: 2010/6/29 - 2010/7/8
    gap: 2010/12/6 - 2010/12/18
2011: 16224 (364 days 23:30:00) 
    gap:2011/8/5 - 2011/8/31
2012: 14592 (365 days 23:30:00) Leap
    gap: 2012/1/9 - 2012/3/10

ERROR: US-Ne2 is mssing hourly data.

US-Ton
2001: 10656 (221 days 23:30:00) 
    gap: 2001/1/1 - 2001/5/23
2004: 15408 (365 days 23:30:00) Leap
    gap: 2004/5/23 - 2004/7/6
2007: 15168 (364 days 23:30:00) 
    gap: 2007/1/15 - 2007/2/22
    gap: 2007/6/3 - 2007/6/12
2010: 17136 (356 days 23:30:00) 
2011: 16368 (361 days 23:30:00) 
    gap: 2010/12/24 - 2011/1/3
    gap: 2011/1/11 - 2011/1/21
    gap: 2011/4/3 - 2011/4/12
2013: 15936 (364 days 23:30:00) 
    gap: 2013/4/25 - 2013/5/27

US-Var
2000: 2928 (60 days 23:30:00) Leap
    gap:2000/1/1 - 2000/10/31
2005: 1564