## CRDS Data Processing

In [1]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import DatetimeTickFormatter
output_notebook()

## Reading the data

In [2]:
def read_files(dir_name):
    """
    Collect the path of every file found under dir_name, recursively.

    Sub-directories are walked in sorted order and the files of each
    directory are listed in sorted order.

    Parameters: dir_name (str): root directory to walk
    Returns: list of file paths (str)
    """
    collected = []
    for root, subfolders, names in os.walk(dir_name):
        # Sorting in place steers the order in which os.walk descends.
        subfolders.sort()
        collected.extend(root + os.sep + name for name in sorted(names))
    return collected

In [3]:
def read_data(dir_name, mycols):
    """
    Return a dataframe with the concatenated data of every file that
    read_files finds under dir_name.

    The DATE and TIME columns are merged into a single DATE_TIME timestamp,
    which becomes the (datetime) index of the result.

    Parameters:
        dir_name (str): directory name
        mycols (list-like): selected columns; must include 'DATE' and 'TIME'

    Returns:
        pandas DataFrame indexed by DATE_TIME

    Raises:
        ValueError: if no file is found under dir_name
    """
    filenames = read_files(dir_name)
    if not filenames:
        # pd.concat would otherwise fail with "No objects to concatenate".
        raise ValueError(f"No data files found under {dir_name!r}")
    list_of_dfs = [pd.read_csv(filename,
                               sep=r'\s+',
                               usecols=mycols,
                               engine='python',
                               # Keep both parts as text so they can be
                               # concatenated before parsing.
                               dtype={'DATE': str, 'TIME': str})
                   for filename in tqdm(filenames)]
    df = pd.concat(list_of_dfs, ignore_index=True)
    # Combine the columns explicitly: the nested-list form of parse_dates
    # is deprecated in pandas 2.x.
    stamps = pd.to_datetime(df.pop('DATE') + ' ' + df.pop('TIME'))
    df.index = pd.DatetimeIndex(stamps, name='DATE_TIME')
    return df

In [4]:
# Columns read from each data file. read_data merges DATE and TIME into the
# DATE_TIME index. The order matters: later cells slice my_cols[4:] to select
# the CO2, CO2_dry, CH4, CH4_dry and H2O columns.
my_cols = ['DATE', 'TIME', 'ALARM_STATUS', 'CavityPressure', 'CO2', 'CO2_dry', 'CH4', 'CH4_dry', 'H2O']

# Load every file found under data/G2301 into a single dataframe.
df = read_data('data/G2301', my_cols)
df.head()


100%|██████████| 110/110 [00:08<00:00, 12.73it/s]


Unnamed: 0_level_0,ALARM_STATUS,CavityPressure,CH4,CH4_dry,CO2,CO2_dry,H2O
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01 00:52:25.726,0,140.004532,2.234628,2.239051,422.881053,423.673516,0.120068
2021-01-01 00:52:26.648,0,140.002756,2.234628,2.239051,423.131638,423.673516,0.120068
2021-01-01 00:52:27.154,0,140.001343,2.234628,2.238047,423.131638,423.922636,0.120121
2021-01-01 00:52:28.565,0,140.021159,2.234577,2.238047,423.131638,423.922636,0.120121
2021-01-01 00:52:29.464,0,139.999052,2.234577,2.238047,423.247641,423.922636,0.120121


## Apply logbook flags

In [10]:
def read_logbook(path):
    """
    Read events in the logbook file (.csv)
    Drop empty lines.

    Everything before the header line (the one starting with
    'Event,"Start Event') is skipped.

    Dataframe columns name: Event | Start_Event | End_Event | Flags

    Parameters: path of .csv logbook
    Returns: dataframe with the logbook data
    Raises: ValueError if the header line is not found in the file
    """
    # The header cell spans two physical lines, so the first of them ends
    # right after "Start Event".
    header_start = 'Event,"Start Event\n'
    # Read from the path argument (not a notebook global) and close the file.
    with open(path) as logbook:
        lines = logbook.readlines()
    try:
        lines_skip = lines.index(header_start)
    except ValueError:
        raise ValueError(f"Logbook header line not found in {path!r}") from None
    df = pd.read_csv(path, sep=',', skiprows=lines_skip)
    df = df.rename(columns={"Start Event\ndd/mm/aaaa hh:mm": "Start_Event",
                            "End Event\ndd/mm/aaaa hh:mm": "End_Event"})
    # Rows where every field is empty carry no event.
    df = df.dropna(how='all')
    return df

In [11]:
def drop_invalid_logbook(df):
    """
    Drop the logbook rows whose start or end datetime is invalid.

    Start_Event and End_Event are parsed with the format 'dd/mm/YYYY HH:MM';
    a row is kept only when both values parse. The input dataframe is left
    untouched. The index is reset, and the previous row labels are kept in
    an 'index' column.

    Parameters: logbook dataframe
    Returns: logbook dataframe with valid datetimes (Start_Event and
             End_Event converted to timestamps)
    """
    date_format = '%d/%m/%Y %H:%M'
    # errors='coerce' turns anything unparseable (bad text, missing value)
    # into NaT instead of raising, so every row is checked by its content
    # rather than by its label.
    start = pd.to_datetime(df['Start_Event'], format=date_format, errors='coerce')
    end = pd.to_datetime(df['End_Event'], format=date_format, errors='coerce')
    valid = start.notna() & end.notna()
    df = df.loc[valid].copy()
    df['Start_Event'] = start[valid]
    df['End_Event'] = end[valid]
    df = df.reset_index()
    return df

In [12]:
def insert_flag(df, df_logbook):
    """
    Insert column flag for manual control quality.

    Every row of df whose index lies between Start_Event and End_Event
    (both inclusive) of a logbook event receives that event's flag in the
    FLAGS column; the other rows keep NaN. When events overlap, the later
    logbook row wins. df is modified in place and also returned.

    Parameters: data dataframe (df), logbook dataframe (df_logbook)
    Returns: data dataframe (df) with flags
    """
    # Object dtype lets the column hold the flag values next to NaN without
    # an incompatible-dtype assignment into a float column.
    df['FLAGS'] = np.full(len(df), np.nan, dtype=object)
    # Iterate over the values themselves so the logbook index labels do not
    # have to be 0..n-1.
    events = zip(df_logbook['Start_Event'],
                 df_logbook['End_Event'],
                 df_logbook['Flags'])
    for start, end, flag in events:
        in_event = (df.index >= start) & (df.index <= end)
        df.loc[in_event, 'FLAGS'] = flag
    return df

In [None]:
# Location of the logbook CSV file.
path_logbook = "logbook/Logbook-IAG.csv"
# Load the logbook, drop the events without a valid start/end datetime,
# then add the FLAGS column to the measurement dataframe.
df_logbook = read_logbook(path_logbook)
df_logbook = drop_invalid_logbook(df_logbook)
df = insert_flag(df, df_logbook)

## Save 24 hour files

In [5]:
def save_24h(df, file_id, level):
    """
    Save 24-hour files

    One CSV file is written to the current working directory for each
    calendar date present in the index. Files are named
    '<file_id>-<YYYYMMDD>-<HHMMSS>Z-DataLog_User_<level>.csv', where the
    date and time are those of the first row of that day.

    Parameters:
        df (pandas DataFrame): dataframe with a datetime index
        file_id (str): analyzer serial number
        level (str): data processing level
    """
    # Group by full calendar date rather than day of year, so the same day
    # of two different years never ends up in one file.
    for _, df_24h in df.groupby(df.index.normalize(), sort=False):
        first = df_24h.index[0]
        file_name = (file_id + '-' + first.strftime('%Y%m%d')
                     + '-' + first.strftime('%H%M%S')
                     + 'Z-DataLog_User_' + level + '.csv')
        df_24h.to_csv(file_name)

In [6]:
# perf_counter measures wall-clock time. process_time only counts CPU time,
# so it leaves out the time spent waiting on the disk writes and
# under-reports what is printed as the elapsed time.
t = time.perf_counter()
save_24h(df, 'CFADS2502', 'level0')
print('elapsed_time: ', time.perf_counter() - t)

elapsed_time:  3.6329729509999993


## Resampling the data

*As recommended by the World Data Centre for Greenhouse Gases (WDCGG; WMO, 2012), we calculate the means using data from the nearest time aggregation level and not always using the raw data. This implies that raw data are used to calculate 1 min averages, which are then used to calculate hourly averages and so on. For each single averaged data point, we provide the number of data used to compute the average and the standard deviation. The measurement time associated with an average dataset corresponds to the beginning of the averaging period (e.g., the hourly means at 13:00 are calculated from the 1 min means from 13:00 to 13:59), which is also in line with the recommendation of WDCGG (WMO, 2012)* <br> (Hazan et al., 2016)

In [7]:
def resample_data(df, t, my_cols):
    """
    Resample the selected columns and return [mean, std, count] per period.

    Parameters:
        df (pandas DataFrame): dataframe with a datetime index
        t ('T', 'H', 'D') : minute, hour or day
        my_cols (list-like): selected columns

    Returns:
        pandas DataFrame with the means under the original column names,
        followed by the '<column>_std' and '<column>_count' columns.
    """
    bins = df[my_cols].resample(t)
    result = bins.mean()
    # Join the remaining statistics; the suffix separates them from the means.
    for suffix, stat in (('_std', bins.std()), ('_count', bins.count())):
        result = result.join(stat, rsuffix=suffix)
    return result

In [8]:
# 1-minute statistics computed from the raw data.
# my_cols[4:] skips DATE, TIME, ALARM_STATUS and CavityPressure and keeps
# CO2, CO2_dry, CH4, CH4_dry and H2O.
df_min = resample_data(df, 'T', my_cols[4:])
df_min.head()

Unnamed: 0_level_0,CO2,CO2_dry,CH4,CH4_dry,H2O,CO2_std,CO2_dry_std,CH4_std,CH4_dry_std,H2O_std,CO2_count,CO2_dry_count,CH4_count,CH4_dry_count,H2O_count
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-01 00:52:00,422.409071,423.233932,2.237005,2.240274,0.120947,0.569139,0.563847,0.001831,0.001805,0.000546,37,37,37,37,37
2021-01-01 00:53:00,421.508341,422.309664,2.243725,2.247031,0.121503,0.181379,0.184222,0.00176,0.001968,0.000823,63,63,63,63,63
2021-01-01 00:54:00,422.997355,423.772085,2.238836,2.242559,0.122019,1.142238,1.160781,0.004424,0.004496,0.000838,63,63,63,63,63
2021-01-01 00:55:00,422.001819,422.83108,2.235708,2.239153,0.122336,0.381906,0.412565,0.001812,0.001785,0.000753,64,64,64,64,64
2021-01-01 00:56:00,422.534788,423.33585,2.235879,2.239439,0.122829,0.957517,0.963247,0.001355,0.00139,0.000691,63,63,63,63,63


In [9]:
# Hourly statistics computed from the 1-minute means: selecting my_cols[4:]
# from df_min picks the mean columns, not the *_std / *_count ones.
df_hour = resample_data(df_min, 'H', my_cols[4:])
df_hour.head()

Unnamed: 0_level_0,CO2,CO2_dry,CH4,CH4_dry,H2O,CO2_std,CO2_dry_std,CH4_std,CH4_dry_std,H2O_std,CO2_count,CO2_dry_count,CH4_count,CH4_dry_count,H2O_count
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-01 00:00:00,422.652342,423.448665,2.227509,2.230985,0.122187,0.719745,0.696725,0.015721,0.015772,0.000704,8,8,8,8,8
2021-01-01 01:00:00,422.132774,422.940155,2.311727,2.315429,0.121475,5.170297,5.145131,0.361725,0.361628,0.000503,60,60,60,60,60
2021-01-01 02:00:00,409.772798,410.507216,2.370333,2.373601,0.116543,15.73858,15.742236,0.378899,0.379133,0.004784,60,60,60,60,60
2021-01-01 03:00:00,423.405777,424.129288,2.104195,2.107388,0.10985,1.030447,1.041718,0.166563,0.166889,0.002793,60,60,60,60,60
2021-01-01 04:00:00,426.722403,427.502659,2.112177,2.11521,0.117636,2.287819,2.298517,0.068922,0.069096,0.00162,60,60,60,60,60


In [10]:
# Daily statistics computed from the hourly means: selecting my_cols[4:]
# from df_hour picks the mean columns, not the *_std / *_count ones.
df_day = resample_data(df_hour, 'D', my_cols[4:])
df_day.head()

Unnamed: 0_level_0,CO2,CO2_dry,CH4,CH4_dry,H2O,CO2_std,CO2_dry_std,CH4_std,CH4_dry_std,H2O_std,CO2_count,CO2_dry_count,CH4_count,CH4_dry_count,H2O_count
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-01,420.255265,421.045348,2.130653,2.133937,0.120832,8.193812,8.207892,0.148728,0.148903,0.002734,24,24,24,24,24
2021-01-02,411.667958,412.439483,1.915622,1.918561,0.120387,4.963992,4.974558,0.058931,0.059024,0.00248,24,24,24,24,24
2021-01-03,412.000238,412.764422,1.965266,1.96825,0.119153,5.568234,5.578429,0.121635,0.121826,0.002603,24,24,24,24,24
2021-01-04,414.231032,414.998467,1.948038,1.950986,0.119046,6.671857,6.678978,0.07701,0.077078,0.002793,24,24,24,24,24
2021-01-05,423.482775,424.26989,2.04718,2.050297,0.119429,9.756528,9.761519,0.087627,0.087778,0.003343,15,15,15,15,15


## Bokeh graph

In [11]:
def bokeh_graph(df, start_time, end_time, var):
    """
    Display a bokeh line graph of one variable for the selected period.

    Rows with start_time <= index < end_time are plotted (end_time is
    excluded).

    Parameters:
        df (pandas DataFrame): dataframe with a datetime index
        start_time (str) : 'yyyy-mm-dd hh:mm:ss'
        end_time (str) : 'yyyy-mm-dd hh:mm:ss'
        var (str): selected variable

    Returns:
        whatever bokeh's show() returns for the figure
    """
    in_period = (df.index >= start_time) & (df.index < end_time)
    df = df.loc[in_period]
    # width/height replace plot_width/plot_height, which were removed in
    # Bokeh 3.0.
    p = figure(x_axis_type="datetime", width=750, height=250, toolbar_location="above")
    p.line(df.index, df[var].values, line_width=1.5, color='#2ca02c')
    p.xaxis.axis_label = 'UTC'
    p.yaxis.axis_label = var
    return show(p)

In [12]:
# Plot the CO2 data from 2021-01-02 up to (but not including) 2021-01-04.
bokeh_graph(df, '2021-01-02', '2021-01-04', 'CO2')