In [324]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import datetime
import time
import os
import locale
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, Normalizer, OneHotEncoder, minmax_scale

In [285]:
PATH = './data_files/'

# Collect every CSV file that sits directly inside PATH.
# - endswith('.csv') instead of a substring test, so names such as
#   'x.csv.bak' are not picked up by accident.
# - os.listdir() returns entries in arbitrary order; sorting keeps the load
#   order (and therefore the row order after concatenation) reproducible.
files = sorted(
    os.path.join(PATH, name)
    for name in os.listdir(PATH)
    if name.endswith('.csv')
)

In [256]:
def transform_timestamp(df, col_name):
    """
    Derive date/year/month/day columns from a timestamp column.

    The timestamp column is parsed once (vectorised) with the fixed format
    '%Y-%m-%d %H:%M:%S'; the calendar components are then read from the
    parsed values instead of re-parsing every string for each component.

    Args:
        df: dataframe holding the timestamp column; it is modified in place
            (columns 'date', 'year', 'month' and 'day' are added/overwritten)
        col_name: column of original dataframe based on which to infer dates

    Returns:
        The same dataframe object that was passed in, with the new columns.

    Raises:
        ValueError: if a value does not match the expected timestamp format.
    """
    parsed = pd.to_datetime(df[col_name], format='%Y-%m-%d %H:%M:%S')

    df['date'] = parsed
    df['year'] = parsed.dt.year
    df['month'] = parsed.dt.month
    df['day'] = parsed.dt.day

    return df

In [319]:
def numerical_to_float(df, cols):
    """
    Cast the given columns of a dataframe to float.

    A plain cast is attempted first. When that fails with a ValueError the
    values are turned into strings, every ',' is replaced by '.', and the
    cast is repeated (some files use a comma as decimal separator).

    Args:
        df: dataframe to convert; the listed columns are overwritten in place
        cols: columns to apply the dtype change to

    Returns:
        The same dataframe object that was passed in.
    """
    def _comma_to_dot(value):
        # normalise the decimal separator of a single cell
        return str(value).replace(',', '.')

    for name in cols:
        key = f'{name}'
        column = df[key]
        try:
            df[key] = column.astype(dtype=float)
        except ValueError:
            # direct cast failed -> assume comma decimals, fix them and retry
            df[key] = column.apply(_comma_to_dot)
            df[key] = df[key].astype(dtype=float)

    return df

In [341]:
data = []

# transform numerical values to float
cols = ['H_orig', 'LE_orig', 'ET_orig', 'CO2', 'H2O', 'NEE_orig', 'Reco', 'GPP_f', 'Ustar']
# drop unnecessary columns
drop = ['TIMESTAMP_START', 'TIMESTAMP_MITTE', 'TIMESTAMP_ENDE', 'H_f', 'LE_f', 'ET_f', 'NEE_f']

for f in tqdm(files):
    # Files use either ',' or ';' as field separator. A ';'-separated file
    # does not always raise ParserError when read with ',' (it can come back
    # as a single wide column), so additionally check that the expected
    # timestamp column was actually found before accepting the ',' parse.
    try:
        df = pd.read_csv(f, sep=',')
    except pd.errors.ParserError:
        df = pd.read_csv(f, sep=';')
    else:
        if 'TIMESTAMP_START' not in df.columns:
            df = pd.read_csv(f, sep=';')

    # remove the row with index label 0, i.e. the first row below the header
    # NOTE(review): presumably a units/description row - confirm against the raw files
    df = df.drop(0)

    # add location based on file name (only the name, not the directory part)
    df['location'] = 'BG' if 'BG' in os.path.basename(f) else 'GW'

    df = transform_timestamp(df, 'TIMESTAMP_START')
    df = numerical_to_float(df, cols)
    df = df.drop(columns=drop)

    # drop any row containing NA values
    len_before = len(df)
    df = df.dropna(axis=0, how='any', ignore_index=True)
    # number of rows removed for this file; after the loop it holds the
    # count of the last file only
    na_removed = len_before - len(df)

    data.append(df)

100%|██████████| 4/4 [00:03<00:00,  1.20it/s]


In [342]:
# Stack the per-file frames collected in `data` row-wise into one table;
# ignore_index=True discards the per-file indices and numbers rows 0..n-1.
data_final = pd.concat(data, axis=0, ignore_index=True)
# preview the first rows of the combined table
data_final.head()

Unnamed: 0,H_orig,LE_orig,ET_orig,CO2,H2O,NEE_orig,Reco,GPP_f,Ustar,location,date,year,month,day
0,51.9651,30.423,0.022018,433.89,6.8887,-3.5406,1.62886,5.1695,0.30053,BG,2023-02-16 12:30:00,2023,2,16
1,69.53238,39.713,0.028771,433.54,6.991,-2.9818,1.72713,4.709,0.19136,BG,2023-02-16 13:00:00,2023,2,16
2,71.31974,39.382,0.028549,433.9,6.9597,-3.9531,1.78504,5.7382,0.25479,BG,2023-02-16 13:30:00,2023,2,16
3,48.84439,32.021,0.023216,433.93,7.0648,-3.1875,1.7983,4.9858,0.18293,BG,2023-02-16 14:00:00,2023,2,16
4,43.00187,27.692,0.020084,434.25,7.269,-2.6255,1.82987,4.4554,0.19404,BG,2023-02-16 14:30:00,2023,2,16


In [343]:
# Descriptive statistics (count, mean, min, quartiles, max, std) of the
# columns, computed separately for each value of `location`.
data_final.groupby('location').describe()

Unnamed: 0_level_0,H_orig,H_orig,H_orig,H_orig,H_orig,H_orig,H_orig,H_orig,LE_orig,LE_orig,...,month,month,day,day,day,day,day,day,day,day
Unnamed: 0_level_1,count,mean,min,25%,50%,75%,max,std,count,mean,...,max,std,count,mean,min,25%,50%,75%,max,std
location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
BG,4311.0,32.097986,-84.92777,-11.88504,22.76819,71.217725,306.7473,54.572625,4311.0,77.430169,...,7.0,1.39874,4311.0,15.259801,1.0,7.0,15.0,23.0,31.0,8.970524
GW,5318.0,46.183921,-139.8708,-32.129437,-0.388105,112.886125,476.6478,106.310703,5318.0,57.773913,...,7.0,1.011647,5318.0,15.696502,1.0,8.0,15.0,23.0,31.0,8.77294


In [364]:
# Min-max scale every float64 column of data_final in place; columns of any
# other dtype are left untouched. Each column is scaled over all rows at
# once, i.e. the min/max are taken across both locations together.
# (The dtype selection avoids a loop variable named `type`, which would
# shadow the builtin for the rest of the kernel session.)
for col in data_final.select_dtypes(include='float64').columns:
    data_final[col] = minmax_scale(data_final[col])

data_final.head()

Unnamed: 0,H_orig,LE_orig,ET_orig,CO2,H2O,NEE_orig,Reco,GPP_f,Ustar,location,date,year,month,day
0,0.31116,0.174929,0.172086,0.476698,0.242068,0.525569,0.048847,0.448053,0.195518,BG,2023-02-16 12:30:00,2023,2,16
1,0.339654,0.192317,0.189252,0.474704,0.24626,0.53379,0.057604,0.441354,0.124494,BG,2023-02-16 13:00:00,2023,2,16
2,0.342553,0.191697,0.188686,0.476755,0.244977,0.5195,0.062764,0.456326,0.16576,BG,2023-02-16 13:30:00,2023,2,16
3,0.306098,0.17792,0.17513,0.476926,0.249284,0.530764,0.063945,0.445381,0.11901,BG,2023-02-16 14:00:00,2023,2,16
4,0.296621,0.169818,0.167168,0.478749,0.257652,0.539033,0.066758,0.437665,0.126238,BG,2023-02-16 14:30:00,2023,2,16


In [None]:
# TODO: one-hot encode `location`; handle the timestamp columns (date/year/month/day)