In [None]:
import pandas as pd
import os
import glob
import numpy as np

In [None]:
def _build_series_frame(srs, label, earliest_time):
    """Expand one hourly price series into a long-format feature frame.

    Args:
        srs: Hourly series indexed by timestamp, with NaN where there is no
            observation.
        label: Column label of the series; written to ``id`` and
            ``categorical_id``.
        earliest_time: Timestamp used as the origin for ``t`` and
            ``days_from_start``.

    Returns:
        A DataFrame restricted to the span between the first and last
        observation (gaps inside the span filled with 0.), or ``None`` when
        the series has no observation at all.
    """
    # Same bounds as min/max of the index after ffill/bfill + dropna, but
    # without the deprecated ``fillna(method=...)`` and without raising on an
    # all-NaN series.
    start_date = srs.first_valid_index()
    end_date = srs.last_valid_index()
    if start_date is None:
        return None

    active_range = (srs.index >= start_date) & (srs.index <= end_date)
    srs = srs[active_range].fillna(0.)

    date = srs.index
    elapsed = date - earliest_time

    tmp = pd.DataFrame({'closing_price': srs})
    # Whole days are converted to hours and added to the intra-day remainder.
    tmp['t'] = elapsed.seconds / 60 / 60 + elapsed.days * 24
    tmp['days_from_start'] = elapsed.days
    tmp['categorical_id'] = label
    tmp['date'] = date
    tmp['id'] = label
    tmp['hour'] = date.hour
    tmp['day'] = date.day
    tmp['day_of_week'] = date.dayofweek
    tmp['month'] = date.month
    return tmp


def standarize_pricing(path="/data/raw"):
    """Load raw pricing parquet files and reshape them into a long hourly table.

    Every ``*.parquet.gzip`` file under ``path`` is read and concatenated,
    exact duplicate rows are dropped, and the ``close`` column is pivoted into
    one series per day of week. Each series is resampled to hourly means,
    zeros are treated as missing, and the series is trimmed to the span
    between its first and last observation with remaining gaps filled by 0.

    Args:
        path: Directory containing the ``*.parquet.gzip`` files. Each file
            must provide ``time`` and ``close`` columns.

    Returns:
        A DataFrame with one row per (series, hour) and the columns
        ``closing_price``, ``t``, ``days_from_start``, ``categorical_id``,
        ``date``, ``id``, ``hour``, ``day``, ``day_of_week``, ``month``,
        ``hours_from_start``, ``categorical_day_of_week`` and
        ``categorical_hour``.

    Raises:
        ValueError: If no matching file exists under ``path``, if no series
            contains any observation, or if the same timestamp appears in
            more than one distinct row (the pivot cannot reshape it).
    """
    # Sorted so the load order does not depend on filesystem enumeration.
    parquet_files = sorted(glob.glob(os.path.join(path, "*.parquet.gzip")))
    if not parquet_files:
        raise ValueError(
            "No '*.parquet.gzip' files found in {!r}".format(path))

    raw = pd.concat(
        [pd.read_parquet(file) for file in parquet_files],
        ignore_index=True,
    ).drop_duplicates()

    times = pd.to_datetime(raw["time"])
    prices = (
        raw.assign(time=times, dayofweek=times.dt.dayofweek)
        .set_index("time")
        .sort_index()
        .pivot(columns="dayofweek", values="close")
    )

    # Zeros are mapped to NaN so they do not count as observations when the
    # start and end dates of each series are determined.
    hourly = prices.resample('1h').mean().replace(0., np.nan)
    earliest_time = hourly.index.min()

    frames = []
    for label in hourly:
        print('Processing {}'.format(label))
        frame = _build_series_frame(hourly[label], label, earliest_time)
        if frame is not None:
            frames.append(frame)

    if not frames:
        raise ValueError(
            "No series with observations found in {!r}".format(path))

    output = pd.concat(frames, axis=0, join='outer').reset_index(drop=True)

    output['categorical_id'] = output['id'].copy()
    output['hours_from_start'] = output['t']
    output['categorical_day_of_week'] = output['day_of_week'].copy()
    output['categorical_hour'] = output['hour'].copy()
    return output

In [None]:
# Build the standardized hourly pricing table from the function's default
# raw-data directory.
df = standarize_pricing()

In [None]:
# Display summary statistics for the standardized pricing frame.
df.describe()