In [1]:
import pandas as pd 
import numpy as np 

In [2]:
def read_data(input_path = 'data/'):
    """
    Loads the transactions CSV into a DataFrame.

    Parameters:
    -----------
    - input_path: str or path-like, default = 'data/'.
        Directory that contains 'clean_transactions.csv'. A trailing
        path separator is optional.

    Returns:
    --------
    - Pandas DataFrame read from 'clean_transactions.csv', with the
      'timestamp' column parsed as datetimes.
    """
    # Local import keeps this block self-contained; os.path.join inserts the
    # separator only when it is missing, so 'data' and 'data/' both work.
    import os

    csv_path = os.path.join(input_path, 'clean_transactions.csv')
    return pd.read_csv(csv_path, parse_dates = ['timestamp'])

In [3]:
def preprocess_data(input_path):
    """
    Reads the transactions and adds calendar columns derived from 'timestamp'.

    Parameters:
    -----------
    - input_path: str,
        Directory passed through to `read_data`.

    Returns:
    --------
    - Pandas DataFrame with the extra columns 'year', 'month', 'day', 'hour',
      'dayofweek' and 'weekday', restricted to rows whose hour is not one of
      0, 1, 2, 3 or 23. The original row index is kept (not reset).
    """
    df = read_data(input_path)

    # Time variables
    timestamp = df.timestamp.dt
    df['year'] = timestamp.year
    df['month'] = timestamp.month
    df['day'] = timestamp.day
    df['hour'] = timestamp.hour
    df['dayofweek'] = timestamp.dayofweek
    # dayofweek runs from 0 (Monday) to 6 (Sunday), so <= 4 marks Mon-Fri.
    # This must use dayofweek, not the day of the month.
    df['weekday'] = (df.dayofweek <= 4).astype(int)

    # Cleaning operation times: drop rows recorded at hours 0-3 and 23
    # (presumably outside service hours -- confirm against the data source).
    return df[~df.hour.isin([0,1,2,3,23])]



In [22]:
def aggreagtion(df, aggregation = None):
    """
    Aggregates transactions by the given aggregation parameter.

    Parameters:
    -----------
    - df: Pandas DataFrame,
        Transactions by station. Must contain the calendar columns used for
        grouping ('year', 'month' and, depending on `aggregation`, 'day' and
        'hour').
    - aggregation: str, default = None.
        Aggregation interval. If None, the rows are returned unaggregated
        (every 15 mins, per the data source) with a fresh 0..n-1 index.
        One of ['hour','day','month']

    Returns:
    --------
    - Pandas DataFrame with a 0..n-1 index. When aggregating, there is one row
      per group, sorted by the grouping columns, holding the sum of every
      numeric non-grouping column; non-numeric columns (e.g. 'timestamp')
      are dropped.

    Raises:
    -------
    - ValueError: if `aggregation` is not None, 'hour', 'day' or 'month'.
    """

    if aggregation is None:
        return df.reset_index(drop = True)

    groupby_options = {
        'hour': ['year', 'month', 'day', 'hour'],
        'day': ['year', 'month', 'day'],
        'month': ['year', 'month'],
    }
    if aggregation not in groupby_options:
        raise ValueError(
            "aggregation must be None or one of ['hour','day','month'], "
            "got {!r}".format(aggregation)
        )

    # numeric_only=True keeps datetime columns such as 'timestamp' out of the
    # sum; without it recent pandas versions raise TypeError on them.
    # reset_index() turns the group keys back into columns and already
    # produces a 0..n-1 index.
    grouped = df.groupby(groupby_options[aggregation]).sum(numeric_only = True)
    return grouped.reset_index()

In [37]:
def _first_index_for_date(df, date, use_day):
    """Return the index label of the first row of `df` matching `date`.

    `date` is a (year, month, day) tuple; the day is compared only when
    `use_day` is true. Raises IndexError when no row matches.
    """
    mask = (df.year == date[0]) & (df.month == date[1])
    if use_day:
        mask &= df.day == date[2]

    matches = df.index[mask]
    if len(matches) == 0:
        raise IndexError(
            "no rows found for date {!r} in the transactions data".format(date)
        )
    return matches[0]


def split_data(input_path, 
               train_date = (2018, 8, 1), 
               covid_date = (2020, 3, 1), 
               aggreagation = None):
    """
    Loads, aggregates and splits the transactions into three consecutive
    segments separated by two boundary dates.

    Parameters:
    -----------
    - input_path: str,
        Directory passed through to `preprocess_data`.
    - train_date: tuple (year, month, day), default = (2018, 8, 1).
        First boundary; rows before its first matching row form the first
        segment.
    - covid_date: tuple (year, month, day), default = (2020, 3, 1).
        Second boundary; rows from its first matching row onwards form the
        last segment.
    - aggreagation: str, default = None.
        Passed to `aggreagtion`. With 'month' the day component of both
        dates is ignored.

    Returns:
    --------
    - (train_df, pre_covid_df, post_covid_df): rows before `train_date`,
      rows from `train_date` up to (excluding) `covid_date`, and rows from
      `covid_date` onwards.

    Raises:
    -------
    - IndexError: if either boundary date has no matching row.
    """
    df = preprocess_data(input_path)
    df = aggreagtion(df, aggregation = aggreagation)

    # Monthly rows represent whole months, so only year and month identify
    # the boundary rows there.
    use_day = aggreagation != 'month'
    train_date_index = _first_index_for_date(df, train_date, use_day)
    pre_covid_index = _first_index_for_date(df, covid_date, use_day)

    # aggreagtion() returns a 0..n-1 index, so the labels found above are
    # also row positions and can be used for positional slicing.
    train_df = df.iloc[0:train_date_index]
    pre_covid_df = df.iloc[train_date_index:pre_covid_index]
    post_covid_df = df.iloc[pre_covid_index:]
    return train_df, pre_covid_df, post_covid_df

In [42]:
# a, b, c = split_data('data/', aggreagation = 'hour')