In [None]:
# Bar length in minutes -> frequency string of the form "<minutes>T".
# NOTE(review): these look like pandas offset aliases ("T" = minutes); newer
# pandas releases prefer "min" -- confirm against the code that consumes this
# mapping before changing the values.
minute_frequencies_conventions = {5: '5T', 15: '15T'}

In [1]:
def load_stock_data(path, freq):
    """Load an intraday price CSV and derive the feature columns used downstream.

    Parameters
    ----------
    path : str or path-like
        Semicolon-separated CSV with a header row. The first column is parsed
        as timestamps and renamed to ``datetime``; a ``close`` column is
        required for the moving averages.
    freq : int
        Bar length in minutes (e.g. 5 or 15). It sizes the rolling windows and
        determines which minute marks the last bar of the 15:xx hour.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus ``is_open``, ``is_close``, ``sma_2h``,
        ``sma_8h`` and ``points_from_start`` (a copy of the row index), with
        float64 columns downcast to float32 and gaps forward- then
        backward-filled.

    Raises
    ------
    AssertionError
        If NaNs remain after filling, e.g. when the file has fewer rows than
        the longest rolling window so an SMA column is entirely NaN.
    """
    stock = pd.read_csv(path, header=0, sep=';', parse_dates=[0])
    stock = stock.rename(columns={stock.columns[0]: "datetime"})

    hour = stock['datetime'].dt.hour
    minute = stock['datetime'].dt.minute

    # Flag only the 09:30 bar. The previous `hour >= 9` test also matched
    # 10:30, 11:30, ... so the "open" marker fired on every half-hour bar.
    stock['is_open'] = ((hour == 9) & (minute == 30)).astype('float32')
    # Last bar of the 15:xx hour, e.g. 15:45 for 15-minute bars.
    stock['is_close'] = ((hour == 15) & (minute == (60 - freq))).astype('float32')

    # Simple moving averages of the close over 2 h and 8 h worth of bars.
    # The column names are relied upon elsewhere, so the hours stay fixed.
    for sma_hours in (2, 8):
        window = int(sma_hours * 60 / freq)
        stock[f'sma_{sma_hours}h'] = stock['close'].rolling(window=window).mean()

    stock['points_from_start'] = stock.index

    float64_cols = stock.select_dtypes(include='float64').columns
    stock[float64_cols] = stock[float64_cols].astype('float32')

    # ffill()/bfill() replace the deprecated interpolate(method="ffill"/"bfill").
    # The backward fill covers the leading NaNs left by the rolling windows.
    stock = stock.ffill().bfill()

    assert stock.isna().sum().sum() == 0, "NaN values remain after forward/backward fill"

    return stock

In [7]:
def separate(stock):
    """Split a prepared stock frame into model inputs ``X`` and target ``y``.

    Both outputs keep ``points_from_start`` so they can later be indexed by
    it. ``y`` holds the ``close`` price; ``X`` holds the price/volume columns,
    the two moving averages and the open/close flags.

    Returns a dict with keys ``"X"`` and ``"y"``. Raises ``AssertionError`` if
    either selection contains NaN, and ``KeyError`` if a column is missing.
    """
    target_columns = ['close', 'points_from_start']
    feature_columns = [
        'points_from_start',
        'close', 'open', 'low', 'high',
        'vol', 'spread',
        'sma_2h', 'sma_8h',
        'is_open', 'is_close',
    ]

    target = stock[target_columns]
    features = stock[feature_columns]

    # Fail fast: downstream steps expect gap-free data.
    for frame in (target, features):
        assert frame.isna().sum().sum() == 0

    return {"X": features, "y": target}

In [9]:
def split_data(X, y):
    """Split features and targets in row order into train / validation / test.

    The first 80% of rows become the training set; the remainder is halved
    into validation and test. ``shuffle=False`` is passed on both splits so
    the original row order is kept.

    Returns a dict keyed ``X_train``, ``X_val``, ``X_test``, ``y_train``,
    ``y_val``, ``y_test``. Raises ``AssertionError`` if any split contains NaN.
    """
    train_fraction = 0.8

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_fraction, shuffle=False
    )
    # Halve the held-out tail: first half validates, second half tests.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, train_size=0.5, shuffle=False
    )

    splits = {
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
    }

    for name in ("X_val", "X_test", "X_train", "y_val", "y_test", "y_train"):
        assert splits[name].isna().sum().sum() == 0

    return splits

In [None]:
def transform_splits_to_time_series(X_train, X_val, X_test, y_train, y_val, y_test):
    """Convert each train/val/test DataFrame into a ``TimeSeries``.

    Every frame is passed to ``TimeSeries.from_dataframe`` with its
    ``points_from_start`` column as the time column.

    Returns a dict keyed ``ts_X_train``, ``ts_X_val``, ``ts_X_test``,
    ``ts_y_train``, ``ts_y_val``, ``ts_y_test``.
    """
    def as_series(frame):
        return TimeSeries.from_dataframe(frame, time_col='points_from_start')

    # For each split the target is converted before its features.
    ts_y_train, ts_X_train = as_series(y_train), as_series(X_train)
    ts_y_val, ts_X_val = as_series(y_val), as_series(X_val)
    ts_y_test, ts_X_test = as_series(y_test), as_series(X_test)

    return dict(
        ts_X_train=ts_X_train,
        ts_X_val=ts_X_val,
        ts_X_test=ts_X_test,
        ts_y_train=ts_y_train,
        ts_y_val=ts_y_val,
        ts_y_test=ts_y_test,
    )


def transform_to_time_series(X, y):
    """Convert the unsplit feature and target frames into ``TimeSeries``.

    Both frames use their ``points_from_start`` column as the time column.
    Returns a dict with keys ``ts_X_full`` and ``ts_y_full``.
    """
    time_column = 'points_from_start'
    return {
        "ts_X_full": TimeSeries.from_dataframe(X, time_col=time_column),
        "ts_y_full": TimeSeries.from_dataframe(y, time_col=time_column),
    }

In [2]:
def scale_splits_data(ts_X_train, ts_X_val, ts_X_test, ts_y_train, ts_y_val, ts_y_test):
    """Scale the split series with scalers fitted on the training split only.

    One ``Scaler`` is created for the features and one for the target. Each is
    fitted on its training series and then applied unchanged to the
    validation and test series. All scaled series are cast to float32.

    Returns a dict with the six scaled series (``scaled_X_train`` ...
    ``scaled_y_test``) plus the fitted ``scaler_X`` and ``scaler_y``.
    """
    scaler_X, scaler_y = Scaler(), Scaler()

    # Dict literals evaluate top to bottom, so each fit_transform on the
    # training series runs before the transform calls that depend on it.
    scaled = {
        "scaled_X_train": scaler_X.fit_transform(ts_X_train),
        "scaled_X_val": scaler_X.transform(ts_X_val),
        "scaled_X_test": scaler_X.transform(ts_X_test),
        "scaled_y_train": scaler_y.fit_transform(ts_y_train),
        "scaled_y_val": scaler_y.transform(ts_y_val),
        "scaled_y_test": scaler_y.transform(ts_y_test),
    }

    result = {name: series.astype('float32') for name, series in scaled.items()}
    result["scaler_X"] = scaler_X
    result["scaler_y"] = scaler_y
    return result

def scale_full_data(ts_X_full, ts_y_full, scaler_X, scaler_y):
    """Scale the full (unsplit) feature and target series, cast to float32.

    ``fit_transform`` is called on the scalers that are passed in, so they are
    fitted on the full series here.
    NOTE(review): if these scalers were fitted earlier (e.g. on a training
    split), that fit is presumably replaced -- confirm this is intended.

    Returns a dict with keys ``scaled_X_full`` and ``scaled_y_full``.
    """
    jobs = (
        ("scaled_X_full", scaler_X, ts_X_full),
        ("scaled_y_full", scaler_y, ts_y_full),
    )

    scaled = {}
    for key, scaler, series in jobs:
        scaled[key] = scaler.fit_transform(series).astype('float32')
    return scaled