In [None]:
#Last used/edited: 5/8/2023

In [None]:
# PURPOSE: 
# Generate Stationary Data

In [None]:
import pandas as pd
import os
import pickle
import datetime

In [None]:
#1. use the preprocessed data, resampled by frequency, to check for stationarity
#2. difference the pkl data by the order found in step 1
#3. fill the NaN values created by differencing (with 0.0)
#4. output files as stationary_FILE_NAME

In [None]:
# 1. use the 1D data for each freq and fit an ARIMA model to find the differencing order (d) needed for stationarity

# Load data
# The data directory lives next to (not inside) the current working directory.
BASE_PATH = os.path.dirname(os.getcwd())
DATA_DIR = 'data/'
# horizon = 'SCALP'
# horizon = 'SWING'
# horizon = 'POSITION'

# Candidate input files, grouped by horizon (pick one via FILE_NM below).
#DATA
#SCALP
# data_1d_SCALP_df0.csv
# data_1d_SCALP_df1.csv
# data_1d_SCALP_df2.csv

#SWING
# data_1d_SWING_freqIdx0_CHECKSTATIONARY.csv
# data_1d_SWING_freqIdx1_CHECKSTATIONARY.csv
# data_1d_SWING_freqIdx2_CHECKSTATIONARY.csv

#POSITION
# data_1d_POSITION_freqIdx0_CHECKSTATIONARY.csv
# data_1d_POSITION_freqIdx1_CHECKSTATIONARY.csv
# data_1d_POSITION_freqIdx2_CHECKSTATIONARY.csv

FILE_NM = 'data_1d_SWING_freqIdx1_CHECKSTATIONARY.csv'
DATA_PATH = os.path.join(BASE_PATH, DATA_DIR)
# Keep the file path under its own name so that `data` only ever refers to the
# DataFrame (previously the same name held the path string first).
csv_path = os.path.join(DATA_PATH, FILE_NM)

print('LOADING DATA')
# Read the CSV and keep only the four columns selected here.
data = pd.read_csv(csv_path, sep=",")[["prediction", "date", "close", "label"]]
print("Rows in df :", len(data))

In [None]:
# One-hot encode `label`; drop_first=True drops the first category's indicator,
# leaving `label_SHORT`, which is renamed back to `label`.
# dtype=int keeps the indicator numeric (0/1): pandas >= 2.0 would otherwise
# return booleans, which contradicts the encoding documented below.
df = pd.get_dummies(data, columns=['label'], drop_first=True, dtype=int)
df = df.rename(columns={"label_SHORT": "label"})
# LONG = 0, SHORT = 1

In [None]:
# Model inputs/outputs: one input feature column and one target column.
use_features = ['close'] # continuous input
target = ['label'] # target column

# Look-back window. Ideally derived from a statistical test, but the value
# below was simply chosen by hand.
n_steps = 30 #SWING, SCALP
# n_steps = 20 #POSITION

# forecasting horizon (other values tried: 2, 3, 4)
n_steps_ahead = 3

In [None]:
### Splitting the time series into training and testing sets
# Split the training and test set by using the first 80% of the time series and the remaining 
#20% for the test set. Note that the test set must be in the future of the training set 
# to avoid look-ahead bias. Also, random sampling of the data can not be used as this would eliminate 
# the auto-correlation structure.


# Make sure the splits are the same as 2D CNN 
# train_weight = 0.8
# split = int(len(df) * train_weight)
# int() truncates toward zero, which equals floor for this non-negative
# product, so the split index is the same as with np.floor -- but the cell no
# longer depends on numpy, which is not imported in the visible cells.
split = int(0.8 * len(df))

# First 80% of the rows (in time order) form the training set.
df_train = df[use_features].iloc[:split]
# df_test = df[use_features].iloc[split:] 
# ARIMA only: start the test frame n_steps rows before the split so the rolling
# prediction CV can start right at the first test date. The start is clamped
# at 0 because a negative start would make iloc count from the end of the frame.
df_test = df[use_features].iloc[max(split - n_steps, 0):]

# labels (targets)
train_label = df['label'].iloc[:split]
test_label = df['label'].iloc[split:]

# dates
train_date = df['prediction'].iloc[:split]
test_date = df['prediction'].iloc[split:]

In [None]:
### Scaling
# Standardization of the data is important to avoid potential scaling difficulties in the fitting of the model. 
# When there is more than one feature (covariate), scaling avoids one feature dominating over another due to 
# disparate scales.

# To avoid introducing a look-ahead bias into the prediction, we must re-scale the training data without 
# knowledge of the test set. Hence, the scaling statistics (mean and standard deviation for standardization, 
# minimum and maximum for min-max normalization) are computed from the training set only and not from the 
# whole time series. Additionally, to avoid introducing a systematic bias into the test set, we use the 
# identical transform for the test set - the statistics of the training set are used to scale the test set.


# note that for a multivariate time series, you would need to scale 
# each variable by its own mean and standard deviation in the training set
# Training-set statistics. `.item()` extracts the scalar explicitly (calling
# float() directly on a one-element Series is deprecated in recent pandas) and
# still fails loudly if df_train holds more than one feature column.
mu = float(df_train.mean().item())
sigma = float(df_train.std().item())
min_ = float(df_train.min().item())
max_ = float(df_train.max().item())

# A constant training series would make the min-max denominator zero and
# silently fill the frames with NaN/inf, so stop here instead.
if max_ == min_:
    raise ValueError("Cannot min-max normalize: training data is constant (max == min).")


def normalize_input(x):
    """Min-max scale ``x`` using the training-set minimum and maximum."""
    return (x - min_) / (max_ - min_)


def stdize_input(x):
    """Standardize ``x`` using the training-set mean and standard deviation."""
    return (x - mu) / sigma


# Both frames are scaled with the training-set statistics only.
# df_train = df_train.apply(stdize_input)
df_train = df_train.apply(normalize_input)
df_test = df_test.apply(normalize_input)

In [None]:
# ARIMA
# https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html

#building the model
import pmdarima as pm
from pmdarima import model_selection
from matplotlib import pyplot as plt

# Scaled training closes as a plain NumPy array.
train = df_train['close'].to_numpy()

# Search over ARIMA orders; the order printed by trace=True is what gets
# recorded by hand in the next cell.
model = pm.auto_arima(train, 
                      max_p = n_steps, max_q = n_steps, #max lags are same as other models
                      trace=True, error_action='ignore', 
                      # Fixed spelling: the keyword is `stepwise`. The previous
                      # `step_wise` is not an auto_arima parameter, so it was
                      # collected by **fit_args instead of configuring the search.
                      stepwise=True,
                      suppress_warnings=True,
                      stationary=False, #is the data stationarity ?
                      test='adf'
                      )
# NOTE(review): auto_arima is documented to return an already-fitted model;
# this refit on the same data is kept to preserve the cell's original output.
model.fit(train)


In [None]:
# RECORD the models here

#SCALP
#freq=0 ARIMA(2,1,1)(0,0,0)[0]
#freq=1 ARIMA(0,1,1)(0,0,0)[0]  
#freq=2 ARIMA(2,1,1)(0,0,0)[0] 

#SWING
#freq=0 ARIMA(0,1,0)(0,0,0)[0] 
#freq=1 ARIMA(1,1,0)(0,0,0)[0]
#freq=2 ARIMA(1,1,0)(0,0,0)[0]  

#POSITION
#freq=0 ARIMA(0,1,0)(0,0,0)[0]    
#freq=1 ARIMA(0,1,0)(0,0,0)[0]  
#freq=2 ARIMA(0,1,0)(0,0,0)[0]  

In [None]:
#2. difference the pkl data by the order found in step 1
#3. fill the NaN values created by differencing (with 0.0)
#4. output files as stationary_FILE_NAME

In [None]:
import pandas as pd
import os
import pickle
import datetime

In [None]:
def take_diff_fill(data, diff_by = 1, fill_with = 0.0):
    """Difference ``data`` and replace the rows that differencing leaves undefined.

    Parameters
    ----------
    data : object exposing ``.diff`` and ``.iloc`` (e.g. pandas Series/DataFrame)
        The values to difference. It is not modified; ``.diff`` returns a new object.
    diff_by : int, default 1
        Number of periods passed to ``data.diff``.
    fill_with : scalar, default 0.0
        Value written into the rows that have no counterpart to subtract.

    Returns
    -------
    The differenced object, with the first ``diff_by`` rows (or the last
    ``abs(diff_by)`` rows when ``diff_by`` is negative) set to ``fill_with``.
    """
    diffed = data.diff(diff_by)
    if diff_by > 0:
        # A lag of k leaves the first k rows undefined, not just row 0.
        diffed.iloc[:diff_by] = fill_with
    elif diff_by < 0:
        # A negative lag leaves the trailing rows undefined instead.
        diffed.iloc[diff_by:] = fill_with
    # diff_by == 0 leaves no undefined rows, so nothing is overwritten.
    return diffed


def create_technical_indicator_data(label, dict_, n_freqs = 4, diff_by = 1, fill_with = 0.0):
    """Difference the High/Close/Low/Open series of every entry in ``dict_[label]``.

    Each entry ``dict_[label][idx]`` is read as a pair: element ``0`` is a
    string name (printed as a sanitized ``.png`` file name) and element ``1``
    maps ``'High'``, ``'Close'``, ``'Low'`` and ``'Open'`` to containers indexed
    by frequency. Every ``[field][freq]`` series is replaced, in place, by
    ``take_diff_fill`` of itself.

    Parameters
    ----------
    label : key into ``dict_`` (the notebook calls this with "LONG" / "SHORT").
    dict_ : mapping holding the entries described above; it is modified in place.
    n_freqs : int, default 4
        Number of frequency slots (0 .. n_freqs-1) processed per entry.
    diff_by, fill_with :
        Forwarded to ``take_diff_fill``; the defaults reproduce the original
        first-order difference with a 0.0 fill.

    Returns
    -------
    ``dict_[label]``, i.e. the same (now modified) collection of entries.
    """
    fields = ('High', 'Close', 'Low', 'Open')
    entries = dict_[label]

    for idx in range(len(entries)):
        entry = entries[idx]
        image_name = entry[0]
        image_name='{0}.png'.format(image_name.replace('-','_').replace(' ','_').replace(':','_'))
        print(image_name)

        series_by_field = entry[1]
        for freq in range(n_freqs):
            # Read all four series before writing any of them back, exactly
            # as the original did.
            originals = {field: series_by_field[field][freq] for field in fields}
            for field, series in originals.items():
                series_by_field[field][freq] = take_diff_fill(series, diff_by = diff_by, fill_with = fill_with)

    return entries

In [None]:
# The data directory lives next to (not inside) the current working directory.
PATH = os.path.dirname(os.getcwd())
data_path = os.path.join(PATH, 'data')

#SCALP
# file_name = 'W30_H2_DF30_V01_03_26_2023.pkl'
# file_name = 'W30_H2_DF30_V23_03_27_2023.pkl'
# file_name = 'W30_H2_DF30_V45_03_27_2023.pkl'
file_name = 'W30_H2_DF30_V67_03_27_2023.pkl'
#SWING
# file_name = 'W30_H3_DF24_V00_SWING_04_07_2023.pkl'
#POSITION
# file_name = 'W30_H4_DF1W_V00_POSITION_04_07_2023.pkl'


file_path = os.path.join(data_path, file_name)

# Open and Read file
# Reuse file_path instead of rebuilding the same join a second time.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open(file_path, 'rb') as f:
    loaded_dict = pickle.load(f)


In [None]:
print("START STATIONARIZING DATA")
# create_technical_indicator_data writes the differenced series back into
# loaded_dict, so re-running this cell without reloading the pickle would
# difference the data a second time.
for label in ("LONG", "SHORT"):
    create_technical_indicator_data(label, loaded_dict)

print("DONE!")

In [None]:
# Horizon tag used as the prefix of the output file name.
horizon = 'SCALP'
# horizon = 'POSITION'
# horizon = 'SWING'
OUT_FILE = '{}_STATIONARY_67.pkl'.format(horizon)

# Pickle loaded_dict into the data directory under OUT_FILE.
out_path = os.path.join(data_path, OUT_FILE)
with open(out_path, mode='wb') as sink:
    pickle.dump(loaded_dict, sink)