In [42]:
import datetime
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

# %load_ext blackcellmagic

In [43]:
# Input locations, resolved against the directory the notebook runs from.
data_path = Path(os.path.abspath("")).joinpath("data")
sentiment_path = data_path.joinpath("sentiment", "ea.csv")
stock_path = data_path.joinpath("ea.csv")

In [44]:
# Load the sentiment export and discard the two leftover CSV index columns.
ea = pd.read_csv(sentiment_path).drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
# Remove rows whose `created_utc` cell holds the literal column name
# (presumably header lines repeated inside the CSV - verify against the file).
# `.copy()` makes the result an independent frame so the column assignment
# below does not write onto a slice of the frame read from disk.
ea = ea[ea["created_utc"] != "created_utc"].copy()
# Format each epoch timestamp as an ISO date string (YYYY-MM-DD). This format
# sorts chronologically as plain text and matches the `Date` column of the
# stock CSV, which these strings are compared against further down.
# Note: `fromtimestamp` converts using the machine's local timezone.
ea["created_strf"] = (
    ea["created_utc"]
    .astype(int)
    .apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%m-%d"))
)

In [45]:
# Smallest and largest formatted creation dates (string comparison).
min_time, max_time = ea["created_strf"].min(), ea["created_strf"].max()

In [46]:
# Read the price/indicator table for the same ticker.
stock_data = pd.read_csv(filepath_or_buffer=stock_path)

In [47]:
# Display the loaded frame (the rendered output below is truncated by pandas).
stock_data

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sma_5,...,wma_50,bb_50_up,bb_50_down,sma_100,ema_100,wma_100,bb_100_up,bb_100_down,macd,signal
0,0,2018-12-11,47.637469,47.921902,46.803791,47.274574,12474126,0.0,0,,...,,,,,47.274574,,,,0.000000,0.000000
1,1,2018-12-12,48.108254,49.255792,47.490349,47.519775,11453591,0.0,0,,...,,,,,47.398401,,,,0.019560,0.003912
2,2,2018-12-13,47.814015,48.314221,46.804772,47.578621,7755190,0.0,0,,...,,,,,47.459680,,,,0.039356,0.011001
3,3,2018-12-14,46.558591,47.519775,46.068191,46.833214,7161356,0.0,0,,...,,,,,47.298334,,,,-0.005045,0.007792
4,4,2018-12-17,46.686096,47.490351,45.744528,46.176083,10064952,0.0,0,47.076453,...,,,,,47.064817,,,,-0.092195,-0.012206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,752,2021-12-06,57.529999,58.450001,57.240002,58.090000,9408963,0.0,0,57.724000,...,67.369043,86.515702,55.688299,76.6782,75.181358,72.416444,93.540384,59.816016,-4.139811,-3.948483
753,753,2021-12-07,58.919998,59.270000,58.282001,58.770000,11553517,0.0,0,57.758000,...,66.885435,86.532406,55.019194,76.3479,74.856381,72.061826,93.306999,59.388801,-3.954544,-3.949695
754,754,2021-12-08,58.779999,60.009998,58.029999,59.270000,9785536,0.0,0,58.156001,...,66.434228,86.437224,54.431976,76.0376,74.547740,71.723650,93.100452,58.974748,-3.724440,-3.904644
755,755,2021-12-09,58.595001,59.810001,58.520000,59.070000,8402308,0.0,0,58.512000,...,65.988557,86.259199,53.877601,75.7132,74.241250,71.387658,92.820961,58.605439,-3.517670,-3.827249


In [64]:
def prepare_data(X, X_cols, save_scaler_path, y_col="Close"):
    """Min-max scale the feature and target columns and pickle the fitted scalers.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column named in ``X_cols`` as well as ``y_col``.
    X_cols : list of str
        Feature columns; all of them are scaled by a single ``MinMaxScaler``.
    save_scaler_path : str or pathlib.Path
        Directory that receives ``scalers.pickle``. It is created (including
        parents) when it does not exist yet.
    y_col : str, default "Close"
        Target column; scaled by its own, separate ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        The scaled ``X_cols`` followed by the scaled ``y_col``. The frame is
        rebuilt from a NumPy array, so it carries a fresh 0..n-1 index rather
        than the index of ``X``.

    Notes
    -----
    Both scalers are fitted on every row of ``X`` that is passed in.
    """
    X_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()
    X_df = pd.DataFrame(X_scaler.fit_transform(X.loc[:, X_cols]), columns=X_cols)
    # The scaler needs a 2-D column vector; flatten it again before storing it
    # as a single DataFrame column.
    y_scaled = y_scaler.fit_transform(X.loc[:, y_col].values.reshape(-1, 1))
    X_df[y_col] = y_scaled.ravel()
    scalers = {
        'X_scaler': X_scaler,
        'y_scaler': y_scaler
    }
    # Accept plain strings as well as Path objects, and make sure the target
    # directory exists so that open() does not fail on a fresh checkout.
    save_dir = Path(save_scaler_path)
    save_dir.mkdir(parents=True, exist_ok=True)
    with open(save_dir / 'scalers.pickle', 'wb') as handle:
        pickle.dump(scalers, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return X_df

def split_data(X, step_train, step_predict, y_col="Close"):
    """Cut a frame into sliding (history window, following targets) samples.

    Rows are selected by position, so the result does not depend on the
    frame's index labels (a filtered frame whose index no longer starts at 0
    yields the same windows as one with a default index).

    Parameters
    ----------
    X : pandas.DataFrame
        Rows in the order they should be windowed; must contain ``y_col``.
    step_train : int
        Number of consecutive rows in each history window.
    step_predict : int
        Number of rows directly after the window whose ``y_col`` values form
        the prediction target.
    y_col : str, default "Close"
        Target column; excluded from the feature windows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, targets_after, targets_within)`` where, for ``n``
        complete windows:

        * ``features`` has shape ``(n, step_train, n_feature_columns)``
        * ``targets_after`` has shape ``(n, step_predict)``
        * ``targets_within`` has shape ``(n, step_train)``

        Only windows for which both the history and the following targets fit
        completely inside ``X`` are kept. If none fits, the three arrays are
        empty with shape ``(0,)``.
    """
    data_len = X.shape[0]
    # Loop-invariant: resolve the feature columns and the target series once.
    feature_cols = [col for col in X.columns if col != y_col]
    features = X[feature_cols]
    target = X[y_col]

    X_list = []
    Y_preds_real_list = []
    Y_whole_real_list = []
    for i in range(data_len):
        window_end = i + step_train
        # Positional slices are end-exclusive and simply come back shorter
        # near the end of the frame; the length check below drops those.
        X_step = features.iloc[i:window_end]
        Y_pred_real = target.iloc[window_end : window_end + step_predict]
        Y_whole_real = target.iloc[i:window_end]
        if len(X_step) == step_train and len(Y_pred_real) == step_predict:
            X_list.append(X_step.to_numpy())
            Y_preds_real_list.append(Y_pred_real.to_numpy())
            Y_whole_real_list.append(Y_whole_real.to_numpy())
    return np.array(X_list), np.array(Y_preds_real_list), np.array(Y_whole_real_list)

def train_test_split(data, train_percent):
    """Split a sliceable sequence into a leading and a trailing part.

    The cut position is ``round(len(data) * train_percent)``; element order is
    preserved and nothing is shuffled.

    Returns
    -------
    tuple
        ``(data[:cut], data[cut:])``.
    """
    cut = round(len(data) * train_percent)
    leading = data[:cut]
    trailing = data[cut:]
    return leading, trailing

In [49]:
# Window sizes: rows of history per sample, and rows to predict after it.
STEP_TRAIN, STEP_PREDICT = 30, 1

In [50]:
# Keep the price rows whose date falls inside [min_time, max_time) of the
# sentiment data, drop columns that are not used as features, and drop every
# column that still contains a missing value.
final_data = (
    stock_data.loc[(stock_data["Date"] >= min_time) & (stock_data["Date"] < max_time)]
    .drop(columns=["Dividends", "Stock Splits", "Unnamed: 0"])
    .dropna(axis="columns")
)
# NOTE(review): "Unnamed: 0.1" is not dropped above, so it ends up in x_cols
# as a feature - confirm that this is intended.
x_cols = [col for col in final_data.columns if col not in ["Close", "Date"]]
# prepare_data requires the directory in which to store the fitted scalers;
# calling it without that argument raises TypeError. Create the directory
# first so the pickle can be written on a fresh checkout.
scaler_dir = Path(os.path.abspath("")) / "data" / "scaled_data"
scaler_dir.mkdir(parents=True, exist_ok=True)
scaled_data = prepare_data(final_data, X_cols=x_cols, save_scaler_path=scaler_dir)
X_list, Y_preds_real_list, Y_whole_real_list = split_data(
    scaled_data, STEP_TRAIN, STEP_PREDICT
)

In [51]:
# Shape of one feature window: (rows in the window, feature columns).
X_list[0].shape

(30, 19)

In [52]:
# Shape of the target values that follow the first window.
Y_preds_real_list[0].shape

(1,)

In [53]:
# Shape of the target values that lie inside the first window.
Y_whole_real_list[0].shape

(30,)

In [54]:
# Output directory for the pickled arrays, under the working directory.
save_path = Path(os.path.abspath("")).joinpath("data", "scaled_data")

In [66]:
# Pair each window array with the name it is stored under.
df_lists = [X_list, Y_preds_real_list, Y_whole_real_list]
names = ['X_list', 'Y_preds_real_list', 'Y_whole_real_list']
temp = dict(zip(names, df_lists))
save = {}
for name, df_list in temp.items():
    # The first 75% of the windows become the train part, the rest the test
    # part; order is preserved, nothing is shuffled.
    train, test = train_test_split(df_list, 0.75)
    save[f'{name}_train'] = train
    save[f'{name}_test'] = test

# Make sure the output directory exists; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
save_path.mkdir(parents=True, exist_ok=True)
with open(save_path / 'data.pickle', 'wb') as handle:
    pickle.dump(save, handle, protocol=pickle.HIGHEST_PROTOCOL)