In [30]:
import datetime
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import pickle

%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [4]:
data_path = Path(os.path.abspath('')) / 'data'
sentiment_path = data_path /  'sentiment' / 'ea.csv'
stock_path = data_path /  'ea.csv'

In [5]:
ea = pd.read_csv(sentiment_path).drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
ea = ea[ea["created_utc"] != "created_utc"]
ea["created_strf"] = (
    ea["created_utc"]
    .astype(int)
    .apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%d-%m"))
)

In [6]:
min_time = ea['created_strf'].min()
max_time = ea['created_strf'].max()

In [8]:
stock_data = pd.read_csv(stock_path)

In [9]:
stock_data

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sma_5,...,wma_50,bb_50_up,bb_50_down,sma_100,ema_100,wma_100,bb_100_up,bb_100_down,macd,signal
0,0,2018-12-11,47.637469,47.921902,46.803791,47.274574,12474126,0.0,0,,...,,,,,47.274574,,,,0.000000,0.000000
1,1,2018-12-12,48.108254,49.255792,47.490349,47.519775,11453591,0.0,0,,...,,,,,47.398401,,,,0.019560,0.003912
2,2,2018-12-13,47.814015,48.314221,46.804772,47.578621,7755190,0.0,0,,...,,,,,47.459680,,,,0.039356,0.011001
3,3,2018-12-14,46.558591,47.519775,46.068191,46.833214,7161356,0.0,0,,...,,,,,47.298334,,,,-0.005045,0.007792
4,4,2018-12-17,46.686096,47.490351,45.744528,46.176083,10064952,0.0,0,47.076453,...,,,,,47.064817,,,,-0.092195,-0.012206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,752,2021-12-06,57.529999,58.450001,57.240002,58.090000,9408963,0.0,0,57.724000,...,67.369043,86.515702,55.688299,76.6782,75.181358,72.416444,93.540384,59.816016,-4.139811,-3.948483
753,753,2021-12-07,58.919998,59.270000,58.282001,58.770000,11553517,0.0,0,57.758000,...,66.885435,86.532406,55.019194,76.3479,74.856381,72.061826,93.306999,59.388801,-3.954544,-3.949695
754,754,2021-12-08,58.779999,60.009998,58.029999,59.270000,9785536,0.0,0,58.156001,...,66.434228,86.437224,54.431976,76.0376,74.547740,71.723650,93.100452,58.974748,-3.724440,-3.904644
755,755,2021-12-09,58.595001,59.810001,58.520000,59.070000,8402308,0.0,0,58.512000,...,65.988557,86.259199,53.877601,75.7132,74.241250,71.387658,92.820961,58.605439,-3.517670,-3.827249


In [105]:
def prepare_data(X, X_cols, y_col="Close"):
    X_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()
    X_df = pd.DataFrame(X_scaler.fit_transform(X.loc[:, X_cols]), columns=X_cols)
    y_series = y_scaler.fit_transform(X.loc[:, y_col].values.reshape(-1, 1))
    X_df[y_col] = y_series
    return X_df

def split_data(X, step_train, step_predict, y_col="Close"):
    data_len = X.shape[0]
    X_list = []
    Y_preds_real_list = []
    Y_whole_real_list = []
    for i in range(data_len):
        X_step = X.loc[i : i + step_train - 1, [col for col in X.columns if col != y_col]]
        Y_pred_real = X.loc[i + step_train : i + step_train + step_predict - 1, y_col]
        Y_whole_real = X.loc[i : i + step_train + step_predict - 1, y_col]
        if (len(X_step) == step_train) & (len(Y_pred_real) == step_predict):
            X_list.append(X_step)
            Y_preds_real_list.append(Y_pred_real)
            Y_whole_real_list.append(Y_whole_real)
    return np.array(X_list), np.array(Y_preds_real_list), np.array(Y_whole_real_list)

In [106]:
STEP_TRAIN = 30
STEP_PREDICT = 1

In [107]:
final_data = (
    stock_data.loc[(stock_data["Date"] >= min_time) & (stock_data["Date"] < max_time)]
    .drop(columns=["Dividends", "Stock Splits", "Unnamed: 0"])
    .dropna(axis="columns")
)
x_cols = [col for col in final_data.columns if col not in ["Close", "Date"]]
scaled_data = prepare_data(final_data, X_cols=x_cols)
X_list, Y_preds_real_list, Y_whole_real_list = split_data(
    scaled_data, STEP_TRAIN, STEP_PREDICT
)

In [108]:
scaled_data

Unnamed: 0,Open,High,Low,Volume,sma_5,ema_5,wma_5,bb_5_up,bb_5_down,sma_10,ema_10,wma_10,bb_10_up,bb_10_down,ema_20,ema_50,ema_100,macd,signal,Close
0,0.075536,0.086009,0.082343,0.113326,0.078482,0.076267,0.078798,0.074062,0.123424,0.071822,0.074187,0.071794,0.081636,0.081858,0.068708,0.053655,0.044170,0.485153,0.499797,0.106099
1,0.093843,0.072623,0.080214,0.133680,0.075105,0.069100,0.071309,0.083366,0.107309,0.068655,0.070112,0.068626,0.078760,0.078712,0.065803,0.051304,0.041952,0.474867,0.497355,0.080494
2,0.087074,0.086634,0.082238,0.149944,0.075519,0.074075,0.074545,0.084464,0.106963,0.070012,0.072486,0.071342,0.081293,0.078651,0.067005,0.051955,0.042461,0.485516,0.497778,0.108245
3,0.106150,0.110515,0.112766,0.150546,0.080936,0.084549,0.084697,0.101512,0.099651,0.072356,0.078529,0.077714,0.089321,0.074582,0.070708,0.054450,0.044640,0.507678,0.503061,0.128637
4,0.134303,0.142126,0.142515,0.216692,0.091770,0.099439,0.100843,0.129053,0.091826,0.079014,0.087957,0.087972,0.107192,0.068377,0.076816,0.058716,0.048412,0.539999,0.514498,0.151176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,0.281778,0.274640,0.288309,0.156465,0.270976,0.280126,0.269240,0.261453,0.297541,0.300778,0.309175,0.285191,0.320037,0.276181,0.372165,0.500620,0.633723,0.013646,0.000172,0.293108
739,0.303580,0.287691,0.304857,0.199231,0.271528,0.283348,0.274897,0.263879,0.296082,0.295126,0.306504,0.283178,0.307162,0.278961,0.365284,0.492763,0.627156,0.037162,0.000000,0.303738
740,0.301384,0.299468,0.300855,0.163974,0.277993,0.288239,0.283074,0.276059,0.295824,0.291006,0.305858,0.283686,0.294987,0.284014,0.359911,0.485591,0.620919,0.066369,0.006379,0.311554
741,0.298482,0.296285,0.308636,0.136390,0.283776,0.290402,0.288017,0.278907,0.303941,0.287974,0.304713,0.284344,0.286376,0.287353,0.354709,0.478550,0.614725,0.092614,0.017338,0.308428


In [109]:
X_list[0].shape

(30, 19)

In [110]:
Y_preds_real_list[0].shape

(1,)

In [111]:
Y_whole_real_list[0].shape

(31,)

In [112]:
save_path = Path(os.path.abspath('')) / 'data' / 'scaled_data' 


In [113]:
df_lists = [X_list, Y_preds_real_list, Y_whole_real_list]
names = ['X_list', 'Y_preds_real_list', 'Y_whole_real_list']
for df_list, name in zip(df_lists, names):
    with open(save_path / f'{name}.pickle', 'wb') as handle:
        pickle.dump(df_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [78]:
save_path

PosixPath('/home/kuba1302/Gan/gan/data/scaled_data')