In [115]:
import datetime
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

# %load_ext blackcellmagic

In [6]:
cd ..

c:\Gan\GAN-market-prediction


In [37]:
data_path = Path(os.path.abspath('')) / 'data'
sentiment_path = data_path /  'sentiment' / 'ea.csv'
stock_path = data_path /  'ea.csv'

In [38]:
ea = pd.read_csv(sentiment_path).drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
ea = ea[ea["created_utc"] != "created_utc"]
ea["Date"] = (
    ea["created_utc"]
    .astype(int)
    .apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%d-%m"))
)

In [39]:
min_time = ea['Date'].min()
max_time = ea['Date'].max()

In [40]:
stock_data = pd.read_csv(stock_path)

In [87]:
def rename(newname):
    def decorator(f):
        f.__name__ = newname
        return f
    return decorator

def q_at(y):
    @rename(f'q{y:0.2f}')
    def q(x):
        return x.quantile(y)
    return q

AGGREGATIONS = {
    'sentiment': ['count', 'mean', 'std', 'median', q_at(0.25), q_at(0.75)]
}

def prepare_sentiment_statistics(sentiment_data): 
    df = sentiment_data.copy()
    df['sentiment'] = df['sentiment'].astype(float)
    return df.groupby('Date').agg(AGGREGATIONS)['sentiment']

In [88]:
ea_ready = prepare_sentiment_statistics(ea)

In [96]:
ea_ready

Unnamed: 0_level_0,count,mean,std,median,q0.25,q0.75
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-01,927,0.265973,0.415586,0.008677,0.002547,0.536200
2019-01-02,18,0.143571,0.299807,0.036182,0.001856,0.058880
2019-01-03,68,0.089041,0.243569,0.030828,0.000947,0.037990
2019-01-04,3,0.325564,0.552019,0.006855,0.006855,0.484918
2019-01-05,21,0.277559,0.423043,0.006046,0.001699,0.525224
...,...,...,...,...,...,...
2021-31-03,45,0.269387,0.398128,0.047501,0.013283,0.185307
2021-31-05,132,0.463428,0.458982,0.200640,0.009692,0.993579
2021-31-07,35,0.322731,0.442480,0.005172,0.002272,0.890342
2021-31-08,67,0.384824,0.476207,0.010933,0.002042,0.993785


In [112]:
stock_data.set_index(pd.DatetimeIndex(stock_data['Date']))

Unnamed: 0_level_0,Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sma_5,...,wma_50,bb_50_up,bb_50_down,sma_100,ema_100,wma_100,bb_100_up,bb_100_down,macd,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-11,0,2018-12-11,47.637469,47.921902,46.803791,47.274574,12474126,0.0,0,,...,,,,,47.274574,,,,0.000000,0.000000
2018-12-12,1,2018-12-12,48.108254,49.255792,47.490349,47.519775,11453591,0.0,0,,...,,,,,47.398401,,,,0.019560,0.003912
2018-12-13,2,2018-12-13,47.814015,48.314221,46.804772,47.578621,7755190,0.0,0,,...,,,,,47.459680,,,,0.039356,0.011001
2018-12-14,3,2018-12-14,46.558591,47.519775,46.068191,46.833214,7161356,0.0,0,,...,,,,,47.298334,,,,-0.005045,0.007792
2018-12-17,4,2018-12-17,46.686096,47.490351,45.744528,46.176083,10064952,0.0,0,47.076453,...,,,,,47.064817,,,,-0.092195,-0.012206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-06,752,2021-12-06,57.529999,58.450001,57.240002,58.090000,9408963,0.0,0,57.724000,...,67.369043,86.515702,55.688299,76.6782,75.181358,72.416444,93.540384,59.816016,-4.139811,-3.948483
2021-12-07,753,2021-12-07,58.919998,59.270000,58.282001,58.770000,11553517,0.0,0,57.758000,...,66.885435,86.532406,55.019194,76.3479,74.856381,72.061826,93.306999,59.388801,-3.954544,-3.949695
2021-12-08,754,2021-12-08,58.779999,60.009998,58.029999,59.270000,9785536,0.0,0,58.156001,...,66.434228,86.437224,54.431976,76.0376,74.547740,71.723650,93.100452,58.974748,-3.724440,-3.904644
2021-12-09,755,2021-12-09,58.595001,59.810001,58.520000,59.070000,8402308,0.0,0,58.512000,...,65.988557,86.259199,53.877601,75.7132,74.241250,71.387658,92.820961,58.605439,-3.517670,-3.827249


In [51]:
def prepare_data(X, X_cols, save_scaler_path, y_col="Close"):
    X_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()
    X_df = pd.DataFrame(X_scaler.fit_transform(X.loc[:, X_cols]), columns=X_cols)
    y_series = y_scaler.fit_transform(X.loc[:, y_col].values.reshape(-1, 1))
    X_df[y_col] = y_series
    scalers = {
        'X_scaler': X_scaler, 
        'y_scaler': y_scaler
    }
    with open(save_scaler_path / 'scalers.pickle', 'wb') as handle:
        pickle.dump(scalers, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return X_df

def split_data(X, step_train, step_predict, y_col="Close"):
    data_len = X.shape[0]
    X_list = []
    Y_preds_real_list = []
    Y_whole_real_list = []
    for i in range(data_len):
        X_step = X.loc[i : i + step_train - 1, [col for col in X.columns if col != y_col]]
        Y_pred_real = X.loc[i + step_train : i + step_train + step_predict - 1, y_col]
        Y_whole_real = X.loc[i : i + step_train - 1, y_col]
        if (len(X_step) == step_train) & (len(Y_pred_real) == step_predict):
            X_list.append(X_step)
            Y_preds_real_list.append(Y_pred_real)
            Y_whole_real_list.append(Y_whole_real)
    return np.array(X_list), np.array(Y_preds_real_list), np.array(Y_whole_real_list)

def train_test_split(data, train_percent):
    split_idx = round(len(data) * train_percent)
    return data[:split_idx], data[split_idx:]

In [52]:
STEP_TRAIN = 30
STEP_PREDICT = 1

In [53]:
save_path = Path(os.path.abspath('')) / 'data' / 'scaled_data' 

In [116]:
def prepare_final_data(sentiment=None):
    final_data = (
        stock_data.loc[(stock_data["Date"] >= min_time) & (stock_data["Date"] < max_time)]
        .drop(columns=["Dividends", "Stock Splits", "Unnamed: 0"])
        .dropna(axis="columns")
    )
    if sentiment is not None: 
        sentiment_df = prepare_sentiment_statistics(sentiment)
        final_data = (final_data.merge(sentiment_df, how='left', on='Date')
                                .set_index(pd.DatetimeIndex(final_data['Date']))
                                .drop(columns='Date')
                                .interpolate(method='time'))
    x_cols = [col for col in final_data.columns if col not in ["Close", "Date"]]
    scaled_data = prepare_data(X=final_data, X_cols=x_cols, save_scaler_path=save_path)
    X_list, Y_preds_real_list, Y_whole_real_list = split_data(
        scaled_data, STEP_TRAIN, STEP_PREDICT
    )
    return X_list, Y_preds_real_list, Y_whole_real_list

In [117]:
X_list, Y_preds_real_list, Y_whole_real_list = prepare_final_data(ea)

In [118]:
X_list[0].shape

(30, 25)

In [119]:
Y_preds_real_list[0].shape

(1,)

In [120]:
Y_whole_real_list[0].shape

(30,)

In [123]:
df_lists = [X_list, Y_preds_real_list, Y_whole_real_list]
names = ['X_list', 'Y_preds_real_list', 'Y_whole_real_list']
temp = dict(zip(names, df_lists))
save = {}
for name, df_list in temp.items(): 
    train, test = train_test_split(df_list, 0.75)
    save[f'{name}_train'] = train
    save[f'{name}_test'] = test

with open(save_path / 'data.pickle', 'wb') as handle:
    pickle.dump(save, handle, protocol=pickle.HIGHEST_PROTOCOL)