In [1]:
import numpy as np
import pandas as pd
import scipy
import os
from alpha_vantage.timeseries import TimeSeries          # stock time series data API
from alpha_vantage.techindicators import TechIndicators        # Technical Indicator data API
import dateutil.parser
from Metric_Computation import *
from datetime import datetime
from dateutil.relativedelta import *
import time

## Define the function that queries the data of one stock, returns the processed arrays, and stores the corresponding CSV file

In [2]:
def stock_data_read_and_process(stock_symbol, query_key, stock_save_path, start_period, end_period, S_P_500_date_array, S_P_500_value_array, series_length=50, fluc_check_period=20, silent_mode=True):
    '''
    Query the daily adjusted price history of one stock from Alpha Vantage, cut it into
    sliding windows, compute technical indicators on every window, save a per-stock
    .csv summary and return the resulting arrays.

    :param stock_symbol: the symbol of the stock we are interested
    :param query_key: the key for us to use the Alpha Vantage API
    :param stock_save_path: the path to save the files (a 'stock_info_csv/' sub-folder is created inside it)
    :param start_period: the starting period (inclusive) of the indicator periods
    :param end_period: the ending period (exclusive); periods are range(start_period, end_period)
    :param S_P_500_date_array: the array passing S&P-500 dates (first day of each month), used to look up values
    :param S_P_500_value_array: the array passing S&P-500 values, aligned with S_P_500_date_array
    :param series_length: the length of series we would like to query. Default is 50. Must be at least >(end_period-start_period+10)
    :param fluc_check_period: the period to check up/down trends and if it matches in/de-crease quantities. default is 20
    :param silent_mode: If False, progress information of the stock reading procedure is printed
    :return: a 10-tuple
            stock_data_X: indicator array, 7 indicators stacked on the last axis, second axis flipped
            stock_data_X_prev: the same for the 'previous' (one step older) windows
            S_P_500_stock, S_P_500_stock_prev: S&P-500 value looked up for every window (current / previous)
            target_value_array, target_value_array_prev: adjusted close right after every window (only for reference usage)
            target_gradient_array, target_gradient_array_prev: one-day difference of the adjusted close at the target day
            inc_dec_flag_array: result of increase_decrease_flag_check on the fluc_check_period prices after the window
            up_down_trend_flag_array: result of MA_up_down_check on the same prices
            (the information of each stock will be stored in a .csv file separately)
    :raises ValueError: if the month of a window date is not present in S_P_500_date_array
    '''
    # **********************Price Time Series Query*****************
    if silent_mode is False:
        print('Query the stock...')
    # use the key handed in by the caller instead of relying on a notebook-level global
    query_gate_ts = TimeSeries(key=query_key,
                               retries=5,
                               output_format='pandas')
    exp_daily_adj_series, _ = query_gate_ts.get_daily_adjusted(symbol=stock_symbol,
                                                               outputsize='full')
    exp_daily_adj_series.index = pd.to_datetime(exp_daily_adj_series.index)
    if silent_mode is False:
        print('Stock Series Query Completed!')
    # ********************retrieve the high, low, and adjusted closed prices******************
    # only keep the data from 2009-01-01 onward -- we are not responsible for major economic bust!
    earliest_date = datetime.strptime('2009-01-01','%Y-%m-%d')
    if exp_daily_adj_series.index.tolist()[0]<earliest_date:
        exp_daily_adj_series = exp_daily_adj_series[exp_daily_adj_series.index>=earliest_date]
    # collect dates
    date_list_series = np.array(exp_daily_adj_series.index.tolist(),dtype='object')
    # collect the high, low and adjusted close prices
    high_price_vec = np.array(exp_daily_adj_series.loc[:,'2. high'].values.tolist())
    low_price_vec = np.array(exp_daily_adj_series.loc[:,'3. low'].values.tolist())
    close_price_vec = np.array(exp_daily_adj_series.loc[:,'5. adjusted close'].values.tolist())
    #*************************** make time series **************************
    if silent_mode is False:
        print('Constructing the time series...')
    # compute the nData
    # "-1" for one day difference (essential for fluctuation-control)
    nData = close_price_vec.shape[0] - series_length - fluc_check_period - 1
    # placeholders of a empty time series array
    # current and 1-time-unit-lag adjusted close
    time_series_array = np.zeros([nData,series_length])
    time_series_array_prev = np.zeros([nData,series_length])
    # current and 1-time-unit-lag high
    high_series_array = np.zeros([nData,series_length])
    high_series_array_prev = np.zeros([nData,series_length])
    # current and 1-time-unit-lag low
    low_series_array = np.zeros([nData,series_length])
    low_series_array_prev = np.zeros([nData,series_length])
    # placeholder of target gradient and target value
    target_value_array = np.zeros([nData])
    target_value_array_prev = np.zeros([nData])
    target_gradient_array = np.zeros([nData])
    target_gradient_array_prev = np.zeros([nData])
    # placeholder of target flags -- no need for 'previous' lag array because they are only for regression
    inc_dec_flag_array = np.zeros([nData])
    up_down_trend_flag_array = np.zeros([nData])
    # placeholder of the corresponding SP-500 values
    S_P_500_stock = np.zeros([nData])
    S_P_500_stock_prev = np.zeros([nData])
    # Windows are indexed backward from the end of the series: window 0 ends fluc_check_period
    # entries before the last price. The loop runs one extra step (nData+1) because the
    # 'previous' arrays store the values of iteration c at row c-1. At c == 0 this writes to
    # row -1 (the last row), which is overwritten again by the final iteration c == nData.
    for c_series_ind in range(nData+1):
        current_backward_ind = c_series_ind + fluc_check_period
        if c_series_ind<=(nData-1):
            # assign current sub-vectors
            time_series_array[c_series_ind,:] = close_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
            high_series_array[c_series_ind, :] = high_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
            low_series_array[c_series_ind, :] = low_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
        # assign 'previous' values
        time_series_array_prev[c_series_ind-1,:] = close_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
        high_series_array_prev[c_series_ind-1, :] = high_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
        low_series_array_prev[c_series_ind-1, :] = low_price_vec[-(current_backward_ind+series_length):-(current_backward_ind)]
        # compute current value and gradient: the price right after the window and its one-day change
        current_target_value = close_price_vec[-(current_backward_ind)]
        current_target_gradient = close_price_vec[-(current_backward_ind)] - close_price_vec[-(current_backward_ind+1)]
        # assign the value and gradient to the corresponding place
        if c_series_ind<=(nData-1):
            target_value_array[c_series_ind] = current_target_value
            target_gradient_array[c_series_ind] = current_target_gradient
        # assign the value and gradient to the corresponding 'previous' places
        target_value_array_prev[c_series_ind-1] = current_target_value
        target_gradient_array_prev[c_series_ind-1] = current_target_gradient
        # retrieve the 'future price array' (fluc_check_period prices after the window) to compute the symbols
        if c_series_ind==0:
            # a "-0" stop index would give an empty slice, so the open-ended form is needed here
            current_future_series_array = close_price_vec[-(current_backward_ind):]
        else:
            current_future_series_array = close_price_vec[-(current_backward_ind):-(c_series_ind)]
        # retrieve the last price from the 'past and now' price array
        current_base_price = close_price_vec[-(current_backward_ind+1)]   # the last 'current' adjusted close price
        # compute and assign the increase/decrease and uptrend/downtrend symbols
        if c_series_ind<=(nData-1):
            # increase/decrease flag
            inc_dec_flag_array[c_series_ind] = increase_decrease_flag_check(price_series = current_future_series_array,
                                                                            base_price = current_base_price)
            # the trend check flag
            up_down_trend_flag_array[c_series_ind] = MA_up_down_check(price_series = current_future_series_array)
        # check current S&P 500: values are looked up by the first day of the month
        # NOTE(review): the date is taken at offset c_series_ind+1 from the end (not shifted by
        # fluc_check_period like the price windows) -- confirm this is intended
        current_date = date_list_series[-(c_series_ind+1)].replace(day=1)
        current_S_P_ind = np.where(S_P_500_date_array==current_date)[0]
        if current_S_P_ind.size==0:
            raise ValueError('No S&P-500 value available for month '+str(current_date)+' (stock '+str(stock_symbol)+')')
        # take the scalar value: storing a size-1 array in a scalar slot is deprecated in recent NumPy
        current_S_P_value = S_P_500_value_array[current_S_P_ind[0]]
        if c_series_ind<=(nData-1):
            S_P_500_stock[c_series_ind] = current_S_P_value
        S_P_500_stock_prev[c_series_ind-1] = current_S_P_value
    if silent_mode is False:
        # print to check the time series array
        print('Time series construction completed!')
    # ********************Compute the metrics*********************
    compute_period_list = list(range(start_period,end_period,1))
    # *SMA
    if silent_mode is False:
        print('Computing simple moving average...')
    SMA_series_array = SMA_batch_compute(time_series_array,compute_period_list,comp_mode='vector')
    SMA_series_array_prev = SMA_batch_compute(time_series_array_prev,compute_period_list,comp_mode='vector')
    if silent_mode is False:
        print('Simple moving average computation completed!')
    # *EMA
    if silent_mode is False:
        print('Computing exponential moving average...')
    EMA_series_array = EMA_batch_computation(time_series_array,compute_period_list,comp_mode='vector')
    EMA_series_array_prev = EMA_batch_computation(time_series_array_prev,compute_period_list,comp_mode='vector')
    if silent_mode is False:
        print('Exponential moving average computation completed!')
    # *Stochastic Oscillator
    if silent_mode is False:
        print('Computing Stochastic Oscillator...')
    stoch_K_series_array, stoch_D_series_array = STOCH_batch_compute(time_series_array,
                                                                     high_series_array,
                                                                     low_series_array,
                                                                     compute_period_list)
    stoch_K_series_array_prev, stoch_D_series_array_prev = STOCH_batch_compute(time_series_array_prev,
                                                                               high_series_array_prev,
                                                                               low_series_array_prev,
                                                                               compute_period_list)
    # a NaN anywhere makes the sum NaN; such arrays are repaired by Nan_batch_interpolation
    if np.isnan(np.sum(stoch_K_series_array)):
        stoch_K_series_array = Nan_batch_interpolation(stoch_K_series_array)
    if np.isnan(np.sum(stoch_K_series_array_prev)):
        stoch_K_series_array_prev = Nan_batch_interpolation(stoch_K_series_array_prev)
    if np.isnan(np.sum(stoch_D_series_array)):
        stoch_D_series_array = Nan_batch_interpolation(stoch_D_series_array)
    if np.isnan(np.sum(stoch_D_series_array_prev)):
        stoch_D_series_array_prev = Nan_batch_interpolation(stoch_D_series_array_prev)
    if silent_mode is False:
        print('Stochastic Oscillator Completed!')
    # *ADX metrics
    if silent_mode is False:
        print('Computing ADX metrics...')
    DI_up_series_array, DI_down_series_array = ADX_batch_compute(time_series_array,
                                                                 high_series_array,
                                                                 low_series_array,
                                                                 compute_period_list)
    DI_up_series_array_prev, DI_down_series_array_prev = ADX_batch_compute(time_series_array_prev,
                                                                           high_series_array_prev,
                                                                           low_series_array_prev,
                                                                           compute_period_list)
    if np.isnan(np.sum(DI_up_series_array)):
        DI_up_series_array = Nan_batch_interpolation(DI_up_series_array)
    if np.isnan(np.sum(DI_down_series_array)):
        DI_down_series_array = Nan_batch_interpolation(DI_down_series_array)
    if np.isnan(np.sum(DI_up_series_array_prev)):
        DI_up_series_array_prev = Nan_batch_interpolation(DI_up_series_array_prev)
    if np.isnan(np.sum(DI_down_series_array_prev)):
        DI_down_series_array_prev = Nan_batch_interpolation(DI_down_series_array_prev)
    if silent_mode is False:
        print('ADX metrics computation completed!')
    # *CCI
    if silent_mode is False:
        print('Computing CCI index...')
    CCI_series_array = CCI_batch_compute(time_series_array,
                                         high_series_array,
                                         low_series_array,
                                         compute_period_list)
    CCI_series_array_prev = CCI_batch_compute(time_series_array_prev,
                                              high_series_array_prev,
                                              low_series_array_prev,
                                              compute_period_list)
    if np.isnan(np.sum(CCI_series_array)):
        CCI_series_array = Nan_batch_interpolation(CCI_series_array)
    if np.isnan(np.sum(CCI_series_array_prev)):
        CCI_series_array_prev = Nan_batch_interpolation(CCI_series_array_prev)
    if silent_mode is False:
        print('CCI index computation completed!')
    # ******************* Numpy Array for Data **************************
    # numpy array should be [nData * nTime * nDim]
    # non-for-loop idea concatenate the numpy arrays with a additional dimension, and flip the 2nd dim to get 'old-to-new' result
    stock_data_X = np.concatenate([SMA_series_array[:,:,np.newaxis],
                                   EMA_series_array[:,:,np.newaxis],
                                   stoch_K_series_array[:,:,np.newaxis],
                                   stoch_D_series_array[:,:,np.newaxis],
                                   DI_up_series_array[:,:,np.newaxis],
                                   DI_down_series_array[:,:,np.newaxis],
                                   CCI_series_array[:,:,np.newaxis]],axis=-1)
    # flip the time axis to arrange the array into 'old-to-new' order
    stock_data_X = np.flip(stock_data_X,axis=1)
    # also for the 'previous data'
    stock_data_X_prev = np.concatenate([SMA_series_array_prev[:,:,np.newaxis],
                                        EMA_series_array_prev[:,:,np.newaxis],
                                        stoch_K_series_array_prev[:,:,np.newaxis],
                                        stoch_D_series_array_prev[:,:,np.newaxis],
                                        DI_up_series_array_prev[:,:,np.newaxis],
                                        DI_down_series_array_prev[:,:,np.newaxis],
                                        CCI_series_array_prev[:,:,np.newaxis]],axis=-1)
    # flip the time axis to arrange the array into 'old-to-new' order
    stock_data_X_prev = np.flip(stock_data_X_prev,axis=1)
    # ******************** pandas dataframe and save as .csv ***********************
    # specify the list of column names: one column per indicator and period, then the five scalar columns
    col_names = ['SMA_period_'+str(date) for date in range(start_period,end_period)] + ['EMA_period_'+str(date) for date in range(start_period,end_period)] + ['Stoch_K_period_'+str(date) for date in range(start_period,end_period)]+ ['Stoch_D_period_'+str(date) for date in range(start_period,end_period)]+ ['DI_up_'+str(date) for date in range(start_period,end_period)]+ ['DI_down_'+str(date) for date in range(start_period,end_period)]+ ['CCI_'+str(date) for date in range(start_period,end_period)] + ['S&P_500','Price-change Flag','Trend Flag','Target Value','Target Gradient']
    # specify the data to concatenate (only the 'current' arrays are written to the .csv)
    stock_data_pd = np.concatenate([SMA_series_array[:],
                                    EMA_series_array[:],
                                    stoch_K_series_array[:],
                                    stoch_D_series_array[:],
                                    DI_up_series_array[:],
                                    DI_down_series_array[:],
                                    CCI_series_array[:],
                                    np.reshape(S_P_500_stock,[nData,1]),
                                    np.reshape(inc_dec_flag_array,[nData,1]),
                                    np.reshape(up_down_trend_flag_array,[nData,1]),
                                    np.reshape(target_value_array,[nData,1]),
                                    np.reshape(target_gradient_array,[nData,1])],axis=1)
    # specify the index: the last nData dates, newest first (same order as the rows)
    stock_time_index_pd = np.flip(date_list_series[-nData:],axis=0)
    # create the pandas dataframe object
    this_stock_info_processed_pd = pd.DataFrame(data=stock_data_pd, index=stock_time_index_pd, columns=col_names)
    # save pandas as .csv file
    pandas_save_path = stock_save_path+'stock_info_csv/'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(pandas_save_path, exist_ok=True)
    # specify file name
    file_name = stock_symbol+'_price_info.csv'
    # write
    this_stock_info_processed_pd.to_csv(pandas_save_path+file_name)
    
    return stock_data_X, stock_data_X_prev, S_P_500_stock, S_P_500_stock_prev,  target_value_array, target_value_array_prev, target_gradient_array, target_gradient_array_prev,  inc_dec_flag_array, up_down_trend_flag_array

In [3]:
# folder holding the list of stock symbols
stock_symbol_list_path = '../data/stock_list/'
# folder holding the S&P 500 index files
s_p_500_file_path = '../data/s-and-p-500/'
# folder receiving the processed stock information
stock_info_save_path = '../data/intermediate/'
# create the output folder if needed; exist_ok makes this idempotent and avoids the
# check-then-create race of a separate os.path.exists test
os.makedirs(stock_info_save_path, exist_ok=True)

## Load and process the S&P 500 data

In [4]:
# file used for every month before `threshold_date` (the 'pre_2017' name is historical)
s_p_pre_2017_df = pd.read_csv(s_p_500_file_path+'data_csv.csv')
# file used for every month from `threshold_date` onward
s_p_post_2017_df = pd.read_csv(s_p_500_file_path+'data_daily_17_18.csv')
# convert the 'Date' columns to datetime.
# Plain column assignment replaces the column (and its dtype); `df.loc[:, 'Date'] = ...`
# sets values in place on recent pandas and may leave the column with object dtype.
pre_2017_date_series = pd.to_datetime(s_p_pre_2017_df['Date'])
s_p_pre_2017_df['Date'] = pre_2017_date_series
post_2017_date_series = pd.to_datetime(s_p_post_2017_df['Date'])
s_p_post_2017_df['Date'] = post_2017_date_series
# construct a monthly array: one mean index value per calendar month
nMonth = 119          # 2009-01 to 2018-11 (inclusive)
current_date = datetime.strptime('2009-01','%Y-%m')
threshold_date = datetime.strptime('2018-01','%Y-%m')    # from this month on, read the second file
# initialize two empty lists to store the month starts and the monthly mean values
S_P_500_date_list = []
S_P_500_value_list = []
for cMonth in range(nMonth):
    # the month covers [current_date, end_date)
    end_date = current_date+relativedelta(months=+1)
    if current_date<threshold_date:
        current_df_sp_500 = s_p_pre_2017_df[(s_p_pre_2017_df['Date']>=current_date)&(s_p_pre_2017_df['Date']<end_date)]
        current_value_sp_500 = np.mean(current_df_sp_500.loc[:,'SP500'].values)
    else:
        current_df_sp_500 = s_p_post_2017_df[(s_p_post_2017_df['Date']>=current_date)&(s_p_post_2017_df['Date']<end_date)]
        current_value_sp_500 = np.mean(current_df_sp_500.loc[:,'Adj Close'].values)
    # assign the values (a month without any row yields NaN from np.mean)
    S_P_500_date_list.append(current_date)
    S_P_500_value_list.append(current_value_sp_500)
    # move on to the next month
    current_date = end_date
S_P_500_date_array = np.array(S_P_500_date_list)
S_P_500_value_array = np.array(S_P_500_value_list)

## Load the stock symbols and specify hyper-parameters

In [5]:
# read the symbol table and keep the 'Symbol' column as a plain Python list
nasdaq_100_csv_file = stock_symbol_list_path+'nasdaq100list.csv'
Nasdaq_100_df = pd.read_csv(nasdaq_100_csv_file)
stock_symbol_list = Nasdaq_100_df['Symbol'].values.tolist()

In [6]:
# define the period list: stock_data_read_and_process computes its indicators for
# range(start_period, end_period), i.e. end_period itself is excluded
start_period = 1     # first indicator period (inclusive)
end_period = 25   # last indicator period + 1 (exclusive)
# Alpha Vantage API key. Prefer supplying it through the ALPHA_VANTAGE_API_KEY environment
# variable; the literal below is only kept as a fallback so existing runs keep working.
# NOTE(review): a credential committed to a notebook should be rotated and removed.
alpha_van_key_1 = os.environ.get('ALPHA_VANTAGE_API_KEY', 'CDG9CCIB6BNCIU6A')

## Call the function for every stock symbol to load and process the stock data

In [7]:
# Query and process every stock, then stack the per-stock results along axis 0.
# The per-stock pieces are collected in lists and concatenated once after the loop, so a
# failing first symbol cannot leave a scalar placeholder behind and the arrays are not
# re-copied on every iteration.
# seconds to wait after every query attempt (avoid query more than 5 times per minute)
query_pause_seconds = 12
# one list per value returned by stock_data_read_and_process, in return order:
# stock_data_X, stock_data_X_prev, S_P_500_stock, S_P_500_stock_prev,
# target_value_array, target_value_array_prev, target_gradient_array, target_gradient_array_prev,
# inc_dec_flag_array, up_down_trend_flag_array
n_returned_arrays = 10
collected_stock_results = [[] for _ in range(n_returned_arrays)]
# symbols whose query or processing failed (kept for inspection after the run)
failed_stock_symbols = []
for cSymbol in range(len(stock_symbol_list)):
    print('\rReading and processing stock data, '+str(cSymbol*100/len(stock_symbol_list))+'% has been finished...',end="",
          flush=True)
    try:
        # retrieve and compute stock price info
        this_stock_info = stock_data_read_and_process(stock_symbol=stock_symbol_list[cSymbol],
                                                      query_key=alpha_van_key_1,
                                                      stock_save_path=stock_info_save_path,
                                                      start_period=start_period,
                                                      end_period=end_period,
                                                      S_P_500_date_array=S_P_500_date_array,
                                                      S_P_500_value_array=S_P_500_value_array)
    except Exception as query_error:
        # report the symbol itself (cSymbol is only its integer position) and keep going;
        # `Exception` leaves KeyboardInterrupt free to stop the run
        print('\nStock \''+str(stock_symbol_list[cSymbol])+'\' failed in query! ('+repr(query_error)+')\n')
        failed_stock_symbols.append(stock_symbol_list[cSymbol])
    else:
        # the two indicator arrays are stored as returned
        collected_stock_results[0].append(this_stock_info[0])
        collected_stock_results[1].append(this_stock_info[1])
        # the remaining eight vectors become column vectors of shape [-1, 1]
        for cResult in range(2, n_returned_arrays):
            collected_stock_results[cResult].append(np.reshape(this_stock_info[cResult],[-1,1]))
        print('\rReading and processing stock data, '+str((cSymbol+1)*100/len(stock_symbol_list))+'% has been finished...',end="",
              flush=True)
    # wait after every attempt, failed ones included, since each attempt issues a query
    time.sleep(query_pause_seconds)
if len(collected_stock_results[0])==0:
    raise RuntimeError('No stock could be queried and processed, nothing to aggregate.')
# stack the per-stock pieces into the holistic arrays
(holistic_stock_data_X,
 holistic_stock_data_X_prev,
 holistic_S_P_500_stock,
 holistic_S_P_500_stock_prev,
 holistic_target_value_array,
 holistic_target_value_array_prev,
 holistic_target_gradient_array,
 holistic_target_gradient_array_prev,
 holistic_price_change_flag,
 holistic_price_trend_flag) = [np.concatenate(stock_pieces, axis=0) for stock_pieces in collected_stock_results]

Reading and processing stock data, 0.0% has been finished...

  CCI_rst_mat = (1/0.015)*np.divide(typical_price_mat-MA_tp_mat, MA_MD_mat)


Reading and processing stock data, 12.62135922330097% has been finished....

  DI_up_mat = 100*np.divide(DM_up_SMA,TR_mat)
  DI_down_mat = 100*np.divide(DM_down_SMA,TR_mat)


Reading and processing stock data, 22.33009708737864% has been finished....

  period_K_metric = 100*(close_price-period_lowest_low)/(period_highest_high-period_lowest_low)
  DI_up_mat = 100*np.divide(DM_up_SMA,TR_mat)
  DI_down_mat = 100*np.divide(DM_down_SMA,TR_mat)


Reading and processing stock data, 100.0% has been finished... finished....

In [8]:
# sanity check: number of aggregated samples (rows) and columns of the S&P-500 array
print(np.asarray(holistic_S_P_500_stock).shape)

(240752, 1)


In [9]:
# Write every aggregated array to its own .npy file inside stock_info_save_path.
# The table keeps the original order: the 'current' arrays first, then the 'prev' ones
# (the two flag arrays have no 'prev' counterpart).
npy_file_table = [
    ('data_X.npy', holistic_stock_data_X),
    ('SP500.npy', holistic_S_P_500_stock),
    ('target_value.npy', holistic_target_value_array),
    ('target_gradient.npy', holistic_target_gradient_array),
    ('price_change_flag.npy', holistic_price_change_flag),
    ('price_trend_flag.npy', holistic_price_trend_flag),
    ('data_X_prev.npy', holistic_stock_data_X_prev),
    ('SP500_prev.npy', holistic_S_P_500_stock_prev),
    ('target_value_prev.npy', holistic_target_value_array_prev),
    ('target_gradient_prev.npy', holistic_target_gradient_array_prev),
]
for npy_file_name, npy_array in npy_file_table:
    np.save(stock_info_save_path+npy_file_name, npy_array)