# Compile Features. 

In [1]:
# Python module. 
import re, os 
import numpy as np 
import pandas as pd 

# Change the current directory from (./notebook) to the project root directory, 
# i.e. the first ancestor whose path ends with "MADS-CAP". 
dir_start = os.getcwd() 
while not re.match(r".+MADS-CAP$", os.getcwd()): 
	dir_before = os.getcwd() 
	os.chdir("..") 

	# At the filesystem root, moving to ".." leaves the directory unchanged. 
	# Without this guard the loop would spin forever when the notebook is 
	# started outside of a MADS-CAP directory tree. 
	if os.getcwd() == dir_before: 
		# Restore the starting directory so the failure has no side effect. 
		os.chdir(dir_start) 
		raise FileNotFoundError(
			f"Could not find a parent directory ending with 'MADS-CAP' above ({dir_start})."
		) 

print(f"Current directory: ({os.getcwd()})") 

# For clearing safe warnings. Not important. 
from IPython.display import clear_output

# Custom modules. 
from source.modules.manage_files import ManageFiles 
from source.modules.processor_features import (
	compile_features_each_ticker, 
	concat_eventdates, concat_eachyear, 
	add_eventflag, merge_with_ticker, 
	rolling_sum_bygroup, process_quarter_date, 
	compute_price_to_ratio, 
)
from source.modules.processor_ticker import (
	get_ticker_yfinance, compute_forward_return
)
from source.modules.processor_technical import get_candlesticks 

# Custom configs. 
from source.config_py.config import (
	DIR_DATASET_CONSOLIDATED, DIR_DATASET_TICKER, DIR_DATASET_UTIL, DIR_DATASET_FUNDAMENTAL, 
	DIR_DATASET_ECONOMIC_DATA, DIR_DATASET_SENTIMENT, DIR_DATASET_TECH_IND, 
	TICKER_DATE_COLLECT, TICKER_TO_COLLECT, TICKER_TO_EXCLUDE, 
) 

Current directory: (/Users/lioneltay/Dropbox/Courses/michigan_mads/SIADS_697_/submission/MADS-CAP)


## Configurations (general). 

In [2]:
# Matplotlib setting. 
%matplotlib inline 

# Pandas DF display config (one option per call so each limit is easy to tune). 
pd.set_option("display.max_rows", 50) 
pd.set_option("display.max_columns", 100) 
pd.set_option("display.max_colwidth", 50) 

# File management setup. 
manage_files = ManageFiles() 

# Base list of tickers to collect data for: configured tickers minus the exclusions. 
ticker_to_collect = TICKER_TO_COLLECT.difference(TICKER_TO_EXCLUDE) 

# Whether to extend the base list with Jeremy's stock pick. 
# Set to (False) to collect the configured tickers only. 
use_jeremy_tickers = True 

if use_jeremy_tickers: 
	# NOTE(review): the ".pickle" file name suggests (load_cache_pk) unpickles the file; 
	# unpickling can execute arbitrary code, so only load files from a trusted source - confirm. 
	jeremy_tickers = set(manage_files.load_cache_pk(DIR_DATASET_UTIL, "jeremy_tickers_v1.pickle")) 
	ticker_to_collect = TICKER_TO_COLLECT.union(jeremy_tickers).difference(TICKER_TO_EXCLUDE) 

# Date range. 
date_beg, date_end = TICKER_DATE_COLLECT 

# Whether to load the cache file. 
load_cache = True 

# For clearing the output. Not important. 
clear_output()

## Get ticker data. 

In [3]:
# With (load_cache) set to (False) the existing data of every ticker is overwritten; 
# with (True) only the data that is not yet available is appended to the existing dataset. 
# Collecting the entire S&P ticker list takes about 30 minutes. 

filepath = os.path.join(DIR_DATASET_TICKER, "ticker_dailydata.csv") 

# The date range is handed over as plain keyword arguments. 
df_tickers = compile_features_each_ticker(
	get_ticker_yfinance, 
	filepath, 
	ticker_to_collect, 
	load_cache=load_cache, 
	date_beg=date_beg, 
	date_end=date_end, 
) 

# Show the collected daily ticker data. 
df_tickers 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY
...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS


## Configuration for processing pipeline. 

In [4]:
# Whether to load the cache file. 
# This re-declares the flag already assigned in the general configuration cell, 
# so the processing cells below use the value set here. 
load_cache = True 

# File name to save the consolidated feature dataset. 
# The processing cells below (labels, VIX, fundamentals, event flags, economic data) 
# all read from and write to this single file inside (DIR_DATASET_CONSOLIDATED). 
filename_output = "consolidated_feature.parquet"

## Create target labels. 

In [5]:
# Takes around 7 minutes to complete around 240 tickers. 

filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

if load_cache and os.path.isfile(filepath): 
	# Reuse the cached consolidated dataset. 
	df_feature_w_label = pd.read_parquet(filepath) 
else: 
	# Return lags. 
	returns_lags = [1] 

	# Autocorr lags. 
	autocorr_lags = [1, 2, 3] 

	# For trimming outliers. 
	trim_out = 0.0001 

	# Define the rolling window and min period for computing 
	# the mean reversion. 252 == 1-year. 
	window = 252 

	# Define market movement scale. 
	volt_lo, volt_hi = 0.25, 1.0 

	# Create numerical labels. The configured (window) is passed through 
	# rather than repeating the literal, so changing it above takes effect. 
	df_feature_w_label = compute_forward_return(
		df_tickers.copy(), returns_lags, trim_out, window=window, 
		autocorr_lags=autocorr_lags, volt_range=(volt_lo, volt_hi), 
	) 

	# Clear output. Not important. 
	clear_output() 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merge with VIX data. 

In [6]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Start from the cached dataset when caching is enabled and the file exists. 
if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# Every stage shares the same cache file, so the file existing does not prove 
# that this stage has already run (the label stage may have just written it). 
# Decide from the columns instead: the merged VIX columns carry the "vix_" prefix 
# (vix_date, vix_open, vix_close in the cached output). 
has_vix = any(str(col).startswith("vix_") for col in df_feature_w_label.columns) 

if not has_vix: 
	# Get VIX features. 
	df_vix = get_ticker_yfinance("^vix", date_beg, date_end) 

	# Make a copy of the dataframe to avoid error related to pandas (SettingWarnings). 
	usecols = ["date", "open", "close"] 
	df_feature_w_label = merge_with_ticker(
		df_feature_w_label.copy(), df_vix[usecols].copy(), merge_suffix="vix"
	) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merge with valuation / fundamental data. 

In [7]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Start from the cached dataset when caching is enabled and the file exists. 
if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# Every stage shares the same cache file, so the file existing does not prove 
# that this stage has already run (an earlier stage may have just written it). 
# Decide from the columns instead: the valuation columns carry the "vl_" prefix 
# (e.g. vl_eps, vl_pe, vl_ps in the cached output). 
has_valuation = any(str(col).startswith("vl_") for col in df_feature_w_label.columns) 

if not has_valuation: 
	infcols, usecols = ["ticker", "fiscalDateEnding"], ["eps", "eps_continuing", "rps"] 

	# Load the cached financial statement quarterly report. 
	filepath_feat = os.path.join(DIR_DATASET_FUNDAMENTAL, "financial_statement_qrt_ext.csv") 
	df_finstate_qrt = pd.read_csv(filepath_feat, usecols=infcols + usecols) 

	# Process and shift the quarter date. 
	df_finstate_qrt = process_quarter_date(df_finstate_qrt, datecol="fiscalDateEnding", shift_qrt=2) 

	# Compute the rolling sum (4 quarters) for quarterly report. 
	df_finstate_qrt = rolling_sum_bygroup(df_finstate_qrt, groupby=["ticker"], window=4, usecols=usecols) 

	# Move the date to quarter end to ensure the date is consistent. 
	# Subtracting one day first keeps a date that already is a quarter end on that same quarter end. 
	df_feature_w_label["date"] = pd.to_datetime(df_feature_w_label["date"]) 
	df_feature_w_label["date_quarter"] = df_feature_w_label["date"] - pd.DateOffset(days=1) + pd.tseries.offsets.QuarterEnd() 

	# Make a copy of the dataframe to avoid error related to pandas (SettingWarnings). 
	df_feature_w_label = merge_with_ticker(
		df_feature_w_label.copy(), df_finstate_qrt.copy(), merge_suffix="vl", 
		merge_datecol="date_quarter", merge_on=["ticker", "date_quarter"], relation="many_to_one"
	) 

	# Compute price-to ratio: output column -> (price column, per-share column). 
	compute_cols = {
		"vl_pe": ("open", "vl_eps"), 
		"vl_pe_continuing": ("open", "vl_eps_continuing"), 
		"vl_ps": ("open", "vl_rps"), 
	}
	df_feature_w_label = compute_price_to_ratio(df_feature_w_label, compute_cols) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merge with event date occurrences. 

In [8]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Start from the cached dataset when caching is enabled and the file exists. 
if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# Every stage shares the same cache file, so the file existing does not prove 
# that this stage has already run (an earlier stage may have just written it). 
# Decide from the columns instead: the event flag columns carry the "event_" prefix 
# (e.g. event_unemployment_rate in the cached output). 
has_eventflag = any(str(col).startswith("event_") for col in df_feature_w_label.columns) 

if not has_eventflag: 
	# Get the event date flags. 
	df_eventdates = concat_eventdates() 
	df_feature_w_label = add_eventflag(df_feature_w_label.copy(), df_eventdates) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merge with economic data. 

In [9]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Start from the cached dataset when caching is enabled and the file exists. 
if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# Every stage shares the same cache file, so the file existing does not prove 
# that this stage has already run (an earlier stage may have just written it). 
# Decide from the columns instead: the economic columns carry the "econ_" prefix 
# (e.g. econ_fed_ffr in the cached output). 
has_econ = any(str(col).startswith("econ_") for col in df_feature_w_label.columns) 

if not has_econ: 
	# Load the cached econometric dataset. 
	filepath_feat = os.path.join(DIR_DATASET_ECONOMIC_DATA, "econometric_fred.csv") 
	df_econometric = pd.read_csv(filepath_feat) 

	# Reshape from long to wide (one column per econometric), forward-fill 
	# the gaps, and turn the date index back into a regular column. 
	df_econometric = df_econometric \
		.pivot(index="date", columns="econometric", values="value") \
		.ffill() \
		.reset_index(drop=False) 

	# Make a copy of the dataframe to avoid error related to pandas (SettingWarnings). 
	df_feature_w_label = merge_with_ticker(
		df_feature_w_label.copy(), df_econometric.copy(), 
		merge_suffix="econ", merge_on=["date"], relation="many_to_one"
	) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merge with sentiment data. 

### Concat sentiment data. 

In [10]:
# Information columns. 
infcols = [
	"ticker", 
	"rpna_date_utc", 
	"timestamp_utc", 
	"news_type", 
	"source", 
	"position_name", 
] 
# Categorical feature columns. 
catcols = ["topic", "group", "type", "sub_type", "category"] 
# Numerical feature columns. 
numcols = [
	"relevance", "ess", "aes", "aev", 
	"ens", "ens_similarity_gap", "css", "nip", 
	"peq", "bee", "bmq", "bam", 
	"bca", "ber", "anl_chg", "mcq", 
] 

# Columns requested from the sentiment files: information, then categorical, then numerical. 
sentiment_keep_cols = [*infcols, *catcols, *numcols] 

# Get the RavenPack sentiment data via `concat_eachyear`, then expose its 
# `rpna_date_utc` column under the name `date`. 
df_sentiment = concat_eachyear(
	DIR_DATASET_SENTIMENT, 
	keep_tickers=ticker_to_collect, 
	keep_cols=sentiment_keep_cols, 
	yearrange=(2010, 2022), 
).rename(columns={"rpna_date_utc": "date"}) 

# NOTE(review): the preview of this frame renders `date` both as `2011-01-01` and as 
# `2020-10-21 00:00:00`, which suggests the values do not share one format. 
# Confirm they are normalised before `date` is used as a grouping or merge key. 

# Not important. Clears the mixed-types warning raised while reading the CSV files. 
clear_output() 

# Preview. 
df_sentiment 

Unnamed: 0,ticker,date,timestamp_utc,news_type,source,position_name,topic,group,type,sub_type,category,relevance,ess,aes,aev,ens,ens_similarity_gap,css,nip,peq,bee,bmq,bam,bca,ber,anl_chg,mcq
0,C,2011-01-01,2011-01-01 00:00:42.496,TABULAR-MATERIAL,B5569E,,,,,,,37.0,,78.0,208.0,,,52.0,41.0,50.0,50.0,100.0,50.0,50.0,50.0,50.0,50.0
1,C,2011-01-01,2011-01-01 05:01:43.246,FULL-ARTICLE,18A55F,,,,,,,3.0,,78.0,208.0,,,50.0,44.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
2,C,2011-01-01,2011-01-01 05:04:12.297,FULL-ARTICLE,18A55F,,,,,,,23.0,,78.0,208.0,,,50.0,40.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
3,C,2011-01-01,2011-01-01 05:04:52.345,FULL-ARTICLE,18A55F,,,,,,,4.0,,78.0,208.0,,,50.0,47.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
4,C,2011-01-02,2011-01-02 03:57:38.333,FULL-ARTICLE,B5569E,,,,,,,43.0,,78.0,208.0,,,50.0,41.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267685,GOOGL,2020-10-21 00:00:00,2020-10-21 14:21:54.693000,FULL-ARTICLE,B5569E,,,,,,,3.0,,66.0,145.0,,,52.0,25.0,50.0,50.0,100.0,50.0,50.0,50.0,50.0,50.0
267686,GOOGL,2020-10-21 00:00:00,2020-10-21 14:24:54.265000,PRESS-RELEASE,B5569E,,,,,,,4.0,,66.0,145.0,,,52.0,41.0,50.0,50.0,100.0,50.0,50.0,50.0,50.0,50.0
267687,GOOGL,2020-10-21 00:00:00,2020-10-21 14:29:41.502000,FULL-ARTICLE,B5569E,,,,,,,3.0,,66.0,145.0,,,52.0,34.0,50.0,50.0,100.0,50.0,50.0,50.0,50.0,50.0
267688,GOOGL,2020-10-21 00:00:00,2020-10-21 14:40:20.023000,FULL-ARTICLE,B5569E,,,,,,,45.0,,66.0,145.0,,,50.0,51.0,50.0,100.0,0.0,50.0,50.0,0.0,50.0,50.0


In [11]:
# Preview the news topics, types, and categories: 
# show only the rows where at least one categorical column is filled in. 
has_category = df_sentiment[catcols].notna().any(axis="columns") 
df_sentiment.loc[has_category] 

Unnamed: 0,ticker,date,timestamp_utc,news_type,source,position_name,topic,group,type,sub_type,category,relevance,ess,aes,aev,ens,ens_similarity_gap,css,nip,peq,bee,bmq,bam,bca,ber,anl_chg,mcq
10,C,2011-01-03,2011-01-03 00:31:00.121,NEWS-FLASH,B5569E,,business,price-targets,price-target,upgrade,price-target-upgrade-rater,20.0,50.0,78.0,208.0,100.0,100.00000,55.0,72.0,100.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
11,C,2011-01-03,2011-01-03 00:32:10.321,NEWS-FLASH,B5569E,,business,price-targets,price-target,upgrade,price-target-upgrade-rater,20.0,50.0,78.0,208.0,100.0,100.00000,55.0,72.0,100.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
15,C,2011-01-03,2011-01-03 01:21:49.902,FULL-ARTICLE,B5569E,,business,price-targets,price-target,upgrade,price-target-upgrade-rater,20.0,50.0,78.0,208.0,75.0,0.03529,56.0,60.0,100.0,100.0,100.0,50.0,100.0,100.0,50.0,50.0
17,C,2011-01-03,2011-01-03 01:34:37.367,FULL-ARTICLE,B5569E,,business,price-targets,price-target,upgrade,price-target-upgrade-rater,20.0,50.0,78.0,208.0,75.0,0.04337,56.0,48.0,100.0,100.0,100.0,50.0,100.0,100.0,50.0,50.0
24,C,2011-01-03,2011-01-03 03:47:26.398,NEWS-FLASH,B5569E,,business,price-targets,price-target,upgrade,price-target-upgrade-rater,20.0,50.0,78.0,208.0,100.0,100.00000,55.0,64.0,100.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267575,GOOGL,2020-10-21 00:00:00,2020-10-21 06:12:12.549000,FULL-ARTICLE,B5569E,,society,legal,legal-issues,,legal-issues-defendant,100.0,22.0,68.0,142.0,75.0,0.00058,55.0,42.0,100.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
267584,GOOGL,2020-10-21 00:00:00,2020-10-21 06:32:30.934000,FULL-ARTICLE,AA6E89,,society,legal,legal-issues,,legal-issues-defendant,100.0,22.0,67.0,143.0,56.0,0.01410,55.0,41.0,100.0,50.0,50.0,50.0,50.0,0.0,50.0,50.0
267585,GOOGL,2020-10-21 00:00:00,2020-10-21 06:32:30.940000,FULL-ARTICLE,AA6E89,,society,legal,legal-issues,,legal-issues-defendant,100.0,22.0,67.0,144.0,42.0,0.00000,50.0,41.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
267631,GOOGL,2020-10-21 00:00:00,2020-10-21 10:29:31.204000,FULL-ARTICLE,B5569E,,society,legal,legal-issues,,legal-issues-defendant,100.0,22.0,66.0,145.0,100.0,100.00000,53.0,52.0,100.0,50.0,100.0,50.0,50.0,50.0,50.0,100.0


### Aggregate sentiment data. 

In [12]:
# TODO (WIP): the aggregation part is unfinished. Every numerical column is 
# currently reduced with `max`; some features need a different aggregation method. 

# Process the sentiment data before merging with the ticker data on date. 
# Sentiment data contains multiple rows of information on each date, 
# so aggregate them first to make each (ticker, date) pair unique. 
groupcols = ["ticker", "date"] 

df_sentiment_agg = (
	df_sentiment 
	# Drop the rows in which every categorical column is missing. 
	.dropna(axis="index", how="all", subset=catcols) 
	# Group on the key columns directly and keep only the numerical columns. 
	.groupby(groupcols)[numcols] 
	# Use the string alias rather than `np.max`: recent pandas versions warn when a 
	# NumPy callable is passed to `agg`, and "max" keeps the current behaviour. 
	.agg("max") 
	# Turn the group keys back into regular `ticker` / `date` columns. 
	.reset_index(drop=False) 
) 

# Preview. 
df_sentiment_agg 

Unnamed: 0,ticker,date,relevance,ess,aes,aev,ens,ens_similarity_gap,css,nip,peq,bee,bmq,bam,bca,ber,anl_chg,mcq
0,AAPL,2010-01-04,100.0,67.0,57.0,77.0,100.0,27.42061,50.0,45.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
1,AAPL,2010-01-05,100.0,67.0,58.0,82.0,100.0,100.00000,52.0,45.0,50.0,50.0,100.0,50.0,50.0,50.0,50.0,100.0
2,AAPL,2010-01-06,100.0,40.0,56.0,81.0,100.0,33.95867,47.0,36.0,50.0,50.0,0.0,50.0,50.0,50.0,50.0,0.0
3,AAPL,2010-01-07,100.0,64.0,56.0,82.0,100.0,97.03524,50.0,34.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
4,AAPL,2010-01-08,100.0,67.0,56.0,82.0,100.0,3.00000,50.0,45.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54842,VZ,2022-04-25,100.0,40.0,66.0,108.0,100.0,100.00000,39.0,76.0,0.0,50.0,0.0,50.0,0.0,50.0,50.0,0.0
54843,VZ,2022-04-26,100.0,40.0,61.0,92.0,75.0,0.99991,50.0,43.0,0.0,100.0,50.0,50.0,50.0,0.0,50.0,100.0
54844,VZ,2022-04-27,100.0,40.0,62.0,86.0,100.0,1.00015,39.0,43.0,0.0,50.0,0.0,50.0,0.0,0.0,50.0,0.0
54845,VZ,2022-04-28,100.0,40.0,62.0,89.0,100.0,21.99604,47.0,39.0,50.0,50.0,0.0,50.0,50.0,50.0,50.0,0.0


### Merge with ticker data. 

In [13]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Prefix of the columns produced by the sentiment merge (e.g. `rp_relevance`). 
# NOTE(review): assumes `merge_with_ticker` names the merged columns `<merge_suffix>_*`, 
# as seen in the previews of this notebook (`rp_ticker`, `rp_date`, ...) - confirm. 
sentiment_prefix = "rp_" 

if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# The same cache path is reused by the other stages of this notebook, so the file 
# existing does not prove that this stage has already run. Check for the stage's own 
# columns instead; this also makes re-running the cell a no-op instead of a second merge. 
has_sentiment = any(
	str(col).startswith(sentiment_prefix) for col in df_feature_w_label.columns
) 

if not has_sentiment: 
	# Make a copy of the dataframe to avoid error related to pandas (SettingWarnings). 
	df_feature_w_label = merge_with_ticker(
		df_feature_w_label.copy(), df_sentiment_agg.copy(), 
		merge_suffix="rp", merge_on=["ticker", "date"], relation="one_to_many"
	) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Get technical indicator data. 

In [14]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Prefix of the columns produced by the technical indicator merge (e.g. `techind_macd_MACD`). 
# NOTE(review): assumes `merge_with_ticker` names the merged columns `<merge_suffix>_*`, 
# as seen in the previews of this notebook (`techind_ticker`, `techind_date`, ...) - confirm. 
techind_prefix = "techind_" 

if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# The same cache path is reused by the other stages of this notebook, so the file 
# existing does not prove that this stage has already run. Check for the stage's own 
# columns instead; this also makes re-running the cell a no-op instead of a second merge. 
has_techind = any(
	str(col).startswith(techind_prefix) for col in df_feature_w_label.columns
) 

if not has_techind: 
	# Load the cached technical indicator (only needed when the merge actually runs). 
	filepath_feat = os.path.join(DIR_DATASET_TECH_IND, "technical_indicator.csv") 
	df_techind = pd.read_csv(filepath_feat) 

	# Make a copy of the dataframe to avoid error related to pandas (SettingWarnings). 
	df_feature_w_label = merge_with_ticker(
		df_feature_w_label.copy(), df_techind.copy(), 
		merge_suffix="techind", merge_on=["ticker", "date"], relation="one_to_many"
	) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Get candlesticks data. 

In [15]:
filepath = os.path.join(DIR_DATASET_CONSOLIDATED, filename_output) 

# Prefix of the candlestick pattern columns (e.g. `candle_cdldoji`). 
# NOTE(review): assumes `get_candlesticks` names its output columns `candle_*`, 
# as seen in the previews of this notebook - confirm. 
candle_prefix = "candle_" 

if load_cache and os.path.isfile(filepath): 
	df_feature_w_label = pd.read_parquet(filepath) 

# The same cache path is reused by the other stages of this notebook, so the file 
# existing does not prove that this stage has already run. Check for the stage's own 
# columns instead; this also makes re-running the cell a no-op. 
has_candlesticks = any(
	str(col).startswith(candle_prefix) for col in df_feature_w_label.columns
) 

if not has_candlesticks: 
	# Get candlestick data. 
	df_feature_w_label = get_candlesticks(df_feature_w_label.copy()) 

	# Cache the processed dataset. 
	df_feature_w_label.to_parquet(filepath, index=False) 

# Preview. 
df_feature_w_label 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,ticker,return_c2c_lag1,tscore_c2c_lag1,return_c2c_lag1_autolag1,tscore_c2c_lag1_autolag1,return_c2c_lag1_autolag2,tscore_c2c_lag1_autolag2,return_c2c_lag1_autolag3,tscore_c2c_lag1_autolag3,vix_date,vix_open,vix_close,date_quarter,vl_ticker,vl_fiscalDateEnding,vl_eps,vl_eps_continuing,vl_rps,vl_date_quarter,vl_pe,vl_pe_continuing,vl_ps,event_jobs_opening_labor_turnover,event_non_farm_employment_adp_mom,event_non_farm_employment_mom,event_unemployment_claims,event_unemployment_rate,event_avg_hourly_earnings_mom,event_personal_dispensable_income_mom,event_personal_consumption_mom,event_ism_pmi_manufacturer,event_ism_pmi_services,event_chicago_pmi,event_industry_production_mom,event_phil_fed_manufacturer,event_capacity_utilisation,event_manufacturer_new_order_mom,event_manufacturer_new_order_ex_trans_mom,event_retail_sales_ex_auto_mom,event_retail_sales_mom,event_uom_consumer_sentiment,event_producer_ppi_mom,...,econ_consumer_sentiment_umich,econ_fed_ffr,econ_mortgage_rate_15yr,econ_mortgage_rate_30yr,econ_prime_loan_rate,rp_ticker,rp_date,rp_relevance,rp_ess,rp_aes,rp_aev,rp_ens,rp_ens_similarity_gap,rp_css,rp_nip,rp_peq,rp_bee,rp_bmq,rp_bam,rp_bca,rp_ber,rp_anl_chg,rp_mcq,techind_ticker,techind_date,techind_macd_MACD,techind_macd_MACD_Hist,techind_macd_MACD_Signal,techind_ema_t20_EMA,techind_ema_t50_EMA,techind_ema_t200_EMA,techind_rsi_t20_RSI,techind_rsi_t50_RSI,candle_cdl3blackcrows,candle_cdldarkcloudcover,candle_cdldoji,candle_cdldojistar,candle_cdldragonflydoji,candle_cdlengulfing,candle_cdleveningdojistar,candle_cdleveningstar,candle_cdlhammer,candle_cdlhangingman,candle_cdlharami,candle_cdlinvertedhammer,candle_cdlmorningdojistar,candle_cdlmorningstar,candle_cdlrickshawman,candle_cdlshootingstar,candle_cdltristar
0,1999-11-23,8.44,8.44,8.44,8.44,60000.0,0.0,0.0,TDY,,,,,,,,,1999-11-23,21.07,21.00,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1999-11-24,8.44,10.69,8.44,10.50,917700.0,0.0,0.0,TDY,0.244076,,,,,,,,1999-11-24,20.70,20.26,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1999-11-26,10.50,10.50,10.50,10.50,32700.0,0.0,0.0,TDY,0.000000,,0.244076,,,,,,1999-11-26,20.35,22.33,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1999-11-29,10.62,10.62,9.25,9.25,511000.0,0.0,0.0,TDY,-0.119048,,0.000000,,0.244076,,,,1999-11-29,24.27,23.57,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0,0,0
4,1999-11-30,9.38,9.38,8.50,8.81,2424000.0,0.0,0.0,TDY,-0.047568,,-0.119048,,0.000000,,0.244076,,1999-11-30,23.81,24.18,1999-12-31,,NaT,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,107.2,5.42,7.36,7.74,8.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150137,2022-02-18,45.49,45.79,44.42,44.79,4823100.0,0.0,0.0,MOS,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,0.021387,0.737431,2022-02-18,26.66,27.75,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.576886,10.599657,1.395693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-18,1.5046,0.0498,1.4548,43.7342,41.4175,36.1478,58.2486,57.2518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150138,2022-02-22,45.50,45.63,43.38,43.99,7738900.0,0.0,0.0,MOS,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,0.015649,0.516161,2022-02-22,31.80,28.81,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.579211,10.601987,1.396000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-22,1.3406,-0.0913,1.4319,43.7581,41.5182,36.2258,55.5044,56.0585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150139,2022-02-23,42.59,45.59,42.16,44.99,9786700.0,0.0,0.0,MOS,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,-0.005859,-0.315586,2022-02-23,28.04,31.02,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,9.902606,9.923926,1.306717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-23,1.2770,-0.1239,1.4009,43.8755,41.6544,36.3130,58.0956,57.1937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1150140,2022-02-24,43.93,47.89,43.87,45.14,10706300.0,0.0,0.0,MOS,0.003334,0.058647,0.022732,0.842288,-0.017861,-0.776731,-0.022266,-0.952283,2022-02-24,37.50,30.32,2022-03-31,MOS,2021-12-31,4.300888,4.291648,32.593133,2022-03-31,10.214170,10.236160,1.347830,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,62.8,0.08,3.00,3.76,3.25,,,,,,,,,,,,,,,,,,,MOS,2022-02-24,1.2246,-0.1411,1.3657,43.9959,41.7911,36.4008,58.4736,57.3606,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
