# MADS Milestone 1 Project (Preprocessing Steps). 

In [1]:
# Python module. 
import os, re 
import datetime as dt
import numpy as np 
import pandas as pd 
import yfinance as yf 

# Change the current directory from (./notebook) to root directory. 
if re.match(r".+/notebook", os.getcwd()): 
	os.chdir("../..") 

# Custom module. 
from source.config.config import (
	DATASET_DIR, TICKER_DATE_COLLECT, TICKER_TO_COLLECT, 
	NEWS_KEYWORDS_MAPPING, EVENTS_FILENAMES, 
	INTENT_MEASURES, RE_PATS_AND_CONDITIONS, 
	METRIC_CHOICES, METRICS_TO_IDENTIFY_CONVERGENCE, 
)

# Create the directory if it doesn't exist. 
if not os.path.exists(DATASET_DIR):
    os.makedirs(DATASET_DIR) 

# For clearing safe warnings. Not important. 
from IPython.core.display import clear_output
clear_output() 

## Configurations (general). 

In [2]:
# Pandas DF config. 
pd.set_option("display.max_rows", 50, "display.max_columns", 50, "display.max_colwidth", 200)

# For clearing the output. Not important. 
clear_output()

# Collect & Consolidate Ticker Data. 

In [3]:
ls_tickers = [(t, yf.Ticker(t)) for t in TICKER_TO_COLLECT] 

# Initiate an empty dataframe to consolidate all the ticker data. 
df_tickers = pd.DataFrame() 

# Consolidate all tickers into a single dataframe. 
for t, ticker in ls_tickers: 
	# Read the data. 
	df_ticker = ticker.history(period="max", interval="1d", start=TICKER_DATE_COLLECT[0], end=TICKER_DATE_COLLECT[1], auto_adjust=True, rounding=True) 

	# Add a column to indicate the ticker name. 
	df_ticker["ticker"] = t

	# Convert column to lowercase. 
	df_ticker.columns = [c.lower() for c in df_ticker.columns] 

	# We will be computing 3 types of price difference: 
	# 1. Gapping / Close market price change. Difference between previous day 
	# 	 closing price and current day open price. 
	# 2. Open market price change. Difference between current day open and closing price. 
	# 3. Daily price change. Difference between previous day and current day closing price.

	# 1. price_chg_close_to_open 
	# Measure the change using the following day open price after the event has occured. 
	next_open = df_ticker["open"].shift(-1) 
	df_ticker["price_chg_c2o"] = (next_open - df_ticker["close"]) / df_ticker["close"] 

	# 2. price_chg_open_to_close
	df_ticker["price_chg_o2c"] = (df_ticker["close"] - df_ticker["open"]) / df_ticker["open"] 

	# 3. price_chg_close_to_close
	df_ticker["price_chg_c2c"] = df_ticker["close"].pct_change(1) 

	# Compute the rolling median for volume over a specific window. 
	df_ticker["volume_rollmed"] = df_ticker["volume"].rolling(window=90, min_periods=90, win_type=None).median() 

	# Compute the difference between each volume with the 3 months rolling median volume. 
	df_ticker["volume_diff_to_med"] = df_ticker["volume"] - df_ticker["volume_rollmed"] 

	# Compute the percent change from the 3 months rolling median volume. Comparing 
	# percent change between each period is easier than looking at the difference. 
	df_ticker["volume_pchg_from_med"] = df_ticker["volume_diff_to_med"] / df_ticker["volume_rollmed"] 

	# Compute the t-score for price change. 
	price_chg_c2o_rollavg = df_ticker["price_chg_c2o"].rolling(window=360, min_periods=360, win_type=None).mean() 
	price_chg_o2c_rollavg = df_ticker["price_chg_o2c"].rolling(window=360, min_periods=360, win_type=None).mean() 
	price_chg_c2c_rollavg = df_ticker["price_chg_c2c"].rolling(window=360, min_periods=360, win_type=None).mean() 

	price_chg_c2o_rollstd = df_ticker["price_chg_c2o"].rolling(window=360, min_periods=360, win_type=None).std(ddof=1) 
	price_chg_o2c_rollstd = df_ticker["price_chg_o2c"].rolling(window=360, min_periods=360, win_type=None).std(ddof=1) 
	price_chg_c2c_rollstd = df_ticker["price_chg_c2c"].rolling(window=360, min_periods=360, win_type=None).std(ddof=1) 

	df_ticker["tscore_c2o"] = (df_ticker["price_chg_c2o"] - price_chg_c2o_rollavg) / price_chg_c2o_rollstd 
	df_ticker["tscore_o2c"] = (df_ticker["price_chg_o2c"] - price_chg_o2c_rollavg) / price_chg_o2c_rollstd 
	df_ticker["tscore_c2c"] = (df_ticker["price_chg_c2c"] - price_chg_c2c_rollavg) / price_chg_c2c_rollstd 

	# Concat the data into a long table format in (pd.DataFrame) object. 
	df_tickers = pd.concat([df_tickers, df_ticker]) 

# Reset the index. 
df_tickers = df_tickers.reset_index(drop=False) 

# Convert columns into lowercase. 
df_tickers.columns = [c.lower() for c in df_tickers.columns] 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,tscore_c2o,tscore_o2c,tscore_c2c
0,1998-12-22,12.09,12.09,11.97,12.02,55887,0.0,0.0,XLF,-0.004160,-0.005790,,,,,,,
1,1998-12-23,11.97,12.20,11.97,12.20,78784,0.0,0.0,XLF,0.000000,0.019215,0.014975,,,,,,
2,1998-12-24,12.20,12.28,12.16,12.28,43824,0.0,0.0,XLF,-0.000814,0.006557,0.006557,,,,,,
3,1998-12-28,12.27,12.27,12.09,12.12,51948,0.0,0.0,XLF,0.000825,-0.012225,-0.013029,,,,,,
4,1998-12-29,12.13,12.25,11.99,12.25,100819,0.0,0.0,XLF,-0.005714,0.009893,0.010726,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54170,2021-12-10,56.70,56.74,55.50,56.51,19524200,0.0,0.0,XLE,-0.009025,-0.003351,0.007488,26789050.0,-7264850.0,-0.271187,-0.813114,-0.214171,0.258881
54171,2021-12-13,56.00,56.17,54.67,54.94,28604000,0.0,0.0,XLE,-0.006189,-0.018929,-0.027783,27344050.0,1259950.0,0.046078,-0.591711,-1.137226,-1.345164
54172,2021-12-14,54.60,55.55,54.52,54.71,28798800,0.0,0.0,XLE,-0.001462,0.002015,-0.004186,27952200.0,846600.0,0.030287,-0.229611,0.112317,-0.267088
54173,2021-12-15,54.63,54.77,53.23,54.45,33858100,0.0,0.0,XLE,0.008632,-0.003295,-0.004752,28378450.0,5479650.0,0.193092,0.552860,-0.199646,-0.288803


# Collect & Consolidate VIX Data. 

## Collect VIX Data. 

In [4]:
# Collect VIX data to merge with the cosolidated ticker data. 
df_vix = yf.Ticker("^VIX") 
df_vix = df_vix.history(period="max", interval="1d", start=TICKER_DATE_COLLECT[0], end=TICKER_DATE_COLLECT[1], auto_adjust=True, rounding=True) 

# Convert column to lowercase. 
df_vix.columns = [c.lower() for c in df_vix.columns] 

# Convert index name to lowercase. 
df_vix.index.name = df_vix.index.name.lower() 

# Preview. 
df_vix

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock splits
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1998-11-30,22.79,26.01,22.79,26.01,0,0,0
1998-12-01,27.38,27.40,24.84,24.97,0,0,0
1998-12-02,25.63,26.52,25.28,25.43,0,0,0
1998-12-03,25.53,28.77,25.13,28.70,0,0,0
1998-12-04,26.28,26.76,25.08,25.31,0,0,0
...,...,...,...,...,...,...,...
2021-12-10,21.27,21.30,18.69,18.69,0,0,0
2021-12-13,19.29,21.18,18.96,20.31,0,0,0
2021-12-14,19.67,23.00,19.67,21.89,0,0,0
2021-12-15,21.60,23.47,19.02,19.29,0,0,0


## Process VIX Data. 

In [5]:
# Compute percentage change. 
df_vix["chg_c2c"] = df_vix["close"].pct_change(1) 

# Compute tscore. 
vix_chg_c2c_rollavg = df_vix["chg_c2c"].rolling(window=360, min_periods=360, win_type=None).mean() 
vix_chg_c2c_rollavg = df_vix["chg_c2c"].rolling(window=360, min_periods=360, win_type=None).std(ddof=1) 
df_vix["tscore_c2c"] = (df_vix["chg_c2c"] - vix_chg_c2c_rollavg) / vix_chg_c2c_rollavg 

# Add prefix to column names. 
base_columns = ["open", "close", 'chg_c2c', 'tscore_c2c']
df_vix = df_vix[base_columns]
renamed_columns = [f"vix_{column}".lower() for column in base_columns] 
df_vix.columns = renamed_columns 

# Preview. 
df_vix

Unnamed: 0_level_0,vix_open,vix_close,vix_chg_c2c,vix_tscore_c2c
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-11-30,22.79,26.01,,
1998-12-01,27.38,24.97,-0.039985,
1998-12-02,25.63,25.43,0.018422,
1998-12-03,25.53,28.70,0.128588,
1998-12-04,26.28,25.31,-0.118118,
...,...,...,...,...
2021-12-10,21.27,18.69,-0.133920,-2.530976
2021-12-13,19.29,20.31,0.086677,-0.009505
2021-12-14,19.67,21.89,0.077794,-0.106837
2021-12-15,21.60,19.29,-0.118776,-2.361829


# Merge Ticker & VIX Data. 

In [6]:
# Merge VIX with the ticker data. 
df_tickers = df_tickers.merge(right=df_vix, how="left", left_on="date", right_on="date", validate="many_to_one") 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,tscore_c2o,tscore_o2c,tscore_c2c,vix_open,vix_close,vix_chg_c2c,vix_tscore_c2c
0,1998-12-22,12.09,12.09,11.97,12.02,55887,0.0,0.0,XLF,-0.004160,-0.005790,,,,,,,,24.05,22.78,-0.045264,
1,1998-12-23,11.97,12.20,11.97,12.20,78784,0.0,0.0,XLF,0.000000,0.019215,0.014975,,,,,,,21.89,20.21,-0.112818,
2,1998-12-24,12.20,12.28,12.16,12.28,43824,0.0,0.0,XLF,-0.000814,0.006557,0.006557,,,,,,,21.00,21.48,0.062840,
3,1998-12-28,12.27,12.27,12.09,12.12,51948,0.0,0.0,XLF,0.000825,-0.012225,-0.013029,,,,,,,22.92,23.50,0.094041,
4,1998-12-29,12.13,12.25,11.99,12.25,100819,0.0,0.0,XLF,-0.005714,0.009893,0.010726,,,,,,,23.68,22.18,-0.056170,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54170,2021-12-10,56.70,56.74,55.50,56.51,19524200,0.0,0.0,XLE,-0.009025,-0.003351,0.007488,26789050.0,-7264850.0,-0.271187,-0.813114,-0.214171,0.258881,21.27,18.69,-0.133920,-2.530976
54171,2021-12-13,56.00,56.17,54.67,54.94,28604000,0.0,0.0,XLE,-0.006189,-0.018929,-0.027783,27344050.0,1259950.0,0.046078,-0.591711,-1.137226,-1.345164,19.29,20.31,0.086677,-0.009505
54172,2021-12-14,54.60,55.55,54.52,54.71,28798800,0.0,0.0,XLE,-0.001462,0.002015,-0.004186,27952200.0,846600.0,0.030287,-0.229611,0.112317,-0.267088,19.67,21.89,0.077794,-0.106837
54173,2021-12-15,54.63,54.77,53.23,54.45,33858100,0.0,0.0,XLE,0.008632,-0.003295,-0.004752,28378450.0,5479650.0,0.193092,0.552860,-0.199646,-0.288803,21.60,19.29,-0.118776,-2.361829


# Save Processing Stage 1 (Processed Ticker Data). 

In [7]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_1.csv") 
# df_tickers.write_to_csv(filepath, index=False) 

# print(f"Write to ({filepath})") 

# Load Processing Stage 1 (Processed Ticker Data). 

In [8]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_1.csv") 
# df_tickers = pd.read_csv(filepath) 

# print(f"Read from ({filepath})") 

# Collect News Headline Keywords. 

## Read News Data. 

In [9]:
# Read the data. 
filepath = os.path.join(DATASET_DIR, "raw_partner_headlines.csv") 
df_articles = pd.read_csv(filepath, usecols=["date", "publisher", "headline"]) 

# Preview. 
df_articles

Unnamed: 0,headline,publisher,date
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,GuruFocus,2020-06-01 00:00:00
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,Zacks,2020-05-18 00:00:00
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,GuruFocus,2020-05-15 00:00:00
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",GuruFocus,2020-05-15 00:00:00
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,GuruFocus,2020-05-12 00:00:00
...,...,...,...
1845554,Consumer Cyclical Sector Wrap,webmaster,2012-08-20 00:00:00
1845555,Consumer Cyclical Sector Wrap,webmaster,2012-07-23 00:00:00
1845556,Zacks #5 Rank Additions for Monday - Tale of the Tape,Zacks,2012-04-23 00:00:00
1845557,4 Stock Strategies From Wall Street: Feb. 9 (Update 1),TheStreet.Com,2012-02-09 00:00:00


## Identify News Headline Keywords. 

In [10]:
topics = list(NEWS_KEYWORDS_MAPPING.keys()) 

matching_headlines = dict() 

# Identify the headline keywords for each news topic. 
for topic in topics: 
	# Identify headline keywords via regex. 
	re_pattern = f"""({"|".join(NEWS_KEYWORDS_MAPPING[topic])})"""

	# Extract matches. 
	re_matches = df_articles.loc[:, "headline"].str.findall(re_pattern, flags=re.IGNORECASE) 
	match_indicator = re_matches.str.len() > 0 

	# Consolidate the dates for each each headline keyword. 
	matches = match_indicator[match_indicator == True] 
	dates = pd.concat([matches, df_articles["date"]], axis=1, join="inner")["date"].str[:-9] 

	# Store the dates. 
	matching_headlines[topic] = dates 

# Concat the dataframes. 
df_headline_keywords = pd.concat(list(matching_headlines.values()), keys=list(matching_headlines.keys()), axis=1)

# Preview. 
df_headline_keywords

Unnamed: 0,news_ffr,news_fed,news_earnings,news_interest_rate,news_rate_hikes
1,,,2020-05-18,,
6,,,2020-05-07,,
27,,,2020-03-19,,
36,,,2020-02-19,,
38,,,2020-02-18,,
...,...,...,...,...,...
1845533,,,2016-04-15,,
1845534,,,2016-04-15,,
1845536,,,2015-12-09,,
1845538,,,2015-08-19,,


## Save Headline Keywords Data. 

In [11]:
# filepath = os.path.join(DATASET_DIR, "news_headline_keywords.csv") 
# df_headline_keywords.write_to_csv(filepath, index=False) 

# print(f"Write to ({filepath})") 

## Load Headline Keywords Data. 

In [12]:
# filepath = os.path.join(DATASET_DIR, "news_headline_keywords.csv") 
# df_headline_keywords = pd.read_csv(filepath) 

# print(f"Read from ({filepath})") 

# Consolidate Event Data & Merge With Ticker Data. 

## Consolidate Event Data. 

In [13]:
df_event_dates = pd.DataFrame() 

# Read the data and consolidate all the event dates. 
for filename in EVENTS_FILENAMES: 
	print(f"Read from ({filename})") 

	# Read the data. 
	filepath = os.path.join(DATASET_DIR, filename) 
	df_dates = pd.read_csv(filepath)

	# Sort the dates. Just to ensure it's in order. 
	event_dates = df_dates.sort_values(by=df_dates.columns.to_list()) 

	# Concat the dataframe. 
	df_event_dates = pd.concat([df_event_dates, event_dates], axis="columns") 

# Preview. 
df_event_dates

Read from (observance_dates_ext.csv)
Read from (santa_rally.csv)
Read from (triple_witching_week.csv)
Read from (economic_reported_date.csv)
Read from (news_headline_keywords.csv)


Unnamed: 0,black_friday,christmas,columbus,cyber_monday,good_friday,labor,martin_lut_king,new_year,thanksgiving,us_event_sep11,us_independence,us_memorial,us_president,us_veterans,valentine,santa_rally,tww_trdrday,jobs_opening_labor_turnover,non_farm_employment_adp_mom,non_farm_employment_mom,unemployment_claims,unemployment_rate,avg_hourly_earnings_mom,personal_dispensable_income_mom,personal_consumption_mom,...,producer_ppi_mom,producer_ppi_ex_food_energy_mom,consumer_cpi_mom,consumer_cpi_ex_food_energy_mom,pce_ex_food_energy_mom,housing_hpi_mom,housing_hpi_cs_yoy,building_permit,housing_starts,exist_home_sales,new_home_sales,gdp_us,gdp_deflator,gdp_advance_us,crude_oil_inventory,natural_gas_inventory,fomc_presscf,fomc_minutes,opec,opec_jmmc,news_ffr,news_fed,news_earnings,news_interest_rate,news_rate_hikes
0,1998-11-27,1998-12-25,1998-10-12,1998-11-30,1998-04-10,1998-09-07,1998-01-19,1998-01-01,1998-11-26,1998-09-11,1998-07-04,1998-05-25,1998-02-16,1998-11-11,1998-02-14,1997-12-31,1998-03-16,2010-10-07,2007-01-03,2007-01-05,2007-01-04,2007-01-05,2007-01-05,2007-02-01,2007-02-01,...,2007-01-17,2007-01-17,2007-01-18,2007-01-18,2007-02-01,2008-03-25,2007-01-30,2007-01-18,2007-01-18,2007-01-25,2007-01-26,2007-03-29,2007-03-29,2007-01-31,2007-01-04,2007-01-05,2011-04-28,2007-01-04,2007-03-15,2017-01-22,,,2020-05-18,,
1,1998-11-25,1998-12-24,1998-10-09,1998-11-27,1998-04-09,1998-09-04,1998-01-16,1997-12-31,1998-11-25,1998-09-10,1998-07-02,1998-05-22,1998-02-13,1998-11-10,1998-02-13,1997-12-30,1998-03-17,2010-11-09,2007-01-31,2007-02-02,2007-01-11,2007-02-02,2007-02-02,2007-03-01,2007-03-01,...,2007-02-16,2007-02-16,2007-02-21,2007-02-21,2007-03-01,2008-04-22,2007-02-27,2007-02-16,2007-02-16,2007-02-27,2007-02-28,2007-06-28,2007-06-28,2007-04-27,2007-01-10,2007-01-11,2011-06-23,2007-02-22,2007-09-11,2017-02-24,,,2020-05-07,,
2,1998-11-30,1998-12-28,1998-10-13,1998-12-01,1998-04-13,1998-09-08,1998-01-20,1998-01-02,1998-11-27,1998-09-14,1998-07-06,1998-05-26,1998-02-17,1998-11-12,1998-02-17,1997-12-29,1998-03-18,2010-12-07,2007-03-07,2007-03-09,2007-01-18,2007-03-09,2007-03-09,2007-03-30,2007-03-30,...,2007-03-15,2007-03-15,2007-03-16,2007-03-16,2007-03-30,2008-05-22,2007-03-27,2007-03-20,2007-03-20,2007-03-23,2007-03-26,2007-09-27,2007-09-27,2007-07-27,2007-01-18,2007-01-18,2011-11-03,2007-04-12,2007-12-05,2017-03-26,,,2020-03-19,,
3,1998-11-24,1998-12-23,1998-10-08,1998-11-25,1998-04-08,1998-09-03,1998-01-15,1997-12-30,1998-11-24,1998-09-09,1998-07-01,1998-05-21,1998-02-12,1998-11-09,1998-02-12,1997-12-26,1998-03-19,2011-01-11,2007-04-04,2007-04-06,2007-01-25,2007-04-06,2007-04-06,2007-04-30,2007-04-30,...,2007-04-13,2007-04-13,2007-04-17,2007-04-17,2007-04-30,2008-06-24,2007-04-24,2007-04-17,2007-04-17,2007-04-24,2007-04-25,2007-12-20,2007-12-20,2007-10-31,2007-01-24,2007-01-25,2012-01-26,2007-05-31,2008-02-01,2017-04-28,,,2020-02-19,,
4,1998-12-01,1998-12-29,1998-10-14,1998-12-02,1998-04-14,1998-09-09,1998-01-21,1998-01-05,1998-11-30,1998-09-15,1998-07-07,1998-05-27,1998-02-18,1998-11-13,1998-02-18,1997-12-24,1998-03-20,2011-02-08,2007-05-02,2007-05-04,2007-02-01,2007-05-04,2007-05-04,2007-06-01,2007-06-01,...,2007-05-11,2007-05-11,2007-05-15,2007-05-15,2007-06-01,2008-07-22,2007-05-29,2007-05-16,2007-05-16,2007-05-25,2007-05-24,2008-03-27,2008-03-27,2008-01-30,2007-01-31,2007-02-01,2012-04-26,2007-07-20,2008-03-05,2017-05-24,,,2020-02-18,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235511,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,2016-04-15,,
235512,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,2016-04-15,,
235513,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,2015-12-09,,
235514,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,2015-08-19,,


## Store The Factor Names. 

In [14]:
# Store factor names in a list to keep track of them. 
factors = df_event_dates.columns.tolist() 

# Preview. 
factors

['black_friday',
 'christmas',
 'columbus',
 'cyber_monday',
 'good_friday',
 'labor',
 'martin_lut_king',
 'new_year',
 'thanksgiving',
 'us_event_sep11',
 'us_independence',
 'us_memorial',
 'us_president',
 'us_veterans',
 'valentine',
 'santa_rally',
 'tww_trdrday',
 'jobs_opening_labor_turnover',
 'non_farm_employment_adp_mom',
 'non_farm_employment_mom',
 'unemployment_claims',
 'unemployment_rate',
 'avg_hourly_earnings_mom',
 'personal_dispensable_income_mom',
 'personal_consumption_mom',
 'ism_pmi_manufacturer',
 'ism_pmi_services',
 'chicago_pmi',
 'industry_production_mom',
 'phil_fed_manufacturer',
 'capacity_utilisation',
 'manufacturer_new_order_mom',
 'manufacturer_new_order_ex_trans_mom',
 'retail_sales_ex_auto_mom',
 'retail_sales_mom',
 'uom_consumer_sentiment',
 'producer_ppi_mom',
 'producer_ppi_ex_food_energy_mom',
 'consumer_cpi_mom',
 'consumer_cpi_ex_food_energy_mom',
 'pce_ex_food_energy_mom',
 'housing_hpi_mom',
 'housing_hpi_cs_yoy',
 'building_permit',
 'hou

## Merge Event Data With Ticker Data. 

In [15]:
for event_name in df_event_dates.columns: 
	# Default to 0. 
	df_tickers[event_name] = 0 

	# Ensure the datetime is converted to str to be able to match dates. 
	df_tickers["date"] = df_tickers["date"].astype(str) 
	
	# Filter non economic report dates and assign 1. 
	boo_dates = df_tickers["date"].isin(df_event_dates[event_name].values) 
	df_tickers.loc[boo_dates, event_name] = 1 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,tscore_c2o,tscore_o2c,tscore_c2c,vix_open,vix_close,vix_chg_c2c,vix_tscore_c2c,black_friday,christmas,columbus,...,producer_ppi_mom,producer_ppi_ex_food_energy_mom,consumer_cpi_mom,consumer_cpi_ex_food_energy_mom,pce_ex_food_energy_mom,housing_hpi_mom,housing_hpi_cs_yoy,building_permit,housing_starts,exist_home_sales,new_home_sales,gdp_us,gdp_deflator,gdp_advance_us,crude_oil_inventory,natural_gas_inventory,fomc_presscf,fomc_minutes,opec,opec_jmmc,news_ffr,news_fed,news_earnings,news_interest_rate,news_rate_hikes
0,1998-12-22,12.09,12.09,11.97,12.02,55887,0.0,0.0,XLF,-0.004160,-0.005790,,,,,,,,24.05,22.78,-0.045264,,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1998-12-23,11.97,12.20,11.97,12.20,78784,0.0,0.0,XLF,0.000000,0.019215,0.014975,,,,,,,21.89,20.21,-0.112818,,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1998-12-24,12.20,12.28,12.16,12.28,43824,0.0,0.0,XLF,-0.000814,0.006557,0.006557,,,,,,,21.00,21.48,0.062840,,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1998-12-28,12.27,12.27,12.09,12.12,51948,0.0,0.0,XLF,0.000825,-0.012225,-0.013029,,,,,,,22.92,23.50,0.094041,,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1998-12-29,12.13,12.25,11.99,12.25,100819,0.0,0.0,XLF,-0.005714,0.009893,0.010726,,,,,,,23.68,22.18,-0.056170,,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54170,2021-12-10,56.70,56.74,55.50,56.51,19524200,0.0,0.0,XLE,-0.009025,-0.003351,0.007488,26789050.0,-7264850.0,-0.271187,-0.813114,-0.214171,0.258881,21.27,18.69,-0.133920,-2.530976,0,0,0,...,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54171,2021-12-13,56.00,56.17,54.67,54.94,28604000,0.0,0.0,XLE,-0.006189,-0.018929,-0.027783,27344050.0,1259950.0,0.046078,-0.591711,-1.137226,-1.345164,19.29,20.31,0.086677,-0.009505,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54172,2021-12-14,54.60,55.55,54.52,54.71,28798800,0.0,0.0,XLE,-0.001462,0.002015,-0.004186,27952200.0,846600.0,0.030287,-0.229611,0.112317,-0.267088,19.67,21.89,0.077794,-0.106837,0,0,0,...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54173,2021-12-15,54.63,54.77,53.23,54.45,33858100,0.0,0.0,XLE,0.008632,-0.003295,-0.004752,28378450.0,5479650.0,0.193092,0.552860,-0.199646,-0.288803,21.60,19.29,-0.118776,-2.361829,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


# Save Processing Stage 2 (Processed Ticker Data). 

In [16]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_2.csv") 
# df_tickers.write_to_csv(filepath, index=False) 

# print(f"Write to ({filepath})") 

# Load Proocessing Stage 2 (Processed Ticker Data). 

In [17]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_2.csv") 
# df_tickers = pd.read_csv(filepath) 

# print(f"Read from ({filepath})") 

# Compute Aggregation. 

In [18]:
# An empty dataframe to consolidate all the aggregates for 
# different metrics. 
df_consolidated_agg = pd.DataFrame() 

# To store the ticker and factor values. 
arr_tickers, arr_factors = [], [] 

# Start consolidating the aggregates. This will take a while. 
for intent_measure, metrics in INTENT_MEASURES.items(): 

	# Define what we want to measure. 
	for metric in metrics: 

		# Define whether to measure the event or non-occuring event period. 
		for measure_event_period in [0, 1]: 

			# Define the variable to assign as the pivot value. 
			pivot_value = metric 

			# Define the the aggregate function and the name for the aggregated value. 
			aggvalue_name, aggfunc = metric, np.mean 

			# Define the positive threshold for probability count. 
			prob_threshold = 0 

			# An empty dataframe to consolidate all the pivot tables. 
			df_aggregates = pd.DataFrame() 

			# Consolidate all the pivot tables. 
			for factor in factors: 
				# Remove event or non-occuring event period (either 1 or 0) for that factor. 
				df_processed = df_tickers.loc[df_tickers[factor] == measure_event_period, :] 

				# Convert all negative to positive unless we are looking to measure 
				# the directional probability or distance from the threshold. 
				if intent_measure == "mag": 
					df_processed.loc[:, pivot_value] = df_processed.loc[:, pivot_value].abs() 

				# To compute directional probabilities, we need to conver the negatives to 0 
				# and positives to 1 before aggregating it with the mean. 
				if intent_measure in ["dir", "abv"]: 
					df_processed.loc[df_processed[pivot_value] <= prob_threshold, pivot_value] = 0 
					df_processed.loc[df_processed[pivot_value] >  prob_threshold, pivot_value] = 1 

				# Convert to pivot table. Average the value across entire the timeframe. 
				df_pivottable = df_processed.pivot_table(values=pivot_value, index="ticker", columns=factor, aggfunc=aggfunc) 

				# Rename the column heading. 
				df_pivottable.columns.name = "factor" 

				# Rename the column. There should be only 1 colume in this case. 
				# The original column name will either be 0 or 1. 
				try: 
					df_pivottable.columns = [factor] 
				except:
					print("Failed on Metric: " + metric + " Factor: " + factor)

				# Combine all the pivot tables into a single dataframe. 
				df_aggregates = pd.concat([df_aggregates, df_pivottable], axis="columns") 

			# Convert into long table. 
			df_aggregates = df_aggregates \
				.reset_index(drop=False) \
				.melt(id_vars="ticker", var_name="factor", value_vars=df_aggregates.columns, value_name=aggvalue_name) 

			# Rename the metric name. Example (tscore_c2c) will be (tscore_c2c_mag) or 
			# (price_chg_c2o) will be (price_chg_c2o_dir). 
			metric_newname = f"{metric}_{intent_measure}_{measure_event_period}" 
			df_aggregates = df_aggregates.rename(mapper={metric: metric_newname}, axis="columns") 

			# Combine all the pivot tables into a single dataframe. 
			df_consolidated_agg = pd.concat([df_consolidated_agg, df_aggregates[[metric_newname]]], axis="columns") 

			if not arr_tickers and not arr_factors: 
				arr_tickers = df_aggregates["ticker"].to_list() 
				arr_factors = df_aggregates["factor"].to_list() 

		if intent_measure == "mag": 
			# Compute the value difference between occurring event and non-occuring event. 
			df_consolidated_agg[f"{metric}_{intent_measure}_diff"] = \
				df_consolidated_agg[f"{metric}_{intent_measure}_1"] - df_consolidated_agg[f"{metric}_{intent_measure}_0"] 

# Add new columns for tickers and factors since the pivot table doesn't contain these columns. 
df_consolidated_agg["ticker"] = arr_tickers 
df_consolidated_agg["factor"] = arr_factors 

# Rearragne the columns. 
cols = ["ticker", "factor"] + df_consolidated_agg.columns[:-2].to_list() 
df_consolidated_agg = df_consolidated_agg[cols]

# Preview. The warning is safe to ignore. 
# False positive could occur for (SettingWithCopyWarning). 
df_consolidated_agg 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,ticker,factor,price_chg_c2o_dir_0,price_chg_c2o_dir_1,price_chg_c2c_dir_0,price_chg_c2c_dir_1,price_chg_o2c_dir_0,price_chg_o2c_dir_1,volume_pchg_from_med_abv_0,volume_pchg_from_med_abv_1,tscore_o2c_mag_0,tscore_o2c_mag_1,tscore_o2c_mag_diff,price_chg_c2o_mag_0,price_chg_c2o_mag_1,price_chg_c2o_mag_diff,tscore_c2o_mag_0,tscore_c2o_mag_1,tscore_c2o_mag_diff,tscore_c2c_mag_0,tscore_c2c_mag_1,tscore_c2c_mag_diff,price_chg_c2c_mag_0,price_chg_c2c_mag_1,price_chg_c2c_mag_diff,vix_tscore_c2c_mag_0,vix_tscore_c2c_mag_1,vix_tscore_c2c_mag_diff,vix_chg_c2c_mag_0,vix_chg_c2c_mag_1,vix_chg_c2c_mag_diff,price_chg_o2c_mag_0,price_chg_o2c_mag_1,price_chg_o2c_mag_diff
0,XHB,black_friday,0.536321,0.517857,0.508243,0.500000,0.488540,0.553571,0.501318,0.392857,0.747469,0.792466,0.044997,0.006562,0.007237,0.000675,0.711182,0.783044,0.071861,0.746003,0.824535,0.078532,0.013898,0.016297,0.002399,1.190777,1.194601,0.003825,0.055074,0.059339,0.004265,0.012308,0.013863,0.001555
1,XLB,black_friday,0.531389,0.515528,0.518762,0.583851,0.494666,0.546584,0.505510,0.472050,0.737755,0.698880,-0.038875,0.005560,0.005650,0.000090,0.713369,0.697686,-0.015683,0.744061,0.780115,0.036055,0.010722,0.011354,0.000632,1.189757,1.141327,-0.048431,0.050670,0.051788,0.001118,0.008967,0.008262,-0.000705
2,XLE,black_friday,0.533879,0.472050,0.512716,0.515528,0.496622,0.540373,0.518699,0.546584,0.774411,0.754016,-0.020396,0.006515,0.008068,0.001552,0.749362,0.931600,0.182238,0.762831,0.907170,0.144339,0.012559,0.015385,0.002826,1.189757,1.141327,-0.048431,0.050670,0.051788,0.001118,0.010443,0.010723,0.000280
3,XLF,black_friday,0.513961,0.509317,0.505780,0.527950,0.490932,0.472050,0.514905,0.416149,0.735050,0.678253,-0.056796,0.006612,0.007275,0.000663,0.688446,0.710424,0.021978,0.731602,0.809077,0.077475,0.011646,0.013628,0.001982,1.189757,1.141327,-0.048431,0.050670,0.051788,0.001118,0.009675,0.009423,-0.000252
4,XLI,black_friday,0.528721,0.521739,0.532278,0.534161,0.505156,0.540373,0.534598,0.422360,0.738487,0.725368,-0.013120,0.005529,0.005869,0.000341,0.725300,0.760056,0.034757,0.737487,0.751662,0.014175,0.009330,0.009542,0.000212,1.189757,1.141327,-0.048431,0.050670,0.051788,0.001118,0.007803,0.007556,-0.000247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,XLK,news_rate_hikes,0.538868,0.567568,0.533756,0.603604,0.507931,0.558559,0.513697,0.558559,0.737438,0.715862,-0.021575,0.006442,0.004057,-0.002385,0.700081,0.715198,0.015116,0.732926,0.689035,-0.043891,0.011099,0.006863,-0.004236,1.191466,1.040517,-0.150949,0.050741,0.048669,-0.002072,0.009295,0.005860,-0.003435
606,XLP,news_rate_hikes,0.508902,0.504505,0.515248,0.576577,0.503172,0.567568,0.522829,0.531532,0.727099,0.900129,0.173030,0.003837,0.002570,-0.001267,0.688727,0.694401,0.005675,0.728507,0.818334,0.089827,0.006782,0.005907,-0.000875,1.191466,1.040517,-0.150949,0.050741,0.048669,-0.002072,0.006058,0.005639,-0.000419
607,XLU,news_rate_hikes,0.535872,0.450450,0.527234,0.531532,0.489602,0.576577,0.524799,0.549550,0.736898,0.956453,0.219555,0.004006,0.002458,-0.001549,0.684367,0.751967,0.067600,0.735975,0.973395,0.237420,0.008369,0.008559,0.000190,1.191466,1.040517,-0.150949,0.050741,0.048669,-0.002072,0.007626,0.007806,0.000180
608,XLY,news_rate_hikes,0.528644,0.648649,0.527587,0.540541,0.519210,0.567568,0.517637,0.477477,0.742013,0.722875,-0.019138,0.005494,0.003699,-0.001795,0.696108,0.700545,0.004437,0.743206,0.709000,-0.034206,0.009742,0.006355,-0.003387,1.191466,1.040517,-0.150949,0.050741,0.048669,-0.002072,0.008194,0.005479,-0.002715


# Save Processing Stage 3 (Processed Ticker Data). 

In [19]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_3.csv") 
# df_tickers.write_to_csv(filepath, index=False) 

# print(f"Write to ({filepath})") 

# Load Processing Stage 3 (Processed Ticker Data). 

In [20]:
# filepath = os.path.join(DATASET_DIR, "sector_price_history_processed_stg_3.csv") 
# df_tickers = pd.read_csv(filepath) 

# print(f"Read from ({filepath})") 

# Identify Convergence Across Different Metrics. 

## Process Metrics Based On Specific Conditionss. 

In [21]:
# Process on the copy instead of the original dataframe. 
df_identified_condition = df_consolidated_agg.copy() 

# Gather matched column names. 
cols = ["ticker", "factor"] 

# Identify convergence. If all the required conditions for 
# specific metrics are fulfilled, we will assume convergence. 
for regex_pat, condition in RE_PATS_AND_CONDITIONS.items(): 
	# Get the columns that matches the regex. 
	cols_matched = [c for c in df_identified_condition.columns if re.match(f"\\w+{regex_pat}", c)] 
	cols.extend(cols_matched) 

	for c in cols_matched: 
		if regex_pat in ["_mag_\\d", "_avg_\\d"]: 
			df_identified_condition.loc[df_identified_condition[c] <  condition, c] = 0 
			df_identified_condition.loc[df_identified_condition[c] >= condition, c] = 1 

		elif regex_pat == "_abv_\\d": 
			boo_abv_threshold = df_identified_condition[c] >= condition
			df_identified_condition.loc[:, c] = 0 
			df_identified_condition.loc[boo_abv_threshold, c] = 1 

		elif regex_pat == "_dir_\\d": 
			boo_exceed_threshold = (df_identified_condition[c] <= (1 - condition)) | (df_identified_condition[c] >= condition)
			df_identified_condition.loc[:, c] = 0 
			df_identified_condition.loc[boo_exceed_threshold, c] = 1 

# Filter columns. 
df_identified_condition = df_identified_condition[cols] 

# Preview. 
df_identified_condition

Unnamed: 0,ticker,factor,price_chg_c2o_dir_0,price_chg_c2o_dir_1,price_chg_c2c_dir_0,price_chg_c2c_dir_1,price_chg_o2c_dir_0,price_chg_o2c_dir_1,volume_pchg_from_med_abv_0,volume_pchg_from_med_abv_1,tscore_o2c_mag_0,tscore_o2c_mag_1,price_chg_c2o_mag_0,price_chg_c2o_mag_1,tscore_c2o_mag_0,tscore_c2o_mag_1,tscore_c2c_mag_0,tscore_c2c_mag_1,price_chg_c2c_mag_0,price_chg_c2c_mag_1,vix_tscore_c2c_mag_0,vix_tscore_c2c_mag_1,vix_chg_c2c_mag_0,vix_chg_c2c_mag_1,price_chg_o2c_mag_0,price_chg_o2c_mag_1
0,XHB,black_friday,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,XLB,black_friday,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,XLE,black_friday,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,XLF,black_friday,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,XLI,black_friday,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,XLK,news_rate_hikes,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
606,XLP,news_rate_hikes,0,0,0,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
607,XLU,news_rate_hikes,0,0,0,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
608,XLY,news_rate_hikes,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


## Identify Convergence. 

In [22]:
cols = ["ticker", "factor"] + METRIC_CHOICES 

# Set the conditions for identifying convergence. 
boo_conditions = (df_identified_condition["ticker"].notnull()) 
for metric in METRICS_TO_IDENTIFY_CONVERGENCE:  
	boo_conditions &= (df_identified_condition[metric] == 1.0) 

# Mark influential variables. 
df_identified_condition.loc[:, "influential"] = 0
df_identified_condition.loc[boo_conditions, "influential"] = 1

# Filter columns. 
df_identified_convergence = df_identified_condition.loc[boo_conditions, cols + ["influential"]] 

# Preview. 
df_identified_convergence

Unnamed: 0,ticker,factor,tscore_c2o_mag_1,tscore_c2c_mag_1,tscore_o2c_mag_1,vix_tscore_c2c_mag_1,price_chg_c2o_dir_1,price_chg_c2c_dir_1,price_chg_o2c_dir_1,volume_pchg_from_med_abv_1,influential
254,XLI,ism_pmi_manufacturer,0.0,1.0,1.0,1.0,0,0,0,1,1
521,XLB,fomc_presscf,0.0,1.0,1.0,1.0,0,0,0,1,1
524,XLI,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
525,XLK,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
526,XLP,fomc_presscf,0.0,1.0,1.0,1.0,0,0,0,1,1
527,XLU,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
541,XLB,opec,0.0,1.0,0.0,1.0,0,0,0,1,1
542,XLE,opec,0.0,1.0,1.0,1.0,0,0,0,1,1
543,XLF,opec,0.0,1.0,0.0,1.0,0,0,0,1,1
545,XLK,opec,0.0,1.0,1.0,1.0,0,0,0,1,1


In [23]:
# Collect identified factors. 
identified_factors = df_identified_convergence.loc[:, "factor"].unique() 

# Preview. 
identified_factors 

array(['ism_pmi_manufacturer', 'fomc_presscf', 'opec'], dtype=object)