# MADS SIADS 591 Milestone 1 Comprehension Exam. 

In [1]:
# Python module. 
import os, scipy 
import datetime as dt
import numpy as np 
import pandas as pd 
import pandas_datareader as pdr
import exchange_calendars as xcals
import yfinance as yf 

# Change the current directory from (./notebook) to root directory. 
os.chdir("..") 

# Custom module. 
from config.config import DATASET_DIR, TICKER_DATE_COLLECT 

# Create the directory if it doesn't exist. 
if not os.path.exists(DATASET_DIR):
    os.makedirs(DATASET_DIR) 

# For clearing safe warnings. Not important. 
from IPython.core.display import clear_output
clear_output() 

## Configurations (general). 

In [2]:
# Pandas DF config. 
pd.set_option("display.max_rows", 50, "display.max_columns", 50, "display.max_colwidth", 200)

# For clearing the Bokeh and Holoview logo. Not important. 
clear_output()

# Data Collecting. 

## Collect & Consolidate Tickers. 

In [3]:
ticker_names = [
	"XLF", "XHB", "XLK", "XLY", "XLP", 
	"XRT", "XLI", "XLB", "XTL", "XLU", 
] 

# Initiate an empty dataframe to consolidate all the ticker data. 
df_tickers = pd.DataFrame() 

# Consolidate all tickers into a single dataframe. 
for t in ticker_names: 
	# Read the data. 
	ticker = yf.Ticker(t) 
	ticker = ticker.history(period="max", interval="1d", start=TICKER_DATE_COLLECT[0], end=TICKER_DATE_COLLECT[1], auto_adjust=True, rounding=True) 

	# Add a column to indicate the ticker name. 
	ticker["ticker"] = t

	# Concat the data into a long table format in (pd.DataFrame) object. 
	df_tickers = pd.concat([df_tickers, ticker]) 

# Reset the index. 
df_tickers = df_tickers.reset_index(drop=False) 

# Convert column to lowercase. 
df_tickers.columns = [c.lower() for c in df_tickers.columns] 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF
...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU


## Collect & Consolidate VIX Data. 

In [4]:

# Collect VIX data to merge with the cosolidated data. 
vix = yf.Ticker("^VIX") 
vix = vix.history(period="max", interval="1d", start=TICKER_DATE_COLLECT[0], end=TICKER_DATE_COLLECT[1], auto_adjust=True, rounding=True) 

# Process columns. 
vix = vix[["Open", "Close"]] 
vix.columns = [f"vix_{c}".lower() for c in vix.columns] 

# Minus the open, high, low, close with the threshold. If the value 
# exceeds the threshold, that means traders assume the future price 
# movement could be more volatile. Vice versa. VIX is also known as 
# a fear indicator. 

# Default the VIX threshold to 18, but need to make it configurable. 
vix_threshold = 18
for c in vix.columns: 
	vix[f"{c}_minus_thresh"] = vix[c] - vix_threshold 

## Merge With Ticker Data. 

In [5]:
# Merge VIX with the ticker data. 
df_tickers = df_tickers.merge(right=vix, how="left", left_on="date", right_on="Date", validate="many_to_one") 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29


## Save Consolidated Ticker Data. 

In [6]:
# Save the dataframe into CSV. 
filepath = os.path.join(DATASET_DIR, "sector_price_history.csv") 
df_tickers.to_csv(filepath, index=False) 

# Data Preparation. 

## Read The Ticker Data. 

In [7]:
# Read the data. 
filepath = os.path.join(DATASET_DIR, "sector_price_history.csv") 
df_tickers = pd.read_csv(filepath) 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29


## Read The Economic Report Data. 

In [8]:
# Read the data. 
filepath = os.path.join(DATASET_DIR, "economic_reported_date.csv") 
df_econ_reportdate = pd.read_csv(filepath) 

# Preview. Some columns are weekly economic reports which is 
# why they have a lot more dates than the rest. 
df_econ_reportdate 

Unnamed: 0,non_farm_employment_adp_mom,non_farm_employment_mom,unemployment_claims,unemployment_rate,avg_hourly_earnings_mom,personal_dispensable_income_mom,personal_consumption_mom,ism_pmi_manufacturer,ism_pmi_services,chicago_pmi,industry_production_mom,phil_fed_manufacturer,capacity_utilisation,manufacturer_new_order_mom,manufacturer_new_order_ex_trans_mom,retail_sales_ex_auto_mom,retail_sales_mom,producer_ppi_mom,producer_ppi_ex_food_energy_mom,consumer_cpi_mom,consumer_cpi_ex_food_energy_mom,pce_ex_food_energy_mom,housing_hpi_mom,building_permit,housing_starts,new_home_sales,gdp_us,gdp_advance_us,crude_oil_inventory,natural_gas_inventory,fomc_presscf,fomc_minutes,opec,opec_jmmc
0,2007-01-03,2007-01-05,2007-01-04,2007-01-05,2007-01-05,2007-02-01,2007-02-01,2007-01-03,2007-01-04,2007-01-31,2007-01-17,2007-01-19,2007-01-17,2007-01-26,2007-01-26,2007-01-12,2007-01-12,2007-01-17,2007-01-17,2007-01-18,2007-01-18,2007-02-01,2008-03-25,2007-01-18,2007-01-18,2007-01-26,2007-03-29,2007-01-31,2007-01-04,2007-01-05,2011-04-28,2007-01-04,2007-03-15,2017-01-22
1,2007-01-31,2007-02-02,2007-01-11,2007-02-02,2007-02-02,2007-03-01,2007-03-01,2007-02-01,2007-02-05,2007-02-28,2007-02-15,2007-02-16,2007-02-15,2007-02-27,2007-02-27,2007-02-14,2007-02-14,2007-02-16,2007-02-16,2007-02-21,2007-02-21,2007-03-01,2008-04-22,2007-02-16,2007-02-16,2007-02-28,2007-06-28,2007-04-27,2007-01-10,2007-01-11,2011-06-23,2007-02-22,2007-09-11,2017-02-24
2,2007-03-07,2007-03-09,2007-01-18,2007-03-09,2007-03-09,2007-03-30,2007-03-30,2007-03-01,2007-03-05,2007-03-30,2007-03-16,2007-03-16,2007-03-16,2007-03-28,2007-03-28,2007-03-13,2007-03-13,2007-03-15,2007-03-15,2007-03-16,2007-03-16,2007-03-30,2008-05-22,2007-03-20,2007-03-20,2007-03-26,2007-09-27,2007-07-27,2007-01-18,2007-01-18,2011-11-03,2007-04-12,2007-12-05,2017-03-26
3,2007-04-04,2007-04-06,2007-01-25,2007-04-06,2007-04-06,2007-04-30,2007-04-30,2007-04-02,2007-04-04,2007-04-30,2007-04-17,2007-04-20,2007-04-17,2007-04-25,2007-04-25,2007-04-16,2007-04-16,2007-04-13,2007-04-13,2007-04-17,2007-04-17,2007-04-30,2008-06-24,2007-04-17,2007-04-17,2007-04-25,2007-12-20,2007-10-31,2007-01-24,2007-01-25,2012-01-26,2007-05-31,2008-02-01,2017-04-28
4,2007-05-02,2007-05-04,2007-02-01,2007-05-04,2007-05-04,2007-06-01,2007-06-01,2007-05-01,2007-05-03,2007-05-31,2007-05-16,2007-05-18,2007-05-16,2007-05-24,2007-05-24,2007-05-11,2007-05-11,2007-05-11,2007-05-11,2007-05-15,2007-05-15,2007-06-01,2008-07-22,2007-05-16,2007-05-16,2007-05-24,2008-03-27,2008-01-30,2007-01-31,2007-02-01,2012-04-26,2007-07-20,2008-03-05,2017-05-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,,,2021-11-18,,,,,,,,,,,,,,,,,,,,,,,,,,2021-11-17,2021-11-18,,,,
777,,,2021-11-24,,,,,,,,,,,,,,,,,,,,,,,,,,2021-11-24,2021-11-25,,,,
778,,,2021-12-02,,,,,,,,,,,,,,,,,,,,,,,,,,2021-12-01,2021-12-02,,,,
779,,,2021-12-09,,,,,,,,,,,,,,,,,,,,,,,,,,2021-12-08,2021-12-09,,,,


## Read The Event Dates. 

In [9]:
# I have identified the dates for each event using Julia (programming language) 
# previously and saved them in CSV. I own the Julia source code (belongs to me). 
# It is extremely cumbersome to calculate the date pattern in Python. 

# Refer to "Mark The Dates For Each Event ((With Python)" section in this notebook. 
# Inaccurate dates will occur starting from 2030. Plus, difficult to provide customised 
# date pattern. I haven't discover another library that can do that conveniently. 

filenames = ["event_dates_ext.csv", "firsttrdrday_ofmonth.csv", "santa_rally.csv", "triple_witching_week.csv"] 
df_event_dates = pd.DataFrame() 

# Read the data and consolidate all the event dates. 
for filename in filenames: 
	# Read the data.
	filepath = os.path.join(DATASET_DIR, filename) 
	event_dates = pd.read_csv(filepath) 

	# Sort the dates. Just to ensure it's in order. 
	event_dates = event_dates.sort_values(by=event_dates.columns.to_list()) 

	# Concat the dataframe. 
	df_event_dates = pd.concat([df_event_dates, event_dates], axis="columns") 

# Preview. 
df_event_dates 

Unnamed: 0,black_friday,christmas,columbus,cyber_monday,good_friday,labor,martin_lut_king,new_year,thanksgiving,us_event_sep11,us_independence,us_memorial,us_president,us_veterans,valentine,firsttrdrday_ofmonth,santa_rally,tww_trdrday
0,1998-11-27,1998-12-25,1998-10-12,1998-11-30,1998-04-10,1998-09-07,1998-01-19,1998-01-01,1998-11-26,1998-09-11,1998-07-04,1998-05-25,1998-02-16,1998-11-11,1998-02-14,1998-01-02,1997-12-31,1998-03-16
1,1998-11-25,1998-12-24,1998-10-09,1998-11-27,1998-04-09,1998-09-04,1998-01-16,1997-12-31,1998-11-25,1998-09-10,1998-07-02,1998-05-22,1998-02-13,1998-11-10,1998-02-13,1998-02-02,1997-12-30,1998-03-17
2,1998-11-30,1998-12-28,1998-10-13,1998-12-01,1998-04-13,1998-09-08,1998-01-20,1998-01-02,1998-11-27,1998-09-14,1998-07-06,1998-05-26,1998-02-17,1998-11-12,1998-02-17,1998-03-02,1997-12-29,1998-03-18
3,1998-11-24,1998-12-23,1998-10-08,1998-11-25,1998-04-08,1998-09-03,1998-01-15,1997-12-30,1998-11-24,1998-09-09,1998-07-01,1998-05-21,1998-02-12,1998-11-09,1998-02-12,1998-04-01,1997-12-26,1998-03-19
4,1998-12-01,1998-12-29,1998-10-14,1998-12-02,1998-04-14,1998-09-09,1998-01-21,1998-01-05,1998-11-30,1998-09-15,1998-07-07,1998-05-27,1998-02-18,1998-11-13,1998-02-18,1998-05-01,1997-12-24,1998-03-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1278,,,,,,,,,,,,,,,,,,2030-12-20
1279,,,,,,,,,,,,,,,,,,2030-12-23
1280,,,,,,,,,,,,,,,,,,2030-12-24
1281,,,,,,,,,,,,,,,,,,2030-12-26


In [10]:
# Read the data. 
filepath = os.path.join(DATASET_DIR, "economic_reported_data.csv") 
df_econ_report = pd.read_csv(filepath) 

# Preview. 
df_econ_report 

Unnamed: 0,econ_forecast,frequency,timestamp,reported,forecast,reported_prev,abv_forecast,forecast_dif,abv_zero
0,non_farm_employment_adp_mom,monthly,2007-01-03,-4.000000e+04,1.200000e+05,2.300000e+05,0,-1.600000e+05,0
1,non_farm_employment_adp_mom,monthly,2007-01-31,1.520000e+05,1.350000e+05,1.470000e+05,1,1.700000e+04,1
2,non_farm_employment_adp_mom,monthly,2007-03-07,5.700000e+04,1.000000e+05,1.210000e+05,0,-4.300000e+04,1
3,non_farm_employment_adp_mom,monthly,2007-04-04,1.060000e+05,1.250000e+05,6.500000e+04,0,-1.900000e+04,1
4,non_farm_employment_adp_mom,monthly,2007-05-02,6.400000e+04,1.070000e+05,9.800000e+04,0,-4.300000e+04,1
...,...,...,...,...,...,...,...,...,...
6932,natural_gas_inventory,weekly,2021-11-18,2.600000e+10,2.500000e+10,7.000000e+09,1,1.000000e+09,1
6933,natural_gas_inventory,weekly,2021-11-25,-2.100000e+10,-2.300000e+10,2.600000e+10,1,2.000000e+09,0
6934,natural_gas_inventory,weekly,2021-12-02,-5.900000e+10,-5.900000e+10,-2.100000e+10,1,0.000000e+00,0
6935,natural_gas_inventory,weekly,2021-12-09,-5.900000e+10,-6.000000e+10,-5.900000e+10,1,1.000000e+09,0


# Data Processing. 

## Compute Price Change. 

In [11]:
# We will be computing 3 types of price difference. 2nd option is not a must but just 
# want to compare the difference between open and closing market. 

# 1. Gapping / Close market price change. Difference between previous day 
# 	 closing price and current day open price. 
# 2. Open market price change. Difference between current day open and closing price. 
# 3. Daily price change. Difference between previous day and current day closing price. 

# 1. price_chg_close_to_open
prev_close = df_tickers["close"].shift(1)
df_tickers["price_chg_c2o"] = (df_tickers["open"] - prev_close) / prev_close 

# 2. price_chg_open_to_close
df_tickers["price_chg_o2c"] = (df_tickers["close"] - df_tickers["open"]) / df_tickers["open"] 

# 3. price_chg_close_to_close
df_tickers["price_chg_c2c"] = df_tickers["close"].pct_change(1) 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh,price_chg_c2o,price_chg_o2c,price_chg_c2c
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78,,-0.005761,
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21,-0.004139,0.019119,0.014901
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48,0.000000,0.006525,0.006525
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50,-0.000810,-0.012165,-0.012966
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18,0.000821,0.009844,0.010673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69,0.004071,0.001303,0.005379
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31,-0.001012,0.013318,0.012292
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89,-0.000143,-0.005858,-0.006000
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29,-0.001150,0.017554,0.016384


## Compute Rolling Median Volume & Volume Difference To Rolling Median. 

In [12]:
# Compute the 3 months rolling median volume. 
cols = ["volume"] 
df_tickers["volume_rollmed"] = df_tickers[cols].rolling(window=90, min_periods=90, win_type=None).median() 

# Compute the difference between each volume with the 3 months rolling median volume. 
df_tickers["volume_diff_to_med"] = df_tickers["volume"] - df_tickers["volume_rollmed"] 

# Compute the percent change from the 3 months rolling median volume. Comparing 
# percent change between each period is easier than looking at the difference. 
df_tickers["volume_pchg_from_med"] = df_tickers["volume_diff_to_med"] / df_tickers["volume_rollmed"] 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78,,-0.005761,,,,
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21,-0.004139,0.019119,0.014901,,,
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48,0.000000,0.006525,0.006525,,,
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50,-0.000810,-0.012165,-0.012966,,,
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18,0.000821,0.009844,0.010673,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69,0.004071,0.001303,0.005379,11464750.0,237050.0,0.020676
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31,-0.001012,0.013318,0.012292,11464750.0,2285250.0,0.199328
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89,-0.000143,-0.005858,-0.006000,11547800.0,5954700.0,0.515657
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29,-0.001150,0.017554,0.016384,11639400.0,11042900.0,0.948752


## Compute Bollinger Band & z-score. 

In [13]:
# Refer to this link to understand what Bollinger Band is and its formula. 
# https://www.investopedia.com/terms/b/bollingerbands.asp

# We will use 360 days for the rolling window. If the window is too short, 
# the average price could fluctuate higher or lower. We can't use median 
# because we need to calculate t-score. 

# Compute the rolling average and standard deviation. 
tp =  (df_tickers["close"] + df_tickers["low"] + df_tickers["high"]) / 3 
tp_rollavg = tp.rolling(window=360, min_periods=90, win_type=None).mean() 
tp_rollstd = tp.rolling(window=360, min_periods=90, win_type=None).std(ddof=0) 

# Compute Bollinger Band. 
n_std = 2 
df_tickers["bo_upper"] = tp_rollavg + n_std * tp_rollstd 
df_tickers["bo_lower"] = tp_rollavg - n_std * tp_rollstd 

# Compute the z-score using closing price and Bollinger Band. 
df_tickers["zscore_bo"] = (df_tickers["close"] - tp_rollavg) / tp_rollstd 

# Compute the z-score for price change. 
price_chg_c2o_rollavg = df_tickers["price_chg_c2o"].rolling(window=360, min_periods=90, win_type=None).mean() 
price_chg_o2c_rollavg = df_tickers["price_chg_o2c"].rolling(window=360, min_periods=90, win_type=None).mean() 
price_chg_c2c_rollavg = df_tickers["price_chg_c2c"].rolling(window=360, min_periods=90, win_type=None).mean() 

price_chg_c2o_rollstd = df_tickers["price_chg_c2o"].rolling(window=360, min_periods=90, win_type=None).std(ddof=0) 
price_chg_o2c_rollstd = df_tickers["price_chg_o2c"].rolling(window=360, min_periods=90, win_type=None).std(ddof=0) 
price_chg_c2c_rollstd = df_tickers["price_chg_c2c"].rolling(window=360, min_periods=90, win_type=None).std(ddof=0) 

df_tickers["zscore_c2o"] = (df_tickers["price_chg_c2o"] - price_chg_c2o_rollavg) / price_chg_c2o_rollstd 
df_tickers["zscore_o2c"] = (df_tickers["price_chg_o2c"] - price_chg_o2c_rollavg) / price_chg_o2c_rollstd 
df_tickers["zscore_c2c"] = (df_tickers["price_chg_c2c"] - price_chg_c2c_rollavg) / price_chg_c2c_rollstd 

# Preview. 
df_tickers 

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,bo_upper,bo_lower,zscore_bo,zscore_c2o,zscore_o2c,zscore_c2c
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78,,-0.005761,,,,,,,,,,
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21,-0.004139,0.019119,0.014901,,,,,,,,,
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48,0.000000,0.006525,0.006525,,,,,,,,,
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50,-0.000810,-0.012165,-0.012966,,,,,,,,,
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18,0.000821,0.009844,0.010673,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69,0.004071,0.001303,0.005379,11464750.0,237050.0,0.020676,69.876471,55.718769,1.794749,0.896736,0.132350,0.468445
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31,-0.001012,0.013318,0.012292,11464750.0,2285250.0,0.199328,69.910044,55.766215,2.025440,-0.417890,1.465345,1.167248
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89,-0.000143,-0.005858,-0.006000,11547800.0,5954700.0,0.515657,69.944639,55.809268,1.896815,-0.192838,-0.656261,-0.671193
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29,-0.001150,0.017554,0.016384,11639400.0,11042900.0,0.948752,69.990639,55.842824,2.206212,-0.452267,1.925723,1.574283


## Mark The Dates For Each Event. 

In [14]:
# Assign 1 if economic report occurred on that date, else 0. 
for event_name in df_event_dates.columns: 
	# Default to 0. 
	df_tickers[event_name] = 0 

	# Filter non economic report dates and assign 1. 
	boo_dates = df_tickers["date"].isin(df_event_dates[event_name].values) 
	df_tickers.loc[boo_dates, event_name] = 1 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,bo_upper,bo_lower,zscore_bo,zscore_c2o,zscore_o2c,zscore_c2c,black_friday,christmas,columbus,cyber_monday,good_friday,labor,martin_lut_king,new_year,thanksgiving,us_event_sep11,us_independence,us_memorial,us_president,us_veterans,valentine,firsttrdrday_ofmonth,santa_rally,tww_trdrday
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78,,-0.005761,,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21,-0.004139,0.019119,0.014901,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48,0.000000,0.006525,0.006525,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50,-0.000810,-0.012165,-0.012966,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18,0.000821,0.009844,0.010673,,,,,,,,,,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69,0.004071,0.001303,0.005379,11464750.0,237050.0,0.020676,69.876471,55.718769,1.794749,0.896736,0.132350,0.468445,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31,-0.001012,0.013318,0.012292,11464750.0,2285250.0,0.199328,69.910044,55.766215,2.025440,-0.417890,1.465345,1.167248,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89,-0.000143,-0.005858,-0.006000,11547800.0,5954700.0,0.515657,69.944639,55.809268,1.896815,-0.192838,-0.656261,-0.671193,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29,-0.001150,0.017554,0.016384,11639400.0,11042900.0,0.948752,69.990639,55.842824,2.206212,-0.452267,1.925723,1.574283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Mark The Dates For Economic Report. 

In [15]:
# Assign 1 if economic report occurred on that date, else 0. 
for econ_name in df_econ_reportdate.columns: 
	# Default to 0. 
	df_tickers[econ_name] = 0 

	# Filter non economic report dates and assign 1. 
	boo_dates = df_tickers["date"].isin(df_econ_reportdate[econ_name].values) 
	df_tickers.loc[boo_dates, econ_name] = 1 

# Preview. 
df_tickers

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,vix_open,vix_close,vix_open_minus_thresh,vix_close_minus_thresh,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,bo_upper,bo_lower,zscore_bo,zscore_c2o,zscore_o2c,zscore_c2c,...,chicago_pmi,industry_production_mom,phil_fed_manufacturer,capacity_utilisation,manufacturer_new_order_mom,manufacturer_new_order_ex_trans_mom,retail_sales_ex_auto_mom,retail_sales_mom,producer_ppi_mom,producer_ppi_ex_food_energy_mom,consumer_cpi_mom,consumer_cpi_ex_food_energy_mom,pce_ex_food_energy_mom,housing_hpi_mom,building_permit,housing_starts,new_home_sales,gdp_us,gdp_advance_us,crude_oil_inventory,natural_gas_inventory,fomc_presscf,fomc_minutes,opec,opec_jmmc
0,1998-12-22,12.15,12.15,12.02,12.08,55887,0.0,0.0,XLF,24.05,22.78,6.05,4.78,,-0.005761,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1998-12-23,12.03,12.26,12.03,12.26,78784,0.0,0.0,XLF,21.89,20.21,3.89,2.21,-0.004139,0.019119,0.014901,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1998-12-24,12.26,12.34,12.22,12.34,43824,0.0,0.0,XLF,21.00,21.48,3.00,3.48,0.000000,0.006525,0.006525,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1998-12-28,12.33,12.33,12.14,12.18,51948,0.0,0.0,XLF,22.92,23.50,4.92,5.50,-0.000810,-0.012165,-0.012966,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1998-12-29,12.19,12.31,12.05,12.31,100819,0.0,0.0,XLF,23.68,22.18,5.68,4.18,0.000821,0.009844,0.010673,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51127,2021-12-10,69.06,69.48,68.79,69.15,11701800,0.0,0.0,XLU,21.27,18.69,3.27,0.69,0.004071,0.001303,0.005379,11464750.0,237050.0,0.020676,69.876471,55.718769,1.794749,0.896736,0.132350,0.468445,...,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
51128,2021-12-13,69.08,70.34,69.08,70.00,13750000,0.0,0.0,XLU,19.29,20.31,1.29,2.31,-0.001012,0.013318,0.012292,11464750.0,2285250.0,0.199328,69.910044,55.766215,2.025440,-0.417890,1.465345,1.167248,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51129,2021-12-14,69.99,70.13,69.25,69.58,17502500,0.0,0.0,XLU,19.67,21.89,1.67,3.89,-0.000143,-0.005858,-0.006000,11547800.0,5954700.0,0.515657,69.944639,55.809268,1.896815,-0.192838,-0.656261,-0.671193,...,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51130,2021-12-15,69.50,70.78,69.50,70.72,22682300,0.0,0.0,XLU,21.60,19.29,3.60,1.29,-0.001150,0.017554,0.016384,11639400.0,11042900.0,0.948752,69.990639,55.842824,2.206212,-0.452267,1.925723,1.574283,...,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## Save Processed Data. 

In [16]:
# Save the dataframe into CSV. 
filepath = os.path.join(DATASET_DIR, "sector_price_history_processed.csv") 
df_tickers.to_csv(filepath, index=False) 

# Processing News Data. 

## Extract Keywords. 

## Convert To Wide Table. 

## Save Processed Data. 

# Data Summaries. 

# Not Important. Just For Reference / Exploration. 

## Mark The Dates For Each Event (With Python).

In [17]:
from pandas.tseries.holiday import (
    AbstractHolidayCalendar, Holiday, FR, SA, nearest_workday, 
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, 
    USMemorialDay, USLaborDay, USColumbusDay, USThanksgivingDay
)


class TradingEvents(AbstractHolidayCalendar):
    rules = [
        Holiday("new_year", month=1, day=1, observance=None),
        USMartinLutherKingJr,
        Holiday("valentine", month=2, day=14, observance=None),
        USPresidentsDay,
        GoodFriday,
        USMemorialDay,
        Holiday("us_independence", month=7, day=4, observance=None),
        USLaborDay,
        USColumbusDay,
        Holiday("us_veterans", month=11, day=11, observance=None),
        USThanksgivingDay,
        Holiday("black_friday", month=11, day=1, offset=pd.DateOffset(weekday=FR(4))), 
        Holiday("cyber_monday", month=11, day=1, offset=[pd.DateOffset(weekday=SA(4)), pd.DateOffset(2)]), 
        Holiday("christmas", month=12, day=25, observance=None), 
    ]


def get_trading_event_dates(year:int):
    '''
    Purpose: 
        Identify dates for all events for a specific year. 
    
    Input  :
        year    : Int. The specific year to identify date of the event. 
        
    Return :
        List of dates for all the events for a specific year. 
    '''
    
    inst = TradingEvents() 
    return inst.holidays(dt.datetime(year, 1, 1), dt.datetime(year, 12, 31)) 


def consolidate_trading_event_dates(yr_start:int, yr_end:int): 
    '''
    Purpose: 
        Consolidate dates for all the events within the specified years. 
    
    Input  :
        yr_start    : Int. Starting year.
        yr_end      : Int. Ending year. 
        
    Return :
        Dateframe containing all the event dates. 
    '''

    event_names = [
        "new_year", "martin_lut_king", "valentine", "us_president", 
        "good_friday", "us_memorial", "us_independence", "labor", 
        "columbus", "us_veterans", "thanksgiving", "black_friday", 
        "cyber_monday", "christmas", 
    ]
    
    # Compile the dates into a list of list.
    generated_dates = [get_trading_event_dates(year) for year in range(yr_start, yr_end + 1, 1)] 
    return pd.DataFrame(generated_dates, columns=event_names) 


# Preview. 
consolidate_trading_event_dates(1998, 2021) 

Unnamed: 0,new_year,martin_lut_king,valentine,us_president,good_friday,us_memorial,us_independence,labor,columbus,us_veterans,thanksgiving,black_friday,cyber_monday,christmas
0,1998-01-01,1998-01-19,1998-02-14,1998-02-16,1998-04-10,1998-05-25,1998-07-04,1998-09-07,1998-10-12,1998-11-11,1998-11-26,1998-11-27,1998-11-30,1998-12-25
1,1999-01-01,1999-01-18,1999-02-14,1999-02-15,1999-04-02,1999-05-31,1999-07-04,1999-09-06,1999-10-11,1999-11-11,1999-11-25,1999-11-26,1999-11-29,1999-12-25
2,2000-01-01,2000-01-17,2000-02-14,2000-02-21,2000-04-21,2000-05-29,2000-07-04,2000-09-04,2000-10-09,2000-11-11,2000-11-23,2000-11-24,2000-11-27,2000-12-25
3,2001-01-01,2001-01-15,2001-02-14,2001-02-19,2001-04-13,2001-05-28,2001-07-04,2001-09-03,2001-10-08,2001-11-11,2001-11-22,2001-11-23,2001-11-26,2001-12-25
4,2002-01-01,2002-01-21,2002-02-14,2002-02-18,2002-03-29,2002-05-27,2002-07-04,2002-09-02,2002-10-14,2002-11-11,2002-11-22,2002-11-25,2002-11-28,2002-12-25
5,2003-01-01,2003-01-20,2003-02-14,2003-02-17,2003-04-18,2003-05-26,2003-07-04,2003-09-01,2003-10-13,2003-11-11,2003-11-24,2003-11-27,2003-11-28,2003-12-25
6,2004-01-01,2004-01-19,2004-02-14,2004-02-16,2004-04-09,2004-05-31,2004-07-04,2004-09-06,2004-10-11,2004-11-11,2004-11-25,2004-11-26,2004-11-29,2004-12-25
7,2005-01-01,2005-01-17,2005-02-14,2005-02-21,2005-03-25,2005-05-30,2005-07-04,2005-09-05,2005-10-10,2005-11-11,2005-11-24,2005-11-25,2005-11-28,2005-12-25
8,2006-01-01,2006-01-16,2006-02-14,2006-02-20,2006-04-14,2006-05-29,2006-07-04,2006-09-04,2006-10-09,2006-11-11,2006-11-23,2006-11-24,2006-11-27,2006-12-25
9,2007-01-01,2007-01-15,2007-02-14,2007-02-19,2007-04-06,2007-05-28,2007-07-04,2007-09-03,2007-10-08,2007-11-11,2007-11-22,2007-11-23,2007-11-26,2007-12-25


# Rule's Rules Computational Narratives. 

- __Rule 2: Document the process, not just the results:__ The processes for the initial investigation, data processing, and analysis are documented in addition to presenting the results. Comments are also included to explain the specific steps or breakdown.  

- __Rule 3: Use cell divisions to make steps clear:__ The processes for the initial investigation, data processing, and analysis are broken into separate cells to ensure that each process is digestible, understandable, and organised. All the configurations and functions related to each process are organised into a single cell to make it easier to locate them. Comments are given for each configuration and function to articulate the purpose and steps. 

- __Rule 4: Modularize code:__ Functions are used to adhere to the DRY (Don't Repeat Yourself) principle. Each function serves only a specific purpose so that each functionality is clear when readers reads or applies the function name. They can be used together with other function or reconfigured with different parameters since they are generalised. 

- __Rule 5: Record dependencies:__ The dependencies are recorded inside the `Pipfile` and `Pipfile.lock`. Readers or researchers can easily install all the libraries in their own virtual environment using `pipenv` or `venv`. Virtual environment and package manager is important to avoid polluting your own base libraries or dependencies conflict. I have also included a `Dockerfile` as an alternative option for readers or researchers to kickstart the project easily. However, they need publicly accessible IP using tool such as `ngrok` to access the Jupyterlab on the browser outside of the container. Docker is safer in case some libraries might be affected by the differences in OS. 

- __Rule 8: Share and explain your data:__ The data (unprocessed version) is shared with the lecturers or graders so that they can reproduce the results. The explanation for the data is provided briefly to help readers familiarise with the nature of the data, limitations, assumptions, definitions, unit of measurement, and others. And also to help them accurately evaluate the reasons behind the data processing and analysis better. 

- __Rule 9: Design your notebooks to be read, run, and explored:__ Readers or researchers can explore the notebook with lesser difficulty since the processes are organised into separate cells and modularised into separate functions. They can reconfigure the settings, tweak the functions, and print the results in-between the cells to evaluate the processess, learn about the analysis, or explore a different direction. Brief installation steps and dependency requirements are provided to help them start and replicate the project easily. 