# MADS Milestone 1 Project (Explore Visualisation). 

In [1]:
# Standard-library and third-party modules. 
import os
import re

import pandas as pd 

# When the kernel starts inside a (.../notebook) folder, move two levels up 
# before the custom imports below run. 
# NOTE(review): presumably this lands on the repository root so that the 
# (source) package resolves — confirm against the project layout. 
# The pattern uses "/" as separator, so it never matches Windows-style paths. 
if re.match(r".+/notebook", os.getcwd()): 
	os.chdir("../..") 

# Custom modules (star imports kept as-is: the exported names are not visible from here). 
from source.modules.process_tickerdata import * 
from source.modules.process_newsdata import *
from source.modules.consolidate_eventdates import * 
from source.modules.compute_aggregations import * 
from source.modules.explore_visuals import * 

# For clearing safe warnings. Not important. 
# Imported from the public (IPython.display) API; the (IPython.core.display) 
# path is a deprecated internal location for this function. 
from IPython.display import clear_output
clear_output() 

## Configurations (general). 

In [2]:
# Pandas display limits, one option per call. 
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 200)

# Wipe whatever this cell printed so far. Purely cosmetic. 
clear_output()

# Data Preparation. 

## Read Ticker Data. 

In [3]:
# Switch between loading previously saved results and reprocessing: 
# - (True)  : the data was already processed and saved by an earlier run, 
#             so it is loaded directly without reprocessing it again. 
# - (False) : first time running this, nothing has been saved yet. 
use_csv = True

# Get and process ticker data. 
# Note that, despite the (df_) prefix, this is a (ProcessTickerData) object; 
# the data itself is reached through its (.df) attribute. 
df_tickers = ProcessTickerData(use_csv=use_csv) 

# Save data if we aren't reading from the processed CSV file. 
if not use_csv: 
	df_tickers.write_to_csv() 

# Preview (the bare last expression is what the cell displays). 
df_tickers.df 

Read from (sector_price_history_processed_stg_1.csv)


Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,tscore_c2o,tscore_o2c,tscore_c2c,vix_open,vix_close,vix_chg_c2c,vix_tscore_c2c
0,1998-12-22,12.09,12.09,11.97,12.02,55887,0.0,0.0,XLF,-0.004160,-0.005790,,,,,,,,24.05,22.78,-0.045264,
1,1998-12-23,11.97,12.20,11.97,12.20,78784,0.0,0.0,XLF,0.000000,0.019215,0.014975,,,,,,,21.89,20.21,-0.112818,
2,1998-12-24,12.20,12.28,12.16,12.28,43824,0.0,0.0,XLF,-0.000814,0.006557,0.006557,,,,,,,21.00,21.48,0.062840,
3,1998-12-28,12.27,12.27,12.09,12.12,51948,0.0,0.0,XLF,0.000825,-0.012225,-0.013029,,,,,,,22.92,23.50,0.094041,
4,1998-12-29,12.13,12.25,11.99,12.25,100819,0.0,0.0,XLF,-0.005714,0.009893,0.010726,,,,,,,23.68,22.18,-0.056170,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54170,2021-12-10,56.70,56.74,55.50,56.51,19524200,0.0,0.0,XLE,-0.009025,-0.003351,0.007488,26789050.0,-7264850.0,-0.271187,-0.813114,-0.214171,0.258881,21.27,18.69,-0.133920,-2.530976
54171,2021-12-13,56.00,56.17,54.67,54.94,28604000,0.0,0.0,XLE,-0.006189,-0.018929,-0.027783,27344050.0,1259950.0,0.046078,-0.591711,-1.137226,-1.345164,19.29,20.31,0.086677,-0.009505
54172,2021-12-14,54.60,55.55,54.52,54.71,28798800,0.0,0.0,XLE,-0.001462,0.002015,-0.004186,27952200.0,846600.0,0.030287,-0.229611,0.112317,-0.267088,19.67,21.89,0.077794,-0.106837
54173,2021-12-15,54.63,54.77,53.23,54.45,33858100,0.0,0.0,XLE,0.008632,-0.003295,-0.004752,28378450.0,5479650.0,0.193092,0.552860,-0.199646,-0.288803,21.60,19.29,-0.118776,-2.361829


# Data Aggregation. 

## Compute Aggregations. 

In [4]:
# Switch between loading previously saved results and reprocessing: 
# - (True)  : the data was already processed and saved by an earlier run, 
#             so it is loaded directly without reprocessing it again. 
# - (False) : first time running this, nothing has been saved yet. 
# The same flag is passed to all three steps below. 
use_csv = True

# Process news headlines. 
df_news_headlines = ProcessNewsData(use_csv=use_csv) 

# Consolidate all the occurring dates. 
df_tickers_with_events = ConsolidateDates(use_csv=use_csv) 

# Compute aggregations; the consolidated event-date object from the previous 
# step is handed over through (ticker_event_dates). 
df_aggregated_data = AggregateMeasures(use_csv=use_csv, ticker_event_dates=df_tickers_with_events) 

# Save data if we aren't reading from the processed CSV files. 
if not use_csv: 
	df_news_headlines.write_to_csv() 
	df_tickers_with_events.write_to_csv() 
	df_aggregated_data.write_to_csv() 

# Preview the aggregated table (held in the object's (.df) attribute). 
df_aggregated_data.df 

Read from (news_headline_keywords.csv)
Read from (sector_price_history_processed_stg_1.csv)
Loading Event Dates
Read from (observance_dates_ext.csv)
Read from (santa_rally.csv)
Read from (triple_witching_week.csv)
Read from (economic_reported_date.csv)
Read from (news_headline_keywords.csv)
Read from (sector_price_history_processed_stg_2.csv)
Read from (sector_price_history_processed_stg_3.csv)


Unnamed: 0,ticker,factor,price_chg_c2c_dir_0,price_chg_c2c_dir_1,price_chg_c2o_dir_0,price_chg_c2o_dir_1,price_chg_o2c_dir_0,price_chg_o2c_dir_1,volume_pchg_from_med_abv_0,volume_pchg_from_med_abv_1,price_chg_c2o_mag_0,price_chg_c2o_mag_1,price_chg_c2o_mag_diff,price_chg_o2c_mag_0,price_chg_o2c_mag_1,price_chg_o2c_mag_diff,tscore_c2c_mag_0,tscore_c2c_mag_1,tscore_c2c_mag_diff,vix_tscore_c2c_mag_0,vix_tscore_c2c_mag_1,vix_tscore_c2c_mag_diff,price_chg_c2c_mag_0,price_chg_c2c_mag_1,price_chg_c2c_mag_diff,vix_chg_c2c_mag_0,vix_chg_c2c_mag_1,vix_chg_c2c_mag_diff,tscore_c2o_mag_0,tscore_c2o_mag_1,tscore_c2o_mag_diff,tscore_o2c_mag_0,tscore_o2c_mag_1,tscore_o2c_mag_diff
0,XHB,black_friday,0.508243,0.500000,0.536321,0.517857,0.488540,0.553571,0.501318,0.392857,0.006561,0.007237,0.000676,0.012309,0.013863,0.001555,0.746007,0.824538,0.078531,1.190777,1.194601,0.003825,0.013898,0.016297,0.002399,0.055074,0.059339,0.004265,0.711163,0.783040,0.071877,0.747474,0.792461,0.044987
1,XLB,black_friday,0.518762,0.583851,0.531389,0.515528,0.494666,0.546584,0.505510,0.472050,0.005560,0.005650,0.000090,0.008967,0.008262,-0.000705,0.744064,0.780107,0.036043,1.189757,1.141327,-0.048431,0.010722,0.011354,0.000632,0.050670,0.051788,0.001118,0.713366,0.697680,-0.015685,0.737750,0.698872,-0.038878
2,XLE,black_friday,0.512716,0.515528,0.533879,0.472050,0.496622,0.540373,0.518699,0.546584,0.006516,0.008067,0.001551,0.010443,0.010724,0.000281,0.762828,0.907229,0.144401,1.189757,1.141327,-0.048431,0.012559,0.015386,0.002827,0.050670,0.051788,0.001118,0.749367,0.931465,0.182098,0.774418,0.754115,-0.020303
3,XLF,black_friday,0.505780,0.527950,0.513961,0.509317,0.490932,0.472050,0.514905,0.416149,0.006612,0.007275,0.000663,0.009675,0.009423,-0.000252,0.731609,0.809053,0.077443,1.189757,1.141327,-0.048431,0.011646,0.013628,0.001982,0.050670,0.051788,0.001118,0.688449,0.710410,0.021962,0.735048,0.678256,-0.056792
4,XLI,black_friday,0.532278,0.534161,0.528721,0.521739,0.505156,0.540373,0.534598,0.422360,0.005528,0.005869,0.000341,0.007803,0.007556,-0.000248,0.737485,0.751660,0.014175,1.189757,1.141327,-0.048431,0.009330,0.009542,0.000212,0.050670,0.051788,0.001118,0.725296,0.760054,0.034758,0.738489,0.725366,-0.013123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,XLK,news_rate_hikes,0.533756,0.603604,0.538692,0.567568,0.507931,0.558559,0.513697,0.558559,0.006442,0.004057,-0.002385,0.009295,0.005858,-0.003437,0.732920,0.689011,-0.043909,1.191466,1.040517,-0.150949,0.011099,0.006863,-0.004236,0.050741,0.048669,-0.002072,0.700123,0.715159,0.015036,0.737435,0.715547,-0.021889
606,XLP,news_rate_hikes,0.515248,0.576577,0.508902,0.504505,0.503349,0.567568,0.522829,0.531532,0.003837,0.002570,-0.001267,0.006058,0.005639,-0.000419,0.728512,0.818412,0.089900,1.191466,1.040517,-0.150949,0.006781,0.005907,-0.000875,0.050741,0.048669,-0.002072,0.688730,0.694416,0.005686,0.727079,0.900153,0.173075
607,XLU,news_rate_hikes,0.527234,0.531532,0.535872,0.450450,0.489602,0.576577,0.524799,0.549550,0.004006,0.002458,-0.001549,0.007626,0.007806,0.000180,0.735952,0.973356,0.237405,1.191466,1.040517,-0.150949,0.008369,0.008559,0.000190,0.050741,0.048669,-0.002072,0.684369,0.751966,0.067596,0.736891,0.956441,0.219549
608,XLY,news_rate_hikes,0.527587,0.540541,0.528644,0.648649,0.519210,0.567568,0.517637,0.477477,0.005495,0.003699,-0.001795,0.008194,0.005479,-0.002715,0.743216,0.708997,-0.034219,1.191466,1.040517,-0.150949,0.009742,0.006355,-0.003387,0.050741,0.048669,-0.002072,0.696115,0.700525,0.004410,0.742013,0.722869,-0.019143


## Identify Convergence Across Different Metrics. 

In [5]:
# (identify_convergence) returns two objects, unpacked here into the 
# convergence table and the condition table. 
df_identified_convergence, df_identified_condition = df_aggregated_data.identify_convergence() 
# Preview the convergence table only. 
df_identified_convergence 

Unnamed: 0,ticker,factor,tscore_c2o_mag_1,tscore_c2c_mag_1,tscore_o2c_mag_1,vix_tscore_c2c_mag_1,price_chg_c2o_dir_1,price_chg_c2c_dir_1,price_chg_o2c_dir_1,volume_pchg_from_med_abv_1,influential
254,XLI,ism_pmi_manufacturer,0.0,1.0,1.0,1.0,0,0,0,1,1
521,XLB,fomc_presscf,0.0,1.0,1.0,1.0,0,0,0,1,1
524,XLI,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
525,XLK,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
526,XLP,fomc_presscf,0.0,1.0,1.0,1.0,0,0,0,1,1
527,XLU,fomc_presscf,0.0,1.0,0.0,1.0,0,0,0,1,1
541,XLB,opec,0.0,1.0,0.0,1.0,0,0,0,1,1
542,XLE,opec,0.0,1.0,1.0,1.0,0,0,0,1,1
543,XLF,opec,0.0,1.0,0.0,1.0,0,0,0,1,1
545,XLK,opec,0.0,1.0,1.0,1.0,0,0,0,1,1


In [6]:
# Collect identified factors. 
identified_factors = df_identified_convergence.loc[:, "factor"].unique() 

# Preview. 
identified_factors

array(['ism_pmi_manufacturer', 'fomc_presscf', 'opec'], dtype=object)

# Visualise Aggregates. 

## Identified Factors. 

In [7]:
# Heatmap of the (influential) column, ticker against factor, for the identified factors. 
(
	plot_heatmap(
		df_identified_condition,
		x="ticker",
		y="factor",
		z="influential",
		factors=identified_factors,
		zlim=[0,1.5],
		format_text=".0f",
	)
	.properties(title="identified influential variables")
)

## Analyse The Magnitude Of T-Scores For Price & VIX. 

In [8]:
# Heatmap of (tscore_c2c_mag_1), ticker against factor, for the identified factors. 
plot_heatmap(
	df_aggregated_data.df,
	x="ticker",
	y="factor",
	z="tscore_c2c_mag_1",
	factors=identified_factors,
	zlim=[0.9,1.2],
	format_text=".1f",
)

In [9]:
# Heatmap of (vix_tscore_c2c_mag_1), ticker against factor, for the identified factors. 
plot_heatmap(
	df_aggregated_data.df,
	x="ticker",
	y="factor",
	z="vix_tscore_c2c_mag_1",
	factors=identified_factors,
	zlim=[0.9,1.4],
	format_text=".1f",
)

## Analyse The Probability Of Volume Rising Above Median. 

In [10]:
# Heatmap of (volume_pchg_from_med_abv_1), ticker against factor, for the identified factors. 
(
	plot_heatmap(
		df_aggregated_data.df,
		x="ticker",
		y="factor",
		z="volume_pchg_from_med_abv_1",
		factors=identified_factors,
		zlim=[0.7,0.8],
		format_text=".2f",
	)
	.properties(title="probability of volume exceeding rollmed")
)

## Analyse The Effect Of FOMC Over Time. 

In [11]:
# Tickers to plot for the FOMC press conference factor. 
tickers = ["XLB", "XLK", "XLU", "XLI", "XLP"]

# Time series of (tscore_c2c) by date for those tickers. 
(
	plot_timeseries(
		df_tickers_with_events.df,
		x="date",
		tickers=tickers,
		factor="fomc_presscf",
		measure="tscore_c2c",
		format_text=".1f",
	)
	.properties(title="FOMC Press Conference")
)

## Analyse The Effect Of ISM PMI Manufacturer Over Time. 

In [12]:
# Single ticker to plot for the ISM PMI manufacturing factor. 
tickers = ["XLI"]

# Time series of (tscore_c2c) by date for that ticker. 
(
	plot_timeseries(
		df_tickers_with_events.df,
		x="date",
		tickers=tickers,
		factor="ism_pmi_manufacturer",
		measure="tscore_c2c",
		format_text=".1f",
	)
	.properties(title="ISM PMI Manufacturing")
)

## Analyse The Effect Of OPEC Over Time. 

In [13]:
# Tickers to plot for the OPEC factor. 
tickers = ["XLE", "XLF", "XLK", "XLB", "XLP"]

# Time series of (tscore_c2c) by date for those tickers. 
(
	plot_timeseries(
		df_tickers_with_events.df,
		x="date",
		tickers=tickers,
		factor="opec",
		measure="tscore_c2c",
		format_text=".1f",
	)
	.properties(title="OPEC")
)

## Analyse The Distribution During The FOMC Event. 

In [14]:
# Tickers to plot for the FOMC press conference factor. 
tickers = ["XLB", "XLK", "XLU", "XLI", "XLP"]

# Box plot of (tscore_c2c) for those tickers. 
plot_boxplot(
	df_tickers_with_events.df,
	tickers=tickers,
	factor="fomc_presscf",
	measure="tscore_c2c",
	format_text=".1f",
)

## Analyse The Distribution During The OPEC Event. 

In [15]:
# Tickers to plot for the OPEC factor. 
tickers = ["XLB", "XLK", "XLF", "XLP", "XLE"]

# Box plot of (tscore_c2c) for those tickers. 
# The factor must be ("opec") to match this section and its ticker list; 
# passing ("fomc_presscf") here would plot the FOMC distribution instead. 
plot_boxplot(df_tickers_with_events.df, tickers=tickers, factor="opec", measure="tscore_c2c", format_text=".1f") 

## Analyse The Distribution During The ISM PMI Manufacturing Reporting Day. 

In [16]:
# Single ticker to plot for the ISM PMI manufacturing factor. 
tickers = ["XLI"]

# Box plot of (tscore_c2c) for that ticker. 
plot_boxplot(
	df_tickers_with_events.df,
	tickers=tickers,
	factor="ism_pmi_manufacturer",
	measure="tscore_c2c",
	format_text=".1f",
)