# MADS SIADS 591 Milestone 1 Project (Preprocessing Steps). 

In [1]:
# Python module. 
import os, re
import pandas as pd 
import altair as alt 

# Step up to the project root when the notebook is started from inside the
# `notebook` directory, so the project-local `modules.*` / `config.*` imports
# below resolve. Only the last path component is inspected: a `notebook`
# directory further up the path must not trigger the move, and os.path copes
# with the platform's path separator.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("..")

# Custom module. 
from modules.process_tickerdata import * 
from modules.process_newsdata import *
from modules.consolidate_eventdates import * 
from modules.compute_aggregations import * 

# Recession. 
from config.config import RECESSIONS 

# Clear harmless warnings printed while importing. `clear_output` is taken from
# the public `IPython.display` module; importing it from `IPython.core.display`
# is deprecated and itself emits a warning.
from IPython.display import clear_output
clear_output()

## Configurations (general). 

In [2]:
# Pandas display limits, one option per call: rows, columns, characters per cell.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 200)

# Clear anything this cell has printed so far so the cell renders without output. Cosmetic only.
clear_output()

# Data Preparation. 

## Read Ticker Data. 

In [3]:
# When True, the ticker data is read from the already-processed CSV file.
use_csv = True

# Load and process the ticker data for the analysis window.
df_tickers = ProcessTickerData(
    use_csv=use_csv,
    start_date="1998-12-01",
    end_date="2021-12-17",
)

# Persist the result only when it was not read from the processed CSV.
if not use_csv:
    df_tickers.write_to_csv()

# Preview of the underlying DataFrame.
df_tickers.df

Read from (sector_price_history_processed_stg_1.csv)


Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits,ticker,price_chg_c2o,price_chg_o2c,price_chg_c2c,volume_rollmed,volume_diff_to_med,volume_pchg_from_med,bo_upper,bo_lower,tscore_bo,tscore_c2o,tscore_o2c,tscore_c2c,vix_open,vix_close,vix_chg_c2c,vix_tscore_c2c
0,1998-12-22,12.09,12.09,11.97,12.02,55887,0.0,0.0,XLF,,-0.005790,,,,,,,,,,,24.05,22.78,-0.045264,
1,1998-12-23,11.97,12.20,11.97,12.20,78784,0.0,0.0,XLF,-0.004160,0.019215,0.014975,,,,,,,,,,21.89,20.21,-0.112818,
2,1998-12-24,12.20,12.28,12.16,12.28,43824,0.0,0.0,XLF,0.000000,0.006557,0.006557,,,,,,,,,,21.00,21.48,0.062840,
3,1998-12-28,12.27,12.27,12.09,12.12,51948,0.0,0.0,XLF,-0.000814,-0.012225,-0.013029,,,,,,,,,,22.92,23.50,0.094041,
4,1998-12-29,12.13,12.25,11.99,12.25,100819,0.0,0.0,XLF,0.000825,0.009893,0.010726,,,,,,,,,,23.68,22.18,-0.056170,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56912,2021-12-10,56.70,56.74,55.50,56.51,19524200,0.0,0.0,XLE,0.010875,-0.003351,0.007488,26789050.0,-7264850.0,-0.271187,61.999383,25.533321,1.397864,0.717619,-0.214470,0.259242,21.27,18.69,-0.133920,-2.533107
56913,2021-12-13,56.00,56.17,54.67,54.94,28604000,0.0,0.0,XLE,-0.009025,-0.018929,-0.027783,27344050.0,1259950.0,0.046078,62.064044,25.594697,1.218627,-0.814246,-1.138809,-1.347036,19.29,20.31,0.086677,-0.008127
56914,2021-12-14,54.60,55.55,54.52,54.71,28798800,0.0,0.0,XLE,-0.006189,0.002015,-0.004186,27952200.0,846600.0,0.030287,62.125984,25.655071,1.186641,-0.592534,0.112473,-0.267460,19.67,21.89,0.077794,-0.105594
56915,2021-12-15,54.63,54.77,53.23,54.45,33858100,0.0,0.0,XLE,-0.001462,-0.003295,-0.004752,28378450.0,5479650.0,0.193092,62.182052,25.713892,1.151912,-0.229931,-0.199923,-0.289205,21.60,19.29,-0.118776,-2.363724


# Data Aggregation. 

## Compute Aggregations. 

In [4]:
# When True, each stage below reads its processed CSV file.
use_csv = True

# Stage 1: news headlines.
df_news_headlines = ProcessNewsData(use_csv=use_csv)

# Stage 2: ticker data consolidated with all the occurring event dates.
df_tickers_with_events = ConsolidateDates(use_csv=use_csv)

# Stage 3: aggregations computed on top of the consolidated event dates.
df_aggregated_data = AggregateMeasures(
    use_csv=use_csv,
    ticker_event_dates=df_tickers_with_events,
)

# Persist every stage, in pipeline order, only when nothing was read from CSV.
if not use_csv:
    for stage in (df_news_headlines, df_tickers_with_events, df_aggregated_data):
        stage.write_to_csv()

# Preview of the aggregated DataFrame.
df_aggregated_data.df

Read from (news_headline_keywords.csv)
Read from (sector_price_history_processed_stg_1.csv)
Loading Event Dates
Read from (observance_dates_ext.csv)
Read from (santa_rally.csv)
Read from (triple_witching_week.csv)
Read from (economic_reported_date.csv)
Read from (news_headline_keywords.csv)
Read from (sector_price_history_processed_stg_2.csv)
Read from (sector_price_history_processed_stg_3.csv)


Unnamed: 0,ticker,factor,price_chg_c2o_dir_0,price_chg_c2o_dir_1,price_chg_o2c_dir_0,price_chg_o2c_dir_1,price_chg_c2c_dir_0,price_chg_c2c_dir_1,volume_pchg_from_med_dir_0,volume_pchg_from_med_dir_1,price_chg_c2o_mag_0,price_chg_c2o_mag_1,price_chg_c2o_mag_diff,tscore_bo_mag_0,tscore_bo_mag_1,tscore_bo_mag_diff,tscore_c2o_mag_0,tscore_c2o_mag_1,tscore_c2o_mag_diff,vix_tscore_c2c_mag_0,vix_tscore_c2c_mag_1,vix_tscore_c2c_mag_diff,tscore_c2c_mag_0,tscore_c2c_mag_1,tscore_c2c_mag_diff,tscore_o2c_mag_0,tscore_o2c_mag_1,tscore_o2c_mag_diff,vix_chg_c2c_mag_0,vix_chg_c2c_mag_1,vix_chg_c2c_mag_diff,price_chg_o2c_mag_0,price_chg_o2c_mag_1,price_chg_o2c_mag_diff,price_chg_c2c_mag_0,price_chg_c2c_mag_1,price_chg_c2c_mag_diff
0,XHB,black_friday,0.536064,0.526786,0.488540,0.553571,0.508243,0.500000,0.501318,0.392857,0.006554,0.007506,0.000952,1.273022,1.417851,0.144829,0.711433,0.809045,0.097612,1.191377,1.195270,0.003893,0.747041,0.825683,0.078642,0.748507,0.793562,0.045055,0.055074,0.059339,0.004265,0.012309,0.013863,0.001555,0.013898,0.016297,0.002399
1,XLB,black_friday,0.531389,0.515528,0.494666,0.546584,0.518762,0.583851,0.505510,0.472050,0.005547,0.006103,0.000556,1.287871,1.449058,0.161188,0.712826,0.751304,0.038479,1.190377,1.141885,-0.048492,0.745100,0.781193,0.036093,0.738771,0.699846,-0.038925,0.050670,0.051788,0.001118,0.008967,0.008262,-0.000705,0.010722,0.011354,0.000632
2,XLE,black_friday,0.534412,0.453416,0.496622,0.540373,0.512716,0.515528,0.518699,0.546584,0.006508,0.008352,0.001844,1.310783,1.217957,-0.092826,0.750192,0.940436,0.190244,1.190377,1.141885,-0.048492,0.763903,0.908488,0.144586,0.775494,0.755159,-0.020335,0.050670,0.051788,0.001118,0.010443,0.010724,0.000281,0.012559,0.015386,0.002827
3,XLF,black_friday,0.513783,0.515528,0.490932,0.472050,0.505780,0.527950,0.514905,0.416149,0.006599,0.007739,0.001140,1.287749,1.488940,0.201191,0.688748,0.734744,0.045996,1.190377,1.141885,-0.048492,0.732599,0.811003,0.078404,0.736092,0.678162,-0.057931,0.050670,0.051788,0.001118,0.009676,0.009412,-0.000264,0.011646,0.013643,0.001997
4,XLI,black_friday,0.528721,0.521739,0.505156,0.540373,0.532278,0.534161,0.534598,0.422360,0.005524,0.006020,0.000496,1.344455,1.445484,0.101029,0.726463,0.756200,0.029737,1.190377,1.141885,-0.048492,0.738508,0.752701,0.014193,0.739519,0.726363,-0.013155,0.050670,0.051788,0.001118,0.007803,0.007556,-0.000247,0.009330,0.009542,0.000212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,XLP,news_rate_hikes,0.511193,0.387387,0.503172,0.567568,0.515248,0.576577,0.522829,0.531532,0.003840,0.002395,-0.001445,1.390170,1.309677,-0.080493,0.689721,0.692892,0.003171,1.192091,1.040799,-0.151292,0.729532,0.819551,0.090019,0.728115,0.901411,0.173296,0.050741,0.048669,-0.002072,0.006058,0.005639,-0.000419,0.006782,0.005907,-0.000875
612,XLU,news_rate_hikes,0.536577,0.414414,0.489602,0.576577,0.527411,0.531532,0.524799,0.549550,0.004001,0.002754,-0.001247,1.375913,1.158757,-0.217156,0.682925,0.868598,0.185673,1.192091,1.040799,-0.151292,0.736987,0.974489,0.237502,0.737926,0.957505,0.219579,0.050741,0.048669,-0.002072,0.007626,0.007803,0.000177,0.008369,0.008557,0.000188
613,XLY,news_rate_hikes,0.531641,0.504505,0.519210,0.567568,0.527587,0.540541,0.517637,0.477477,0.005497,0.003592,-0.001905,1.386603,1.606712,0.220109,0.696387,0.733928,0.037541,1.192091,1.040799,-0.151292,0.744247,0.709998,-0.034249,0.743044,0.723902,-0.019142,0.050741,0.048669,-0.002072,0.008194,0.005479,-0.002715,0.009742,0.006355,-0.003387
614,XRT,news_rate_hikes,0.540919,0.414414,0.496701,0.558559,0.521119,0.522523,0.499730,0.549550,0.006058,0.004703,-0.001355,1.371267,1.267802,-0.103465,0.718394,0.812336,0.093943,1.193676,1.040799,-0.152877,0.772600,0.846924,0.074324,0.780204,0.798513,0.018309,0.055563,0.048669,-0.006894,0.010187,0.008231,-0.001955,0.011884,0.009788,-0.002096


## Identify Convergence Across Different Metrics. 

In [5]:
# Derive the convergence table from the aggregated measures. The preview shows
# 0.0/1.0 flags per metric column for every ticker/factor pair.
# NOTE: `identify_covergence` (sic) is the method name exposed by the project's
# aggregation object; the spelling here must match it.
df_identify_convergence = df_aggregated_data.identify_covergence()
df_identify_convergence

Unnamed: 0,ticker,factor,price_chg_c2o_dir_0,price_chg_c2o_dir_1,price_chg_o2c_dir_0,price_chg_o2c_dir_1,price_chg_c2c_dir_0,price_chg_c2c_dir_1,volume_pchg_from_med_dir_0,volume_pchg_from_med_dir_1,price_chg_c2o_mag_0,price_chg_c2o_mag_1,tscore_bo_mag_0,tscore_bo_mag_1,tscore_c2o_mag_0,tscore_c2o_mag_1,vix_tscore_c2c_mag_0,vix_tscore_c2c_mag_1,tscore_c2c_mag_0,tscore_c2c_mag_1,tscore_o2c_mag_0,tscore_o2c_mag_1,vix_chg_c2c_mag_0,vix_chg_c2c_mag_1,price_chg_o2c_mag_0,price_chg_o2c_mag_1,price_chg_c2c_mag_0,price_chg_c2c_mag_1
0,XHB,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,XLB,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,XLE,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,XLF,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,XLI,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,XLP,news_rate_hikes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
612,XLU,news_rate_hikes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
613,XLY,news_rate_hikes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
614,XRT,news_rate_hikes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Columns shown in the preview at the end of this cell.
cols = [
    "ticker",
    "factor",
    "tscore_c2o_mag_1",
    "tscore_c2c_mag_1",
    "tscore_bo_mag_1",
    "vix_tscore_c2c_mag_1",
    "price_chg_c2o_dir_1",
    "price_chg_c2c_dir_1",
    "volume_pchg_from_med_dir_1",
]

# A ticker/factor row counts as converging when both the close-to-close
# t-score magnitude flag and the volume direction flag equal 1.0.
boo_conditions = (
    df_identify_convergence["tscore_c2c_mag_1"].eq(1.0)
    & df_identify_convergence["volume_pchg_from_med_dir_1"].eq(1.0)
)

# Flag the rows: 1 where the conditions hold, 0 everywhere else.
df_identify_convergence["influential"] = boo_conditions.astype("int64")

# Preview only the flagged rows.
df_identify_convergence.loc[boo_conditions, cols + ["influential"]]

Unnamed: 0,ticker,factor,tscore_c2o_mag_1,tscore_c2c_mag_1,tscore_bo_mag_1,vix_tscore_c2c_mag_1,price_chg_c2o_dir_1,price_chg_c2c_dir_1,volume_pchg_from_med_dir_1,influential
2,XLE,black_friday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
23,XLB,columbus,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
24,XLE,columbus,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
35,XLE,cyber_monday,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
265,XLB,ism_pmi_manufacturer,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
268,XLI,ism_pmi_manufacturer,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
490,XLP,gdp_advance_us,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
517,XHB,fomc_presscf,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1
518,XLB,fomc_presscf,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
519,XLE,fomc_presscf,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [7]:
# Collect identified factors. 
identified_factors = df_identify_convergence.loc[boo_conditions, "factor"].unique()

# Preview. 
identified_factors

array(['black_friday', 'columbus', 'cyber_monday', 'ism_pmi_manufacturer',
       'gdp_advance_us', 'fomc_presscf', 'opec', 'news_interest_rate',
       'news_rate_hikes'], dtype=object)

# Visualise Aggregates. 

## Visualisation Functions. 

In [8]:
def plot_heatmap(df:pd.DataFrame, x:str, y:str, z:str, factors:list, zlim:list=None, format_text:str=".0f"): 
    """Draw an annotated heatmap of `z` over the `x` / `y` categories.

    Parameters
    ----------
    df : DataFrame holding a "factor" column plus the `x`, `y` and `z` columns.
    x, y : names of the columns encoded as nominal axes.
    z : name of the column encoded as a quantitative colour and as the cell text.
    factors : only rows whose "factor" value is in this collection are plotted.
    zlim : two-element colour-scale domain; defaults to [0, 1.5] when omitted.
    format_text : number format applied to the cell text and the tooltip.

    Returns
    -------
    An interactive Altair chart layering the coloured rectangles and the text.
    """

    # Build the default domain per call rather than sharing one list object
    # across calls through a mutable default argument.
    if zlim is None:
        zlim = [0, 1.5]

    # Keep only the requested factors and the three plotted columns.
    df_fil = df.loc[df["factor"].isin(factors), [x,y,z]] 

    # Base encoding shared by the rectangle and text layers.
    base = alt.Chart(df_fil) \
        .encode(
            x=alt.X(
                f"{x}:N", 
                axis=alt.Axis(title="", titleFontSize=14, labelFontSize=10, labelAngle=0), 
            ),
            y=alt.Y(
                f"{y}:N", 
                axis=alt.Axis(title="", titleFontSize=14, labelFontSize=10), 
            ), 
            tooltip=[
                alt.Tooltip(f"{z}:Q", title=z, format=format_text), 
            ], 
        ) \
        .properties(height=200, width=400) 

    # Coloured cells. 
    heatmap = base \
        .mark_rect(opacity=1) \
        .encode(
            color=alt.Color(
                f"{z}:Q", 
                scale=alt.Scale(domain=zlim, scheme="blues", reverse=False),
                legend=alt.Legend(direction="vertical"), 
            )
        ) 

    # Value annotation drawn on top of each cell. 
    text = base \
        .mark_text(baseline="middle") \
        .encode(text=alt.Text(f"{z}:Q", format=format_text)) 

    return (heatmap + text).interactive() 

In [9]:
def plot_timeseries(df:pd.DataFrame, x:str, tickers:list, factor:str, measure:str, format_text:str=".1f"): 
    """Stack one scatter panel per ticker showing `measure` on the rows where `factor` equals 1.

    Each panel plots `x` (temporal) against `measure` (quantitative) as orange
    points, adds black horizontal rules at +1 and -1, and shades the
    "DotCom 2001", "DebtCrisis 2008" and "Covid 2019" entries of RECESSIONS
    between their `date_start` and `date_end`. `format_text` formats the
    tooltip value. Returns the vertically concatenated interactive panels.
    """

    # Reference frames shared by every panel.
    recession_periods = pd.DataFrame(RECESSIONS)
    unit_band = pd.DataFrame(data={"upper": [1], "lower": [-1]})

    # Columns carried into each panel's data.
    selected_cols = [x, "ticker", factor, measure]

    def _panel(ticker):
        # Rows of this ticker on which the factor is flagged.
        on_event = (df[factor] == 1) & (df["ticker"] == ticker)
        event_rows = df.loc[on_event, selected_cols]

        x_enc = alt.X(
            f"{x}:T",
            axis=alt.Axis(title="", titleFontSize=10, labelFontSize=10, labelAngle=0),
        )
        y_enc = alt.Y(
            f"{measure}:Q",
            axis=alt.Axis(title=measure, titleFontSize=10, labelFontSize=10),
        )
        hover = [alt.Tooltip(f"{measure}:Q", title=measure, format=format_text)]

        # Scatter layer.
        layered = alt.Chart(event_rows) \
            .encode(x=x_enc, y=y_enc, tooltip=hover) \
            .properties(title=ticker, height=75, width=600) \
            .mark_point(opacity=1, color="orange", filled=True, size=50)

        # Horizontal rules at the upper and lower reference levels.
        for bound in ("upper", "lower"):
            layered = layered + alt.Chart(unit_band) \
                .mark_rule(color="black") \
                .encode(y=bound)

        # Shaded recession periods, layered in the same order as listed.
        for label in ("DotCom 2001", "DebtCrisis 2008", "Covid 2019"):
            layered = layered + alt.Chart(recession_periods) \
                .transform_filter(alt.datum.recession == label) \
                .encode(x="date_start:T", x2="date_end:T") \
                .mark_rect(color="black", opacity=.2)

        return layered.interactive()

    # Concatenate the panels vertically, one per ticker.
    stacked = alt.vconcat()
    for ticker in tickers:
        stacked &= _panel(ticker)

    return stacked

In [10]:
def plot_boxplot(df:pd.DataFrame, tickers:list, factor:str, measure:str, format_text:str=".1f", max_rows:int=5000, random_state:int=42): 
    """Stack one horizontal boxplot panel per ticker: `measure` split by `factor`.

    Parameters
    ----------
    df : DataFrame holding a "ticker" column plus the `factor` and `measure` columns.
    tickers : tickers to plot, one panel each.
    factor : column encoded as the nominal y axis (one box per distinct value).
    measure : column encoded as the quantitative x axis.
    format_text : currently unused; kept so existing calls remain valid.
    max_rows : largest number of rows passed to a single panel. A ticker with
        more rows is randomly down-sampled to this size; a ticker with fewer
        rows is plotted in full instead of raising, as an unconditional
        `.sample(5000)` would.
        NOTE(review): 5000 presumably mirrors Altair's default row limit — confirm.
    random_state : seed for the down-sampling so that re-running gives the same chart.

    Returns
    -------
    The vertically concatenated interactive panels.
    """

    # For concatenating multiple visuals. 
    combined_plot = alt.vconcat() 

    # Columns to visualise.
    cols_to_vis = ["ticker", factor, measure]

    # Plot multiple visuals. 
    for ticker in tickers: 
        df_fil = df.loc[df["ticker"] == ticker, cols_to_vis] 

        # Down-sample only when needed: sampling more rows than exist
        # (without replacement) raises ValueError.
        if len(df_fil) > max_rows: 
            df_fil = df_fil.sample(n=max_rows, random_state=random_state) 

        # Base encoding. 
        base = alt.Chart(df_fil) \
            .encode(
                x=alt.X(
                    f"{measure}:Q", 
                    axis=alt.Axis(title=measure, titleFontSize=10, labelFontSize=10), 
                ), 
                y=alt.Y(
                    f"{factor}:N",  
                    axis=alt.Axis(title=factor, titleFontSize=10, labelFontSize=10, labelAngle=0), 
                ), 
            ) \
            .properties(title=ticker, height=100, width=400) 

        # Visualisation approach. 
        box = base.mark_boxplot() 

        combined_plot &= (box).interactive() 

    return combined_plot

## Identified Factors. 

In [11]:
# Which ticker/factor pairs carry the `influential` flag (0 or 1).
plot_heatmap(
    df_identify_convergence,
    x="ticker",
    y="factor",
    z="influential",
    factors=identified_factors,
    zlim=[0, 1.5],
    format_text=".0f",
)

## Analyse The Magnitude Of T-Scores For Price & VIX. 

In [12]:
# `tscore_c2c_mag_1` per ticker/factor, colour domain 0.9 to 1.2.
plot_heatmap(
    df_aggregated_data.df,
    x="ticker",
    y="factor",
    z="tscore_c2c_mag_1",
    factors=identified_factors,
    zlim=[0.9, 1.2],
    format_text=".1f",
)

In [13]:
# `vix_tscore_c2c_mag_1` per ticker/factor, colour domain 1.0 to 1.3.
plot_heatmap(
    df_aggregated_data.df,
    x="ticker",
    y="factor",
    z="vix_tscore_c2c_mag_1",
    factors=identified_factors,
    zlim=[1.0, 1.3],
    format_text=".1f",
)

## Analyse The Probability Of Volume Rising Above Median. 

In [14]:
# `volume_pchg_from_med_dir_1` per ticker/factor, colour domain 0.3 to 0.7.
plot_heatmap(
    df_aggregated_data.df,
    x="ticker",
    y="factor",
    z="volume_pchg_from_med_dir_1",
    factors=identified_factors,
    zlim=[.3, .7],
    format_text=".1f",
)

## Analyse The Effect Of FOMC Over Time. 

In [15]:
# Every ticker present in the consolidated data, on `fomc_presscf` dates.
tickers = df_tickers_with_events.df["ticker"].unique()
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="fomc_presscf",
    measure="tscore_c2c",
    format_text=".1f",
)

## Analyse The Effect Of ISM PMI Manufacturer Over Time. 

In [16]:
# XLB and XLI only, on `ism_pmi_manufacturer` dates.
tickers = ["XLB", "XLI"]
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="ism_pmi_manufacturer",
    measure="tscore_c2c",
    format_text=".1f",
)

In [17]:
# Same `ism_pmi_manufacturer` view, extended to every ticker for comparison.
tickers = df_tickers_with_events.df["ticker"].unique()
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="ism_pmi_manufacturer",
    measure="tscore_c2c",
    format_text=".1f",
)

## Analyse The Effect Of OPEC Over Time. 

In [18]:
# Selected tickers on `opec` dates.
tickers = ["XLB", "XLE", "XLF", "XLK", "XLP", "XLU"]
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="opec",
    measure="tscore_c2c",
    format_text=".1f",
)

## Analyse The Effect Of Black Friday. 

In [19]:
# XLE, XLP and XLY only, on `black_friday` dates.
tickers = ["XLE", "XLP", "XLY"]
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="black_friday",
    measure="tscore_c2c",
    format_text=".1f",
)

In [20]:
# Same `black_friday` view, extended to every ticker for comparison.
tickers = df_tickers_with_events.df["ticker"].unique()
plot_timeseries(
    df_tickers_with_events.df,
    x="date",
    tickers=tickers,
    factor="black_friday",
    measure="tscore_c2c",
    format_text=".1f",
)

## Analyse The Distribution Of ISM PMI Manufacturer. 

In [21]:
# Distribution of `tscore_c2c` split by the `ism_pmi_manufacturer` flag, for XLB and XLI.
tickers = ["XLB", "XLI"]
plot_boxplot(
    df_tickers_with_events.df,
    tickers=tickers,
    factor="ism_pmi_manufacturer",
    measure="tscore_c2c",
    format_text=".1f",
)

# Rule's Rules For Computational Narratives. 

- __Rule 2: Document the process, not just the results:__ The processes for the initial investigation, data processing, and analysis are documented in addition to presenting the results. Comments are also included to explain the specific steps or breakdown.  

- __Rule 3: Use cell divisions to make steps clear:__ The processes for the initial investigation, data processing, and analysis are broken into separate cells to ensure that each process is digestible, understandable, and organised. All the configurations and functions related to each process are organised into a single cell to make it easier to locate them. Comments are given for each configuration and function to articulate the purpose and steps. 

- __Rule 4: Modularize code:__ Functions are used to adhere to the DRY (Don't Repeat Yourself) principle. Each function serves only one specific purpose, so that its functionality is clear when readers read or call the function by name. The functions can be used together with other functions or reconfigured with different parameters since they are generalised. 

- __Rule 5: Record dependencies:__ The dependencies are recorded inside the `Pipfile` and `Pipfile.lock`. Readers or researchers can easily install all the libraries in their own virtual environment using `pipenv` or `venv`. A virtual environment and a package manager are important to avoid polluting your own base libraries or causing dependency conflicts. I have also included a `Dockerfile` as an alternative option for readers or researchers to kickstart the project easily. However, they need a publicly accessible IP, using a tool such as `ngrok`, to access JupyterLab in a browser outside of the container. Docker is the safer option in case some libraries are affected by differences between operating systems. 

- __Rule 8: Share and explain your data:__ The data (unprocessed version) is shared with the lecturers or graders so that they can reproduce the results. A brief explanation of the data is provided to help readers familiarise themselves with its nature, limitations, assumptions, definitions, units of measurement, and other details, and to help them better evaluate the reasoning behind the data processing and analysis. 

- __Rule 9: Design your notebooks to be read, run, and explored:__ Readers or researchers can explore the notebook with less difficulty since the processes are organised into separate cells and modularised into separate functions. They can reconfigure the settings, tweak the functions, and print the results in between the cells to evaluate the processes, learn about the analysis, or explore a different direction. Brief installation steps and dependency requirements are provided to help them start and replicate the project easily. 