# Merge of Data Frames

In this notebook, we merge the data frames from the different sources into a single frame, joined on their common `date` column. After the merge, we perform some basic exploratory data analysis (EDA) tasks.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
from sklearn.preprocessing import StandardScaler

import importlib
import ipynbrc

In [None]:
# Reload the shared notebook configuration module so that edits made to
# `ipynbrc` take effect without restarting the kernel, then re-bind its
# public names in this namespace.
importlib.reload(ipynbrc)
# NOTE(review): the star import hides where names used further down
# (STOCK_TICKERS, DIR_DATA_STOCKS, load_df_from_csv, store_df_as_csv,
# cartprod, IDX, ...) come from. Presumably `os` is also re-exported by
# `ipynbrc`, since it is used below without an explicit import -- confirm.
from ipynbrc import *

# Notebook number; passed to load_df_from_csv / store_df_as_csv below.
NB_NUMBER = 3

In [None]:
# Load all data frames
#
# Every frame collected here is merged on its "date" column afterwards.
# The column is therefore converted to datetime for *all* frames (not only
# the per-stock ones) so that the merge keys are guaranteed to share one
# dtype. The conversion is a no-op for columns that are already datetime.

dfs_0 = []

# Frames for single stocks: one CSV per ticker, named "<ticker>.csv"
# (lower case) inside DIR_DATA_STOCKS.
for ticker in STOCK_TICKERS:
    csvname = ticker.lower() + ".csv"
    relpath = os.path.join(DIR_DATA_STOCKS, csvname)
    df_0 = load_df_from_csv(relpath, NB_NUMBER)
    df_0["date"] = pd.to_datetime(df_0["date"])
    dfs_0.append(df_0)

# Frame containing macro indicators, followed by the frame containing
# FED data. The append order matches the order of the paths below.
for relpath in (RELPATH_DATA_YF_MIF, RELPATH_DATA_FED_CEI):
    df_0 = load_df_from_csv(relpath, NB_NUMBER)
    df_0["date"] = pd.to_datetime(df_0["date"])
    dfs_0.append(df_0)

In [None]:
# Merge all data frames on date


def _merge_on_date(df_left, df_right):
    """Join two frames on "date", keeping only dates present in both."""
    # XXX: Why inner join?
    return pd.merge(df_left, df_right, on="date", how="inner")


# Fold the list pairwise, left to right, into one merged frame.
df = reduce(_merge_on_date, dfs_0)
df.index.name = IDX

# Persist the merged frame.
relpath = "main.csv"
store_df_as_csv(df, relpath, NB_NUMBER)

## Stocks from `yfinance`

In [None]:
# Plot date vs. each investment per stock

# Columns to draw; presumably one "invest" column per ticker -- the exact
# naming is decided by cartprod.
cols = cartprod("invest", STOCK_TICKERS)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df["date"], df[cols])
ax.set_xlabel("Date")
ax.set_ylabel("Investment Value")
ax.set_title("Investment over Time per Stock")
# Legend entries are the tickers, in the same order as the plotted columns.
ax.legend(STOCK_TICKERS, loc="upper left")
ax.grid(True)
# Adjust layout to prevent overlapping labels
fig.tight_layout()
plt.show()

## Market Indices/Indexoids from `yfinance`

In [None]:
# Prepare data subframe

# Market index / index-like columns selected from the merged frame.
index_columns = [
    "s&p500_index",
    "dow_jones_index",
    "nasdaq_composite",
    "russell2000_index",
    "vix_index",
    "dollar_index_dxy",
    "gold_futures",
    "wti_oil_futures",
    "copper_futures",
    "brent_crude_futures",
    "tech_sector_etf",
    "energy_sector_etf",
    "financial_sector_etf",
    "consumerdiscretionary_etf",
    "lithium_etf",
    "semiconductor_etf",
    "electricity_proxy",
]

# For plotting convenience, use date as index.
df_idxs = df[["date", *index_columns]].set_index("date")

In [None]:
# Index ranges
#
# Tabulates the minimum, maximum and range (max - min) of every column of
# df_idxs and prints the table as markdown.

# Dedicated names instead of `min` / `max`: rebinding those would shadow the
# Python built-ins for every cell executed afterwards in this kernel.
idx_min = df_idxs.min().round(2)
idx_max = df_idxs.max().round(2)
df_summary = pd.DataFrame({
    "Index": df_idxs.columns,
    "Min": idx_min.values,
    "Max": idx_max.values,
    # Derived from the rounded bounds so that Range equals Max - Min as
    # displayed; rounded again to drop floating-point subtraction noise.
    "Range": (idx_max - idx_min).round(2).values,
})
print("\nSummary: Index Ranges\n")
print(df_summary.to_markdown(index=False))

In [None]:
# Index Time Evolution -- Raw Scale

# All index columns share one axis, drawn in their original units.
fig, ax = plt.subplots(figsize=(14, 7))
df_idxs.plot(ax=ax, linewidth=2)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Value", fontsize=12)
ax.set_title("Macro Indicators Over Time (Raw Scale)")
# Tilt the date tick labels of the current axes.
plt.xticks(rotation=45)
ax.legend(loc="upper left", fontsize=10)
ax.grid(True, linestyle="--", alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
# Index Time Evolution -- Standardised Scale

# Standardise each column with StandardScaler so that series of very
# different magnitudes become comparable on one axis. The scaler returns a
# plain array, so the column labels and the date index are re-attached.
scaler = StandardScaler()
df_idxs_scaled = pd.DataFrame(
    scaler.fit_transform(df_idxs),
    columns=df_idxs.columns,
    index=df_idxs.index,
)

fig, ax = plt.subplots(figsize=(20, 10))
df_idxs_scaled.plot(ax=ax, linewidth=2)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Scaled Value", fontsize=12)
ax.set_title("Macro Indicators Over Time (Standardised Scale)")
ax.legend(loc="upper left")
ax.grid(True, linestyle="--", alpha=0.6)
fig.tight_layout()
plt.show()


## Market Indicators from `fred`

In [None]:
# Prepare data subframe

# Market indicator columns selected from the merged frame.
indicator_columns = [
    "cpi",
    "fed_rate",
    "consumer_confidence",
    "vix_index",
    "oil",
    "nonfarm_payrolls",
    "treasury_yield",
    "industrial_production",
    "retail_sales",
    "pmi",
]

# For plotting convenience, use date as index.
df_inds = df[["date", *indicator_columns]].set_index("date")

In [None]:
# Indicator ranges
#
# Tabulates the minimum, maximum and range (max - min) of every column of
# df_inds and prints the table as markdown.

# Dedicated names instead of `min` / `max`: rebinding those would shadow the
# Python built-ins for every cell executed afterwards in this kernel.
inds_min = df_inds.min().round(2)
inds_max = df_inds.max().round(2)
df_summary = pd.DataFrame({
    "Indicator": df_inds.columns,
    "Min": inds_min.values,
    "Max": inds_max.values,
    # Derived from the rounded bounds so that Range equals Max - Min as
    # displayed; rounded again to drop floating-point subtraction noise.
    "Range": (inds_max - inds_min).round(2).values,
})
# Heading names the indicators; it previously read "Index Ranges".
print("\n**Indicator Ranges**\n")
print(df_summary.to_markdown(index=False))

In [None]:
# Indicator Time Evolution -- Raw Scale

# All indicator columns share one axis, drawn in their original units.
fig, ax = plt.subplots(figsize=(20, 10))
df_inds.plot(ax=ax, linewidth=2)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Value", fontsize=12)
ax.set_title("Indicator Time Evolution (Raw Scale)")
# Tilt the date tick labels of the current axes.
plt.xticks(rotation=45)
ax.legend(loc="upper left", fontsize=10)
ax.grid(True, linestyle="--", alpha=0.6)

fig.tight_layout()
plt.show()

In [None]:
# Indicator Time Evolution -- Standardised Scale

# Standardise each column with StandardScaler so that series of very
# different magnitudes become comparable on one axis. The scaler returns a
# plain array, so the column labels and the date index are re-attached.
scaler = StandardScaler()
df_inds_scaled = pd.DataFrame(
    scaler.fit_transform(df_inds),
    columns=df_inds.columns,
    index=df_inds.index,
)

fig, ax = plt.subplots(figsize=(20, 10))
df_inds_scaled.plot(ax=ax, linewidth=2)
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Scaled Value", fontsize=12)
ax.set_title("Indicator Time Evolution (Standardised Scale)")
ax.legend(loc="upper left")
ax.grid(True, linestyle="--", alpha=0.6)

fig.tight_layout()
plt.show()