# Feature Engineering

In this notebook, we extend the main data frame with features that may be relevant for drawing conclusions and making predictions from the data.

The time series composition will be added in the next notebook.

In [None]:
import numpy as np
import pandas as pd
import pandas_ta as ta

from signal_sigma.config.cfg_legacy import *

In [None]:
# Load the main data frame that all feature cells below extend.

# Relative path of the CSV file, as expected by the project's load helper.
relpath = "main.csv"

# Passed to the load/store helpers together with the relative path
# (presumably this notebook's sequence number -- TODO confirm).
NB_NUMBER = 4

# Original data frame
df = load_df_from_csv(relpath, NB_NUMBER)

# Show (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Time Related Features -- 1
#
# Calendar features derived from the "date" column through its `.dt`
# accessor (so the column has to be datetime-like).

date_parts = df["date"].dt

# Day of week: 0 = Monday, ..., 6 = Sunday
df["weekday"] = date_parts.dayofweek

# ISO calendar week (1-53)
df["calendar_week"] = date_parts.isocalendar().week

# Month (1 = January, ..., 12 = December), quarter (1-4) and year
for part in ("month", "quarter", "year"):
    df[part] = getattr(date_parts, part)

# 0/1 flags for the last calendar day of the month, quarter and year
for period in ("month", "quarter", "year"):
    flag = f"is_{period}_end"
    df[flag] = getattr(date_parts, flag).astype(int)

In [None]:
# Time Related Features -- 2

# # WARNING: Not tested yet!

# # NOTE: Holidays were already filtered out at an earlier stage
# # (by filtering a subframe and inner joining the frames), but this
# # step perhaps fits better here.

# # US holidays indicator
# us_holidays = holidays.US()
# df["is_holiday"] = df["date"].apply(lambda dt: dt in us_holidays)

# # Trading day indicator
# df["is_trading_day"] = (~df["is_holiday"]) & (~df["weekday"].isin([5, 6]))

# # Last trading day of month, quarter, year
# # NOTE: It is decisive that the rows are already
# # ordered by ascending date!

# PERIODS = {
#     "M": "month",
#     "Q": "quarter",
#     "Y": "year",
# }

# is_trading_day = df["is_trading_day"] == True
# # Dates that are trading days
# left = df.loc[is_trading_day, "date"]

# for code, period in PERIODS.items():
#     # Corresponding maximum of (trading day) date when grouped
#     # by period.
#     right = df.loc[is_trading_day].groupby(
#         df.loc[is_trading_day, "date"].dt.to_period(code)
#     )["date"].transform("max")
#     # Whether the date coincides with the maximum date in its period group.
#     df.loc[is_trading_day, f"is_trading_{period}_end"] = (left == right).astype(int)
#     # Assign 0 (false) to non-trading dates (not yet addressed).
#     df[f"is_trading_{period}_end"] = df[f"is_trading_{period}_end"].fillna(0).astype(int)

In [None]:
# Investment -- Total and per Stock ratios

invest_cols = cartprod("invest", STOCK_TICKERS)
ratio_cols = cartprod("invest", STOCK_TICKERS, "ratio")

# Per-stock investment columns, read once and reused for both features.
invest = df[invest_cols]

# Total investment: row-wise sum over all per-stock investment columns.
df["invest_total"] = invest.sum(axis=1)

# Per-stock share of the total, divided row by row (axis=0 aligns on the index).
# NOTE: rows whose total is 0 produce NaN/inf ratios here.
df[ratio_cols] = invest.div(df["invest_total"], axis=0)

In [None]:
# Investment -- Differencing and Lagging

# Row-to-row change of every per-stock investment ratio.
ratio_cols = cartprod("invest", STOCK_TICKERS, "ratio")
ratio_diff_cols = cartprod(ratio_cols, "diff")
df[ratio_diff_cols] = df[ratio_cols].diff()

# Lag, difference and 5-row rolling mean of the total investment.
invest_total = df["invest_total"]
df["invest_total_lag_1"] = invest_total.shift(1)
df["invest_total_diff"] = invest_total.diff()
df["invest_total_rolling_mean_5"] = invest_total.rolling(window=5).mean()

In [None]:
# Primary (Backward) Differencing for Some Macroeconomical Indicators
#
# For each column listed below, a companion column built via
# cartprod(<columns>, "diff") receives the row-to-row difference.

macro_cols = [
    "cpi",
    "fed_rate",
    "consumer_confidence",
    "vix_index",
    "oil",
    "nonfarm_payrolls",
    "treasury_yield",
    "industrial_production",
    "retail_sales",
    "pmi",
    "s&p500_index",
    "dow_jones_index",
    "nasdaq_composite",
    "russell2000_index",
    "dollar_index_dxy",
    "gold_futures",
    "wti_oil_futures",
    "copper_futures",
    "brent_crude_futures",
    "tech_sector_etf",
    "energy_sector_etf",
    "financial_sector_etf",
    "consumerdiscretionary_etf",
    "lithium_etf",
    "semiconductor_etf",
    "electricity_proxy",
]

macro_diff_cols = cartprod(macro_cols, "diff")
df[macro_diff_cols] = df[macro_cols].diff()

In [None]:
# Stock Feature Parameters
#
# This cell used to be a "Test" copy of the first part of the
# "Stock Specific Columns" cell below: it computed the same diff, rolling
# and lag columns from the same unchanged price columns, and the cell below
# then recomputed and overwrote every one of them with identical values.
# The duplicated computation is removed; only the parameters remain, so the
# resulting data frame is unchanged.

# Shifts (in rows) used for the lagged price features.
STOCK_LAGS = [1, 3, 5, 10]

# Per-ticker price columns the features are derived from.
STOCK_COLS = ["close", "open", "high", "low"]

# Window sizes (in rows) for the rolling mean / standard deviation.
ROLLING_WINDOWS = [5, 20]

In [None]:
# Stock Specific Columns
#
# Appends per-ticker features in this order: price differences, rolling
# mean/std per window, lagged prices, RSI, MACD, log volume and its
# difference, and finally differences of further per-stock columns.

STOCK_COLS = ["close", "open", "high", "low"]
STOCK_LAGS = [1, 3, 5, 10]
ROLLING_WINDOWS = [5, 20]

# Columns read from the frame returned by ta.macd, keyed by the suffix
# used for the corresponding new column in df.
MACD_PARTS = {
    "MACD": "MACD_12_26_9",
    "MACD_signal": "MACDs_12_26_9",
    "MACD_hist": "MACDh_12_26_9",
}

price_cols = cartprod(STOCK_COLS, STOCK_TICKERS)
# The price columns are only read below, so one extraction serves all features.
prices = df[price_cols]

# Primary (backward) difference
price_diff_cols = cartprod(price_cols, "diff")
df[price_diff_cols] = prices.diff()

# Rolling mean and standard deviation (mean first, then std, per window)
for window in ROLLING_WINDOWS:
    rolled = prices.rolling(window=window)

    mean_cols = cartprod(price_cols, f"rolling_mean_{window}")
    df[mean_cols] = rolled.mean()

    std_cols = cartprod(price_cols, f"rolling_std_{window}")
    df[std_cols] = rolled.std()

# Lagged prices
for lag in STOCK_LAGS:
    lag_cols = cartprod(price_cols, f"lag_{lag}")
    df[lag_cols] = prices.shift(lag)

# RSI -- Relative Strength Index
# Momentum indicator of price movements, used to judge whether an asset
# is overbought or oversold. Computed on the close price with length 14.
for ticker in STOCK_TICKERS:
    df[f"{ticker}_rsi"] = ta.rsi(df[f"close_{ticker}"], length=14)

# MACD -- Moving Average Convergence Divergence
# Compares two moving averages of the close price to analyze momentum.
for ticker in STOCK_TICKERS:
    macd_frame = ta.macd(df[f"close_{ticker}"])
    for suffix, macd_col in MACD_PARTS.items():
        df[f"{ticker}_{suffix}"] = macd_frame[macd_col]

# Volume-based features: log(1 + volume) ...
volume_cols = cartprod("volume", STOCK_TICKERS)
volume_log_cols = cartprod(volume_cols, "log")
df[volume_log_cols] = np.log1p(df[volume_cols])

# ... and the row-to-row difference of the log volume.
log_cols = cartprod("volume", STOCK_TICKERS, "log")
log_diff_cols = cartprod(log_cols, "diff")
df[log_diff_cols] = df[log_cols].diff()

# Further primary (backward) differences
cols_further = [
    "delta_price",
    "avg_price",
    "price_ratio",
    "invest",
]

further_cols = cartprod(cols_further, STOCK_TICKERS)
further_diff_cols = cartprod(further_cols, "diff")
df[further_diff_cols] = df[further_cols].diff()

In [None]:
# Column dtypes and non-null counts of the extended frame.
df.info()

# Number of missing values per column (displayed as the cell output).
df.isna().sum()

In [None]:
# Final Cleaning Step
# Differencing, rolling and shifting leave NaN values in the affected rows.
# Every row that contains at least one NaN, in any column, is removed.
df = df.dropna(axis=0, how="any")

# Shape after cleaning
print(df.shape)

In [None]:
# Store final DataFrame on disk
# The relative path is the same one the frame was loaded from at the top of
# this notebook. NOTE(review): whether this overwrites the loaded file depends
# on how the project helpers use NB_NUMBER -- confirm.

relpath = "main.csv"
store_df_as_csv(df, relpath, NB_NUMBER)