In [1]:
import pandas as pd
import openpyxl
from functools import reduce
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings('ignore')

In [2]:
raw_df = pd.read_csv('macroeconomic_indicators.csv')

## Feature Selection

PCA helps with feature selection by transforming the original features into principal components that capture the most variance in the dataset. Analyzing the principal components can provide insights into the relationships and importance of the original features, aiding in the identification of relevant features for prediction.

In [3]:
df = raw_df.drop(columns=['Date', 'Upper Band', 'Lower Band', 'Middle Band', 'SMA 10', 'SMA 50', 'SMA 200', 'EMA 10', 'EMA 50', 'EMA 200', 'DEMA 10', 'HT TRENDLINE'])

scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
pca = PCA(n_components=0.95)
principal_components = pca.fit_transform(scaled_df)
print("Number of components chosen:", pca.n_components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

def show_top_features(components, feature_names, n_top):
    for i, component in enumerate(components):
        top_feature_indices = (-component).argsort()[:n_top]
        print(f"Principal Component {i + 1}:")
        for index in top_feature_indices:
            print(f"{feature_names[index]} ({component[index]:.3f})")
        print()

show_top_features(pca.components_, df.columns, 5)


Number of components chosen: 6
Explained variance ratio: [0.54776861 0.23302722 0.10269215 0.03255343 0.02501669 0.01556821]
Principal Component 1:
XLY (0.178)
XLV (0.177)
XLP (0.177)
GOOGL (0.176)
GOOG (0.176)

Principal Component 2:
1-MTH Treasury Maturity (0.241)
4-WK Treasury Bill (0.241)
6-MTH Treasury Bill (0.238)
3-MTH Treasury Bill (0.236)
26 Wk Bank Discount (0.236)

Principal Component 3:
XLE (0.317)
Oil (0.311)
XOM (0.309)
CVX (0.202)
IYR (0.180)

Principal Component 4:
XOM (0.340)
GBP:Pound (sterling) (0.315)
XLE (0.289)
EUR:Euro (0.287)
CVX (0.272)

Principal Component 5:
Oil (0.390)
BAA Corporate Bond (0.308)
AAA Corporate Bond (0.304)
10-YR Treasury Maturity (0.267)
1-YR Treasury Bill (0.251)

Principal Component 6:
Unemployment Rate (0.451)
KRW:Won (0.325)
GBP:Pound (sterling) (0.324)
Gold (0.254)
13 Wk Coupon Equiv (0.139)



The Recursive Feature Elimination (RFE) is a linear regression model, using methods such as Lasso or Ridge regression. This approach selects the most relevant features by recursively fitting the model and removing the least important features.

In [4]:
df2 = raw_df.drop(columns=['Date', 'Upper Band', 'Lower Band', 'Middle Band', 'SMA 10', 'SMA 50', 'SMA 200', 'EMA 10', 'EMA 50', 'EMA 200', 'DEMA 10', 'HT TRENDLINE'])
X = df2.drop(columns=['S&P 500'])
y = df2['S&P 500']

lasso = Lasso(alpha=0.1)
n_features_to_select = 5
rfe = RFE(estimator=lasso, n_features_to_select=n_features_to_select)
rfe.fit(X, y)
selected_features = rfe.support_
selected_feature_names = X.columns[selected_features]
print("Selected features:", selected_feature_names)
feature_ranking = rfe.ranking_

for fr, x in zip(feature_ranking, X.columns):
    print(f'{x}, Ranking: {fr}')

Selected features: Index(['AAA Corporate Bond', '1-YR Treasury Bill', 'QQQ', 'XLF', 'XLI'], dtype='object')
AAA Corporate Bond, Ranking: 1
BAA Corporate Bond, Ranking: 49
1-MTH Treasury Maturity, Ranking: 6
5-YR Treasury Maturity, Ranking: 25
10-YR Treasury Maturity, Ranking: 35
Gold, Ranking: 36
1-YR Treasury Bill, Ranking: 1
3-MTH Treasury Bill, Ranking: 16
4-WK Treasury Bill, Ranking: 47
6-MTH Treasury Bill, Ranking: 37
Unemployment Rate, Ranking: 40
4 WK Bank Discount, Ranking: 34
4 Wk Coupon Equiv, Ranking: 33
13 Wk Bank Discount, Ranking: 50
13 Wk Coupon Equiv, Ranking: 44
26 Wk Bank Discount, Ranking: 4
26 Wk Coupon Equiv, Ranking: 9
CNY:Renminbi, Ranking: 45
JPY:Yen, Ranking: 28
EUR:Euro, Ranking: 42
GBP:Pound (sterling), Ranking: 46
INR:Indian rupee, Ranking: 19
CAD:Canadian dollar, Ranking: 48
KRW:Won, Ranking: 38
Oil, Ranking: 13
NASDAQ, Ranking: 26
AAPL, Ranking: 21
ABBV, Ranking: 14
AMZN, Ranking: 10
BTC-USD, Ranking: 43
CVX, Ranking: 23
GOOG, Ranking: 12
GOOGL, Ranking: 1