In [116]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Embedded Method

In [117]:
# Plot correlation matrix
def correlation_matrix_plot(df, asset):
    """Draw an annotated heatmap of pairwise correlations for one asset.

    Only columns of dtype ``int64`` or ``float64`` take part in the
    correlation; every other column of ``df`` is ignored.

    Args:
        df: DataFrame holding the columns to correlate.
        asset: Label interpolated into the figure title.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    numeric_only = df.select_dtypes(include=['int64', 'float64'])
    corr = numeric_only.corr()

    plt.figure(figsize=(18, 16))
    sns.heatmap(
        corr,
        annot=True,       # print the coefficient inside every cell
        fmt=".2f",
        cmap='viridis',
        linewidths=0.9,
    )
    plt.title(f'Correlation Matrix of {asset}')
    plt.tight_layout()
    plt.show()

In [118]:
def generate_feat_sel_bar_plot(feat_importances, asset_name_title):
    """Render feature importances as a labelled bar chart.

    Args:
        feat_importances: Object exposing ``.plot(kind='bar')`` -- presumably
            the pandas Series returned by ``select_features_rf``; verify
            against the caller.
        asset_name_title: Label interpolated into the chart title.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(12, 6))
    chart_title = f'Feature Importances - {asset_name_title}'
    ax = feat_importances.plot(kind='bar', title=chart_title)
    plt.ylabel('Importance Score')
    plt.xticks(rotation=90)

    # Write each bar's value (two decimals) centred just above the bar.
    for bar in ax.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f"{bar_top:.2f}",
            (bar_centre, bar_top),
            ha='center',
            va='bottom',
            fontsize=8,
            fontweight='bold',
        )
    plt.tight_layout()
    plt.show()

In [119]:
# Registries of selected feature names, one per asset class.
# Each starts empty; entries are added under a sheet/asset key.
index_features_dict = dict()
mutual_funds_features_dict = dict()
gold_bonds_features_dict = dict()

In [120]:
def filter_features(feat_importances, threshold=0.01):
    """List the labels whose importance is strictly greater than ``threshold``.

    Args:
        feat_importances: pandas Series of scores indexed by feature name.
        threshold: Exclusive lower bound a score must exceed to be kept.

    Returns:
        list: Index labels of the surviving entries, in the Series' order.
    """
    is_above = feat_importances.gt(threshold)
    survivors = feat_importances.loc[is_above]
    return survivors.index.tolist()

In [None]:
# Feature Selection Pipeline
def select_features_rf(df, asset_name_title="Asset", asset_name='', create_features=True,
                       features_dict=None, window=14, train_fraction=0.8):
    """Rank features for predicting ``Close`` with a random forest.

    The frame is sorted by ``Date``, enriched with technical indicators,
    stripped of rows containing any NaN, standardised, and the chronologically
    first ``train_fraction`` of the rows is used to fit a
    ``RandomForestRegressor`` whose ``feature_importances_`` are returned.

    The caller's ``df`` is never modified; all work happens on a copy.

    Args:
        df: Price data with at least ``Date``, ``Open``, ``High``, ``Low`` and
            ``Close`` columns. When ``create_features`` is true it must also
            already contain ``Momentum_Indicator``, ``High_Low_Change``,
            ``High_Open_Change``, ``Low_Open_Change`` and
            ``Percent_Change_In_Price`` -- those are used but not computed here.
        asset_name_title: Label used in the printed header.
        asset_name: Key into ``features_dict``; only read when
            ``create_features`` is false.
        create_features: If true, rank the full built-in candidate list.
            If false, rank only ``features_dict[asset_name]``.
        features_dict: Mapping of asset name to a list of feature names.
            Defaults to ``None`` (treated as an empty mapping).
        window: Rolling window length for ATR, rolling std-dev and RSI.
        train_fraction: Share of the (time-ordered) rows used for fitting;
            must satisfy ``0 < train_fraction <= 1``.

    Returns:
        pandas.Series: Importances indexed by feature name, sorted descending.

    Raises:
        KeyError: ``asset_name`` is not in ``features_dict`` (when
            ``create_features`` is false), or a required column is missing.
        ValueError: ``train_fraction`` is out of range, the feature list is
            empty, or no complete rows remain for training.
    """
    if not 0 < train_fraction <= 1:
        raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction!r}")

    # 1. Preprocessing and Feature Engineering
    # assign() returns a new frame, so the caller's DataFrame (often reused by
    # later notebook cells) keeps its original 'Date' column untouched.
    prices = (
        df.assign(Date=pd.to_datetime(df['Date']))
        .sort_values('Date')
        .reset_index(drop=True)
    )
    prices = _add_technical_indicators(prices, window)

    # Define feature columns and target variable
    if create_features:
        features = ['Momentum_Indicator',
                    'High_Low_Change',
                    'High_Open_Change',
                    'Low_Open_Change',
                    'Percent_Change_In_Price',
                    'Close_High_Ratio',
                    'Close_Low_Ratio',
                    'Daily_Range',
                    'ATR',
                    'Rolling_Std_Dev',
                    'RSI',
                    'MACD',
                    'Body_Size',
                    'Upper_Wick',
                    'Lower_Wick',
        ]
    else:
        # `None` replaces the former mutable `{}` default; a missing asset
        # still surfaces as KeyError exactly as before.
        features = list((features_dict or {})[asset_name])
    target = 'Close'

    if not features:
        raise ValueError(f"No features to evaluate for {asset_name_title!r}")
    missing = [col for col in features + [target] if col not in prices.columns]
    if missing:
        raise KeyError(f"{asset_name_title}: required columns missing from df: {missing}")

    # Rolling indicators leave NaN in their warm-up rows; drop every
    # incomplete row so the estimator only sees fully populated samples.
    new_df = prices.dropna()
    X = new_df[features]
    y = new_df[target].to_numpy()

    training_size = int(len(X) * train_fraction)
    if training_size == 0:
        raise ValueError(
            f"{asset_name_title}: not enough complete rows to train on "
            f"({len(X)} row(s) left after dropping NaN)"
        )

    # Chronological split: only the earliest rows are used for fitting. The
    # scalers are fitted on that training window alone so statistics of the
    # held-out tail never leak into training. To score the hold-out, apply
    # scaler_features.transform / scaler_target.transform to the remaining rows.
    scaler_features = StandardScaler()
    scaler_target = StandardScaler()
    X_train = scaler_features.fit_transform(X.iloc[:training_size])
    # StandardScaler needs a 2-D input; the forest expects a 1-D target, so
    # flatten again to avoid scikit-learn's column-vector DataConversionWarning.
    y_train = scaler_target.fit_transform(y[:training_size].reshape(-1, 1)).ravel()

    # 2. Train a Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=123)
    rf.fit(X_train, y_train)

    # 3. Extract and Plot Feature Importances
    importances = rf.feature_importances_
    feat_importances = pd.Series(importances, index=features).sort_values(ascending=False)

    print(f"\n{asset_name_title} - Feature Importances:")
    print(feat_importances)

    # # 4. Plot feature importances as a bar chart
    # generate_feat_sel_bar_plot(feat_importances, asset_name_title)

    # # 5. Plot correlation matrix
    # correlation_matrix_plot(pd.concat([X, new_df[target]], axis=1), asset_name_title)

    return feat_importances


def _add_technical_indicators(df, window):
    """Return a copy of ``df`` with ratio, volatility, momentum and candlestick columns added."""
    out = df.copy()
    open_, high, low, close = out['Open'], out['High'], out['Low'], out['Close']

    # Position of the close relative to the day's extremes
    out['Close_High_Ratio'] = (close - high) / high
    out['Close_Low_Ratio'] = (close - low) / low

    # Volatility Features
    out['Daily_Range'] = high - low
    out['ATR'] = out['Daily_Range'].rolling(window=window).mean()
    out['Rolling_Std_Dev'] = close.rolling(window=window).std()

    # Momentum & Trend Features
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(window=window).mean()
    avg_loss = delta.clip(upper=0).abs().rolling(window=window).mean()
    # A window with neither gains nor losses gives 0/0 = NaN here; such rows
    # are removed later by the caller's dropna().
    out['RSI'] = 100 - (100 / (1 + (avg_gain / avg_loss)))
    out['MACD'] = close.ewm(span=12, adjust=False).mean() - close.ewm(span=26, adjust=False).mean()

    # Candlestick Features
    out['Body_Size'] = (close - open_).abs()
    out['Upper_Wick'] = high - out[['Open', 'Close']].max(axis=1)
    out['Lower_Wick'] = out[['Open', 'Close']].min(axis=1) - low
    return out

### Feature Selection for Indexes

In [122]:
# Load the index workbook; sheet_name=None reads every sheet into a dict
# keyed by sheet name.
index_excel_file = '../Data/Index_Data.xlsx'
all_indexes = pd.read_excel(index_excel_file, sheet_name=None)

print("### Feature Selection for Each Index ###")
for sheet_name in all_indexes:
    df = all_indexes[sheet_name]
    asset_identifier = f"Index: {sheet_name}"
    print(f"\nProcessing {asset_identifier}")

    # Rank the full candidate list, then keep features scoring above 0.01.
    feat_imp = select_features_rf(
        df,
        asset_name_title=asset_identifier,
    )
    selected_features = filter_features(feat_imp, threshold=0.01)
    index_features_dict[sheet_name] = selected_features

### Feature Selection for Each Index ###

Processing Index: NSEI

Index: NSEI - Feature Importances:
Daily_Range                0.434053
High_Low_Change            0.430273
ATR                        0.101916
MACD                       0.017914
RSI                        0.002857
Body_Size                  0.002632
Rolling_Std_Dev            0.002202
Close_High_Ratio           0.001314
Lower_Wick                 0.001273
Low_Open_Change            0.001151
Upper_Wick                 0.000989
Percent_Change_In_Price    0.000964
High_Open_Change           0.000936
Close_Low_Ratio            0.000908
Momentum_Indicator         0.000617
dtype: float64

Processing Index: NSEBANK

Index: NSEBANK - Feature Importances:
ATR                        0.411158
High_Low_Change            0.312306
Daily_Range                0.229520
MACD                       0.028679
Rolling_Std_Dev            0.002501
RSI                        0.002441
Close_High_Ratio           0.002109
Body_Size                 

### Feature Selection for Mutual Funds

In [123]:
# Load the mutual-fund workbook; sheet_name=None reads every sheet into a
# dict keyed by sheet name.
mutual_funds_excel_file = '../Data/Mutual_Funds_Data.xlsx'
all_mutual_funds = pd.read_excel(mutual_funds_excel_file, sheet_name=None)

print("### Feature Selection for Each Mutual Fund ###")
for sheet_name in all_mutual_funds:
    df = all_mutual_funds[sheet_name]
    asset_identifier = f"Mutual Fund: {sheet_name}"
    print(f"\nProcessing {asset_identifier}")

    # Rank the full candidate list, then keep features scoring above 0.08
    # (a stricter cut-off than the 0.01 used in the other selection cells).
    feat_imp = select_features_rf(
        df,
        asset_name_title=asset_identifier,
    )
    selected_features = filter_features(feat_imp, threshold=0.08)
    mutual_funds_features_dict[sheet_name] = selected_features

### Feature Selection for Each Mutual Fund ###

Processing Mutual Fund: 0P00005WL6

Mutual Fund: 0P00005WL6 - Feature Importances:
MACD                       0.360003
Rolling_Std_Dev            0.273001
RSI                        0.217891
Percent_Change_In_Price    0.149106
Momentum_Indicator         0.000000
High_Low_Change            0.000000
High_Open_Change           0.000000
Low_Open_Change            0.000000
Close_High_Ratio           0.000000
Close_Low_Ratio            0.000000
Daily_Range                0.000000
ATR                        0.000000
Body_Size                  0.000000
Upper_Wick                 0.000000
Lower_Wick                 0.000000
dtype: float64

Processing Mutual Fund: UTINEXT50

Mutual Fund: UTINEXT50 - Feature Importances:
ATR                        0.422833
Close_Low_Ratio            0.154189
High_Low_Change            0.067171
MACD                       0.059391
Daily_Range                0.051142
Rolling_Std_Dev            0.047133
Upper_Wick      

### Feature Selection for Gold Bonds

In [124]:
# The gold-bond workbook is read as a single DataFrame (default sheet).
gold_bonds_file = '../Data/Gold_Bond_Data.xlsx'
gold_df = pd.read_excel(gold_bonds_file)

print("### Feature Selection for Gold Bonds ###")
# Rank the full candidate list, then keep features scoring above 0.01.
feat_imp_gold = select_features_rf(
    gold_df,
    asset_name_title="Gold Bonds",
)
selected_features_gold = filter_features(feat_imp_gold, threshold=0.01)
gold_bonds_features_dict["Gold Bonds"] = selected_features_gold

### Feature Selection for Gold Bonds ###

Gold Bonds - Feature Importances:
Rolling_Std_Dev            0.401399
ATR                        0.204979
MACD                       0.178902
RSI                        0.101886
Percent_Change_In_Price    0.044597
High_Open_Change           0.012430
High_Low_Change            0.011446
Momentum_Indicator         0.008078
Daily_Range                0.007866
Body_Size                  0.007462
Close_Low_Ratio            0.005208
Close_High_Ratio           0.004761
Upper_Wick                 0.004550
Low_Open_Change            0.004196
Lower_Wick                 0.002238
dtype: float64


### Re-running the model after the feature selection

### For indexes

In [125]:
print("### Feature Selection for Each Index ###")
for sheet_name in all_indexes:
    df = all_indexes[sheet_name]
    asset_identifier = f"Index: {sheet_name}"
    print(f"\nProcessing {asset_identifier}")

    # Refit using only the features already stored for this sheet in
    # index_features_dict; the stored selection itself is left unchanged.
    feat_imp = select_features_rf(
        df,
        asset_name_title=asset_identifier,
        asset_name=sheet_name,
        create_features=False,
        features_dict=index_features_dict,
    )
    # selected_features = filter_features(feat_imp, threshold=0.01)
    # index_features_dict[sheet_name] = selected_features

### Feature Selection for Each Index ###

Processing Index: NSEI

Index: NSEI - Feature Importances:
Daily_Range        0.439364
High_Low_Change    0.435701
ATR                0.104267
MACD               0.020668
dtype: float64

Processing Index: NSEBANK

Index: NSEBANK - Feature Importances:
ATR                0.413346
High_Low_Change    0.319092
Daily_Range        0.236379
MACD               0.031183
dtype: float64

Processing Index: CNXIT

Index: CNXIT - Feature Importances:
ATR                0.762102
High_Low_Change    0.092435
MACD               0.081020
Daily_Range        0.064443
dtype: float64

Processing Index: BSESN

Index: BSESN - Feature Importances:
ATR                0.598352
High_Low_Change    0.217252
Daily_Range        0.171726
MACD               0.012670
dtype: float64

Processing Index: NIFTY_MIDCAP_100

Index: NIFTY_MIDCAP_100 - Feature Importances:
ATR                0.533557
High_Low_Change    0.215126
Daily_Range        0.192442
MACD               0.058876
dtype

### For mutual funds

In [126]:
print("### Feature Selection for Each Mutual Fund ###")
for sheet_name in all_mutual_funds:
    df = all_mutual_funds[sheet_name]
    asset_identifier = f"Mutual Fund: {sheet_name}"
    print(f"\nProcessing {asset_identifier}")

    # Refit using only the features already stored for this sheet in
    # mutual_funds_features_dict; the stored selection is left unchanged.
    feat_imp = select_features_rf(
        df,
        asset_name_title=asset_identifier,
        asset_name=sheet_name,
        create_features=False,
        features_dict=mutual_funds_features_dict,
    )
    # selected_features = filter_features(feat_imp, threshold=0.01)
    # mutual_funds_features_dict[sheet_name] = selected_features

### Feature Selection for Each Mutual Fund ###

Processing Mutual Fund: 0P00005WL6

Mutual Fund: 0P00005WL6 - Feature Importances:
MACD                       0.359896
Rolling_Std_Dev            0.273747
RSI                        0.217224
Percent_Change_In_Price    0.149133
dtype: float64

Processing Mutual Fund: UTINEXT50

Mutual Fund: UTINEXT50 - Feature Importances:
ATR                0.619279
Close_Low_Ratio    0.380721
dtype: float64

Processing Mutual Fund: 0P0000MLHH

Mutual Fund: 0P0000MLHH - Feature Importances:
Rolling_Std_Dev            0.368658
MACD                       0.317058
RSI                        0.197269
Percent_Change_In_Price    0.117016
dtype: float64

Processing Mutual Fund: 0P0000KV39

Mutual Fund: 0P0000KV39 - Feature Importances:
MACD                       0.379647
Rolling_Std_Dev            0.275726
RSI                        0.228243
Percent_Change_In_Price    0.116385
dtype: float64

Processing Mutual Fund: 0P00009J3K

Mutual Fund: 0P00009J3K - Feature 

### For gold bonds

In [127]:
print("### Feature Selection for Gold Bonds ###")
# Verification pass: refit using only the features already stored under
# "Gold Bonds". Like the index and mutual-fund re-run cells, this cell does
# not re-filter or overwrite the stored selection, so executing it again
# cannot shrink gold_bonds_features_dict as a side effect.
feat_imp_gold = select_features_rf(
    gold_df,
    asset_name_title="Gold Bonds",
    asset_name='Gold Bonds',
    create_features=False,
    features_dict=gold_bonds_features_dict,
)
# selected_features_gold = filter_features(feat_imp_gold, threshold=0.01)
# gold_bonds_features_dict["Gold Bonds"] = selected_features_gold

### Feature Selection for Gold Bonds ###

Gold Bonds - Feature Importances:
Rolling_Std_Dev            0.406749
ATR                        0.211501
MACD                       0.182570
RSI                        0.105714
Percent_Change_In_Price    0.049142
High_Low_Change            0.023305
High_Open_Change           0.021020
dtype: float64


## Final feature selection

In [136]:
def create_final_feat_sel(feat_dict):
    """Merge every feature list in ``feat_dict`` into one duplicate-free list.

    Features are returned in first-seen order (iterating the mapping's values
    in insertion order), so the result is identical on every run. Building the
    list from a plain ``set`` would make the order depend on string hashing,
    which varies between interpreter sessions.

    Args:
        feat_dict: Mapping of asset name to an iterable of feature names.

    Returns:
        list: Each distinct feature name exactly once; empty if ``feat_dict``
        is empty or all of its lists are empty.
    """
    # dict keys are unique and keep insertion order, giving an ordered "set".
    ordered_union = {}
    for selected in feat_dict.values():
        ordered_union.update(dict.fromkeys(selected))

    return list(ordered_union)

In [137]:
# Display the combined feature list across every entry of index_features_dict.
create_final_feat_sel(index_features_dict)

['High_Low_Change', 'Daily_Range', 'MACD', 'ATR', 'Rolling_Std_Dev']

In [138]:
# Display the combined feature list across every entry of mutual_funds_features_dict.
create_final_feat_sel(mutual_funds_features_dict)

['Percent_Change_In_Price',
 'High_Low_Change',
 'RSI',
 'Daily_Range',
 'Close_Low_Ratio',
 'MACD',
 'ATR',
 'Rolling_Std_Dev']

In [139]:
# Display the combined feature list across every entry of gold_bonds_features_dict.
create_final_feat_sel(gold_bonds_features_dict)

['Percent_Change_In_Price',
 'High_Low_Change',
 'RSI',
 'MACD',
 'ATR',
 'Rolling_Std_Dev',
 'High_Open_Change']

In [None]:
# df['RSI'] = 100 - (100 / (1 + (df['Close'].diff().clip(lower=0).rolling(window=14).mean() / df['Close'].diff().clip(upper=0).abs().rolling(window=14).mean())))
# df['Daily_Range'] = df['High'] - df['Low']
# df['Close_Low_Ratio'] = (df['Close'] - df['Low']) / df['Low']
# df['MACD'] = df['Close'].ewm(span=12, adjust=False).mean() - df['Close'].ewm(span=26, adjust=False).mean()
# df['ATR'] = df['Daily_Range'].rolling(window=14).mean()
# df['Rolling_Std_Dev'] = df['Close'].rolling(window=14).std()