In [45]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np

In addition to applying PCA to reduce dimensionality, we also perform Elastic Net feature selection on the original feature groups.  


To avoid overloading the feature selection step with redundant or narrowly constructed signals, we select only a representative subset of technical indicators that capture the two main concepts: price momentum and short-term volatility positioning.  
This allows Elastic Net to focus on the essential technical drivers without being overwhelmed by highly collinear variants of the same indicators.

In [46]:
# load data
# Rows containing any missing value are dropped up front, so every feature
# group below is fitted on the same set of complete observations.
df = pd.read_csv('shortened_oil_data.csv')
df = df.dropna().copy()
print(df.shape)

# Regression target shared by all three Elastic Net runs.
y = df['Crude_Oil']

# 1. fundamental features
# Taken by position (columns 1..23); the target is filtered out in case it
# falls inside that range.
# NOTE(review): the slice bounds assume a fixed column order in the CSV --
# confirm against the file if columns are ever added or reordered.
fundamental_cols = [col for col in df.columns[1:24] if col != 'Crude_Oil']

# 2. technical features
# Hand-picked subset of technical indicators (see the markdown cell above),
# referenced by name rather than by position.
tech_ENet = [
    'Dist_SMA_20', 'Dist_SMA_50',
    'BB_Position',
    'MACD_Hist',
    'Ret_1d_Lag1', 'Ret_1d_Lag2',
    'RSI_14_Lag1', 'MACD_Hist_Lag1'
]

# 3. EIA weekly features
# Everything from column 58 to the end of the frame.
# NOTE(review): same positional assumption as the fundamental slice -- confirm.
eia_cols = df.columns[58:]
print("Number of EIA features:", len(eia_cols))

(1168, 468)
Number of EIA features: 410


In [47]:
def ENet_feature_selection(X, y, feature_names, group_name, l1_ratios=[0.1,0.3,0.5,0.7,0.9,1], alphas=None):
    """Select features for one feature group with a cross-validated Elastic Net.

    The features are standardised, an ``ElasticNetCV`` model is fitted over a
    grid of ``alpha`` and ``l1_ratio`` values, and every feature whose fitted
    coefficient is non-zero is reported as selected.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix for this group; columns must line up with
        ``feature_names``.
    y : array-like
        Regression target.
    feature_names : sequence of str
        Labels used as the index of the returned coefficient Series.
    group_name : str
        Human-readable name of the feature group, shown in the printed header
        so that the output of successive calls can be told apart.
    l1_ratios : float or list of float, optional
        Candidate ``l1_ratio`` values handed to ``ElasticNetCV``. The default
        list is only read, never mutated.
    alphas : array-like, optional
        Candidate regularisation strengths. Defaults to 50 log-spaced values
        between 1e-4 and 1.

    Returns
    -------
    pandas.Series
        Non-zero coefficients indexed by feature name, ordered by decreasing
        absolute value. Coefficients refer to the standardised features.
    """
    if alphas is None:
        alphas = np.logspace(-4, 0, 50)

    # Standardise so the penalty treats all features on a common scale.
    # NOTE(review): the scaler is fitted on all rows before cross-validation,
    # so the CV folds are not fully independent of each other.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    enet = ElasticNetCV(
        l1_ratio=l1_ratios,  # honour the caller's grid (was hard-coded here)
        alphas=alphas,
        cv=5,
        max_iter=300000,
        tol=1e-3,
        n_jobs=-1,
        random_state=42,
    ).fit(X_scaled, y)

    # Keep only the features the penalty did not shrink to exactly zero,
    # strongest (in absolute value) first.
    enet_coef = pd.Series(enet.coef_, index=feature_names)
    enet_selected = enet_coef[enet_coef != 0].sort_values(key=np.abs, ascending=False)

    print(f"\nElastic Net Results ({group_name}):")
    print(f"Optimal alpha: {enet.alpha_:.6f}, Optimal l1_ratio: {enet.l1_ratio_}")
    print(f"Number of selected features: {len(enet_selected)}")
    print(enet_selected)

    return enet_selected


In [48]:
# Elastic Net selection restricted to the fundamental feature group.
X_fund = df.loc[:, fundamental_cols]
enet_fund = ENet_feature_selection(
    X=X_fund,
    y=y,
    feature_names=fundamental_cols,
    group_name="Fundamental Indicators",
)


Elastic Net Results:
Optimal alpha: 0.002947, Optimal l1_ratio: 1.0
Number of selected features: 18
Gasoline               10.306603
Crack_Spread_321       -7.741170
Heating_Oil             6.614359
Brent_Oil               2.357834
Gold_Oil_Ratio          1.067023
Gold                   -1.014846
Service_Oil_Ratio      -0.567969
Oil_Services            0.431667
SP500                   0.291657
DXY                    -0.159929
US10Y                   0.091786
Transportation         -0.051580
USD_CAD                 0.046595
Transport_Oil_Ratio    -0.038543
Oil_VIX                -0.020034
Copper                  0.015415
Natural_Gas             0.008313
Junk_Bond              -0.001173
dtype: float64


In [49]:
# Elastic Net selection restricted to the hand-picked technical indicators.
X_tech = df.loc[:, tech_ENet]
enet_tech = ENet_feature_selection(
    X=X_tech,
    y=y,
    feature_names=tech_ENet,
    group_name="Technical Indicators",
)


Elastic Net Results:
Optimal alpha: 0.390694, Optimal l1_ratio: 0.9
Number of selected features: 3
Dist_SMA_50       4.745272
MACD_Hist_Lag1   -0.629159
MACD_Hist        -0.380273
dtype: float64


In [50]:
# Elastic Net selection restricted to the EIA weekly feature group.
X_eia = df.loc[:, eia_cols]
enet_eia = ENet_feature_selection(
    X=X_eia,
    y=y,
    feature_names=eia_cols,
    group_name="EIA Weekly Indicators",
)


Elastic Net Results:
Optimal alpha: 1.000000, Optimal l1_ratio: 1.0
Number of selected features: 28
US_Stocks_Ex_SPR_Crude_PetProd                           -3.802344
US_Crude_SPR_StkChg                                      -2.535021
PADD4_Stocks_Distillate_GT_MedS                           2.258129
PADD3_Stocks_Distillate_GT_MedS                          -1.434144
4W_US_Imp_Residual                                        1.327335
4W_PADD4_Blender_NetProd_FinGas                           1.091354
US_SupAdj_Petroleum                                       0.875280
4W_PADD3_Commercial_Crude_Imp_Ex_SPR                      0.869553
Central_Atlantic_Stocks_Distillate_LowS                  -0.605241
4W_PADD1_RefBl_NetProd_Residual                           0.573774
4W_US_Imp_Crude_PetProd                                   0.517437
PADD5_Stocks_TotGas                                       0.507634
4W_US_Imp_Crude                                           0.483404
PADD2_Stocks_Ex_SPR_Crude   

In [None]:
# Summary of the Elastic Net selection, one row per feature group.
#
# The alpha / l1_ratio figures are transcribed by hand from the printed output
# of the three selection cells, because the selection function returns only
# the selected coefficients. Re-check them whenever those cells are re-run.
group_results = [
    # (label, candidate columns, selected coefficients, alpha, l1_ratio)
    ("Fundamental", fundamental_cols, enet_fund, 0.002947, 1.0),
    ("Technical", tech_ENet, enet_tech, 0.390694, 0.9),
    ("EIA Weekly", eia_cols, enet_eia, 1.0, 1.0),
]

summary_data = [
    {
        "Feature Group": label,
        "Original Features": len(candidates),
        "Selected Features": len(selected),
        "Optimal Alpha": round(alpha, 6),
        "Optimal L1 Ratio": l1_ratio,
        # Selected coefficients are already ordered by |coefficient|, so the
        # first five index labels are the strongest features.
        "Top Selected Features": ", ".join(selected.index[:5]),
    }
    for label, candidates, selected, alpha, l1_ratio in group_results
]

summary_table = pd.DataFrame(summary_data)
summary_table


Unnamed: 0,Feature Group,Original Features,Selected Features,Optimal Alpha,Optimal L1 Ratio,Top Selected Features
0,Fundamental,22,18,0.002947,1.0,"Gasoline, Crack_Spread_321, Heating_Oil, Brent..."
1,Technical,8,3,0.390694,0.9,"Dist_SMA_50, MACD_Hist_Lag1, MACD_Hist"
2,EIA Weekly,410,28,1.0,1.0,"US_Stocks_Ex_SPR_Crude_PetProd, US_Crude_SPR_S..."


In [55]:
# Convert index objects to actual Python lists
fund_selected_cols = list(enet_fund.index)
tech_selected_cols = list(enet_tech.index)
eia_selected_cols = list(enet_eia.index)

all_selected_cols = fund_selected_cols + tech_selected_cols + eia_selected_cols
print("Total selected features:", len(all_selected_cols))
df_selected = df[["Date", "Crude_Oil"] + all_selected_cols].copy()
df_selected.head()


Total selected features: 49


Unnamed: 0,Date,Crude_Oil,Gasoline,Crack_Spread_321,Heating_Oil,Brent_Oil,Gold_Oil_Ratio,Gold,Service_Oil_Ratio,Oil_Services,...,US_Blender_NetProd_Gas_Finished_Conv_Greater_Than_Ed55,PADD2_Stocks_Distillate_LowS,PADD3_Stocks_Distillate_Greater_Than_HighS,PADD1_RefBl_NetProd_Residual,4W_US_Imp_Distillate_GT_2000_ppm_Sulfur,US_Ref_NetProd_Kerosene,US_Stocks_Ex_SPR_Crude,New_England_Stocks_TotGas,PADD4_RefBl_NetProd_FinGas,4W_US_Crude_SPR_StkChg
60,2021-03-31,59.16,1.9533,20.3306,1.7713,63.540001,28.968899,1713.800049,3.059359,180.991669,...,7.0,28137.0,4388.0,37.0,0.0,-5.0,501835.0,4050.0,311.0,0.0
61,2021-04-01,61.450001,2.0223,20.816799,1.8316,64.860001,28.096013,1726.5,3.032566,186.351166,...,7.0,28137.0,4388.0,37.0,0.0,-5.0,501835.0,4050.0,311.0,0.0
62,2021-04-05,58.650002,1.9611,21.074398,1.7724,62.150002,29.445865,1727.0,3.095488,181.550369,...,6.0,28493.0,4240.0,33.0,0.0,4.0,498313.0,3819.0,311.0,0.0
63,2021-04-06,59.330002,1.9663,20.843799,1.7941,62.740002,29.352772,1741.5,3.032239,179.90274,...,6.0,28493.0,4240.0,33.0,0.0,4.0,498313.0,3819.0,311.0,0.0
64,2021-04-07,59.77,1.9518,20.190999,1.8079,63.16,29.113267,1740.099976,3.021324,180.584518,...,6.0,28493.0,4240.0,33.0,0.0,4.0,498313.0,3819.0,311.0,0.0


In [56]:
# Persist the reduced dataset; the row index is not written to the file.
output_path = "enet_oil_data.csv"
df_selected.to_csv(path_or_buf=output_path, index=False)

print(f"Saved ENet-selected feature dataset to: {output_path}")

Saved ENet-selected feature dataset to: enet_oil_data.csv
