##### Variance Inflation Factor (VIF)

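VIF measures how much the variance of an estimated regression coefficient is inflated because its predictor is correlated with the other predictors: $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is obtained by regressing predictor $j$ on all the remaining predictors. A VIF greater than 10 is often taken as a sign of problematic multicollinearity.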

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Work on a copy so that nothing below can modify joined_dataset.
X = joined_dataset.copy()

# Keep the 'PCE' column aside so it can be re-attached later.
# Guarded the same way as the drop below: None when the column is absent,
# instead of raising KeyError.
pce = X['PCE'] if 'PCE' in X.columns else None

# Exclude 'PCE' from VIF calculation but keep it in the dataset
# (errors='ignore' makes the drop a no-op copy when 'PCE' is not present).
X_without_PCE = X.drop(columns=['PCE'], errors='ignore')

# Handle missing (NaN) and infinite (inf) values:
# first turn +/-inf into NaN so they cannot distort the column means,
# then impute every NaN with the mean of its column.
X_without_PCE = X_without_PCE.replace([float('inf'), float('-inf')], float('nan'))
X_without_PCE = X_without_PCE.fillna(X_without_PCE.mean())


In [None]:
# Build a table with one VIF per predictor, highest first.
# NOTE(review): the design matrix is passed as-is, without an intercept column;
# confirm the predictors are centred or that this is intended.
predictor_matrix = X_without_PCE.values
vif_scores = [
    variance_inflation_factor(predictor_matrix, column_position)
    for column_position in range(predictor_matrix.shape[1])
]
vif_data = pd.DataFrame(
    {"feature": X_without_PCE.columns, "VIF": vif_scores}
).sort_values(by='VIF', ascending=False)
print(vif_data)


                                               feature         VIF
48                 Personal Cons. Expend.: Chain Index  945.259947
41                                 PPI: Finished Goods  576.456603
42                        PPI: Finished Consumer Goods  512.761422
51                        Personal Cons. Exp: Services  422.030946
16           All Employees: Goods-Producing Industries  248.483816
50                Personal Cons. Exp: Nondurable goods  227.218059
15                        All Employees: Total nonfarm  217.046088
57                              Secondary_Sector_Index  209.832962
52               Avg Hourly Earnings : Goods-Producing  168.465171
28           S&P s Common Stock Price Index: Composite  120.265499
29         S&P s Common Stock Price Index: Industrials  116.082784
5                                             IP Index   83.667152
47                                     CPI : All Items   82.649983
63                                Final_Products_Index   80.97

In [None]:

# Iteratively prune multicollinear predictors: on every pass, recompute the VIF
# of each candidate column and drop the one with the highest VIF, until no
# candidate exceeds the threshold.

# VIF above this value is treated as problematic multicollinearity.
VIF_THRESHOLD = 10

# Columns listed here are never dropped. They are also left out of the design
# matrix used for the VIF regressions, so they do not influence other columns' VIFs.
ignored_columns = []
#ignored_columns = ["Civilian Unemployment Rate","Civilian Employment","Housing Starts: Total New Privately Owned","Avg Weekly Hours : Goods-Producing",
#                  "Avg Weekly Hours : Manufacturing","Personal Cons. Expend.: Chain Index","Civilian Labor Force","All Employees: Total nonfarm"]  # Add any other columns you want to ignore in this list

while True:
    # Candidates are the remaining columns that are not protected.
    candidate_columns = [column for column in X_without_PCE.columns if column not in ignored_columns]

    # Extract the design matrix once per pass instead of once per column.
    design_matrix = X_without_PCE[candidate_columns].values
    vif_data = pd.DataFrame({
        "feature": candidate_columns,
        "VIF": [variance_inflation_factor(design_matrix, i) for i in range(len(candidate_columns))],
    })

    # Stop when nothing exceeds the threshold. This also covers an empty
    # candidate list or NaN VIFs, because `NaN > threshold` is False.
    max_vif = vif_data["VIF"].max()
    if not max_vif > VIF_THRESHOLD:
        break

    # Candidates never contain ignored columns, so the worst offender can
    # always be dropped; no separate "ignored" branch is needed.
    feature_to_drop = vif_data.loc[vif_data["VIF"].idxmax(), "feature"]
    X_without_PCE = X_without_PCE.drop(columns=[feature_to_drop])
    print(f"Dropping {feature_to_drop} with VIF: {max_vif}")



Dropping Personal Cons. Expend.: Chain Index with VIF: 945.2599469950171
Dropping PPI: Finished Goods with VIF: 571.6269822542896
Dropping All Employees: Goods-Producing Industries with VIF: 243.69387620699257
Dropping All Employees: Total nonfarm with VIF: 208.15206109763997
Dropping Avg Hourly Earnings : Goods-Producing with VIF: 151.5031443172232
Dropping S&P s Common Stock Price Index: Composite with VIF: 116.19512062415967
Dropping IP Index with VIF: 76.96138342360842
Dropping Moody s Aaa Corporate Bond Minus FEDFUNDS with VIF: 66.26520811106896
Dropping CPI : All Items with VIF: 63.90356767364602
Dropping Final_Products_Index with VIF: 56.67837591607776
Dropping Civilian Employment with VIF: 36.838458475666975
Dropping Monetary_Aggregates_Index with VIF: 24.045518416428898
Dropping Long_Term_Rate_Index with VIF: 19.30531102795037
Dropping PPI: Finished Consumer Goods with VIF: 15.256176381812494
Dropping Retail and Food Services Sales with VIF: 14.430489209931803
Dropping Persona

In [None]:

# update the refined_dataset
# Add 'PCE' back to the dataset if it was removed initially
# refined_dataset = pd.concat([X_without_PCE, pce], axis=1) 
# refined_dataset.head()