In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

## Multicollinearity

Multicollinearity refers to a situation in which two or more explanatory variables in a multiple regression model are highly correlated with one another.


## Variance inflation factor (VIF)

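$$
VIF(\hat{\beta}_j) = \frac{1}{1 - R_{X_j|X_{-j}}^2}
$$

where $R_{X_j|X_{-j}}^2$ is the $R^2$ obtained by regressing the predictor $X_j$ on all the other predictors $X_{-j}$.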

VIF is a measure of how much the variance of the estimated regression coefficient $\hat{\beta}_j$ is "inflated" by the existence of correlation among the predictor variables in the model.

**As a rule of thumb, a VIF above 10 is considered problematic.**

In [23]:
tickers = ('CL=F', 'XOM', 'CVX', 'BP', 'COP', 'EOG')

names = ['WTI', 'Exxon', 'Chevron', 'BP', 'ConocoPhillips', 'EOG']

# Download two years of prices and relabel the ticker columns with readable names.
# auto_adjust=False is passed explicitly so the 'Adj Close' column is present
# regardless of the installed yfinance default for that option.
close_df = yf.download(tickers, period='2Y', auto_adjust=False)['Adj Close'].rename(columns=dict(zip(tickers, names)))

# Log returns: first difference of log prices; rows with any missing value are dropped.
returns_df = np.log(close_df).diff().dropna()

# Regress WTI returns on the returns of the remaining columns.
y = returns_df['WTI']
X = returns_df.drop(columns='WTI')

model = LinearRegression().fit(X, y)

# summary
print('Intercept: ', model.intercept_)
print('Coefficients: ', model.coef_)
print('R^2: ', model.score(X, y))

# VIF: regress each predictor on the other predictors and report 1 / (1 - R^2).
# Dedicated loop-local names are used so that `y` and `model` above keep
# referring to the WTI regression after this cell has run.
for col in X.columns:
    target = X[col]
    others = X.drop(columns=col)
    aux_model = LinearRegression().fit(others, target)
    print(col, 1/(1 - aux_model.score(others, target)))



[*********************100%%**********************]  6 of 6 completed
Intercept:  -0.0007803560085233736
Coefficients:  [ 0.13789318  0.12268835 -0.07957984  0.17178929  0.53835575]
R^2:  0.4351120694764826
BP 2.5043445930585744
ConocoPhillips 7.03989492746011
Chevron 4.765221080845815
EOG 5.220742719434334
Exxon 5.446449193345612
