$$ TAX = \alpha _0 + \alpha _1 RM + \alpha_2 NOX + ... + \alpha _{12}LSTAT $$
$$ VIF _{TAX} = \frac{1}{(1 - R _{TAX} ^2)} $$

Step 1:
    TAX is regressed on all of the other features:
        $$ TAX = \alpha _0 + \alpha _1 RM + \alpha_2 NOX + ... + \alpha _{12}LSTAT $$

Step 2: The R-squared of that regression is used to calculate the <b>Variance Inflation Factor (VIF)</b>:
$$ VIF _{TAX} = \frac{1}{(1 - R _{TAX} ^2)} $$

In [3]:
# Load the Boston dataset bundled with scikit-learn; later cells read its
# .data, .feature_names and .target attributes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older release, otherwise this import fails.
from sklearn.datasets import load_boston
boston_dataset = load_boston()

In [6]:
import pandas as pd
# Build the feature table from the dataset arrays and append the target as a PRICE column.
data = (
    pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)
    .assign(PRICE=boston_dataset.target)
)

In [8]:
import numpy as np
# Natural log of PRICE; this series is the target passed to the train/test split.
prices = np.log(data['PRICE'])

In [9]:
# Everything except the PRICE column is an explanatory feature.
features = data.drop(columns='PRICE')

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, prices,
                                                    test_size=0.2, random_state=10)

In [16]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [17]:
# Prepend the intercept column (`const`) to the training features; the VIFs are computed on this matrix.
X_incl_const = sm.add_constant(X_train)

In [20]:
# Check the container type: add_constant handed back a DataFrame, so column labels are still available.
type(X_incl_const)

pandas.core.frame.DataFrame

In [21]:
# .values exposes the underlying NumPy array — the form passed as `exog` in the VIF calls.
type(X_incl_const.values)

numpy.ndarray

In [22]:
# VIF for the column at position 1 of the design matrix (CRIM; position 0 is the added constant).
variance_inflation_factor(exog=X_incl_const.values, exog_idx=1)

1.7145250443932485

In [31]:
# Number of design-matrix columns (constant + features) — one VIF is computed per column.
len(X_incl_const.columns)

14

In [33]:
# Print every column label of the design matrix, one per line, in positional order.
for column_label in X_incl_const.columns:
    print(column_label)

const
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT


In [27]:
# (rows, columns) of the design matrix.
X_incl_const.shape

(404, 14)

In [28]:
# The second entry of .shape is the column count — an alternative to len(X_incl_const.columns).
X_incl_const.shape[1]

14

In [34]:
# List the column labels by position, taking the loop bound from the frame's shape.
n_columns = X_incl_const.shape[1]
for col_pos in range(n_columns):
    print(X_incl_const.columns[col_pos])

const
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT


In [37]:
# Print the VIF of every design-matrix column (constant included), in column order.
design_matrix = X_incl_const.values
for col_pos in range(len(X_incl_const.columns)):
    col_vif = variance_inflation_factor(exog=design_matrix, exog_idx=col_pos)
    print(col_vif)

597.5487126763895
1.7145250443932485
2.3328224265597584
3.943448822674636
1.0788133385000578
4.410320817897635
1.8404053075678568
3.3267660823099408
4.222923410477865
7.314299817005058
8.508856493040817
1.839911632651406
1.3386713255364715
2.812544292793034


In [38]:
# Print one VIF per design-matrix column, with the loop bound taken from the frame's shape.
n_columns = X_incl_const.shape[1]
for col_pos in range(n_columns):
    col_vif = variance_inflation_factor(exog=X_incl_const.values, exog_idx=col_pos)
    print(col_vif)

597.5487126763895
1.7145250443932485
2.3328224265597584
3.943448822674636
1.0788133385000578
4.410320817897635
1.8404053075678568
3.3267660823099408
4.222923410477865
7.314299817005058
8.508856493040817
1.839911632651406
1.3386713255364715
2.812544292793034


In [40]:
# Collect one VIF per design-matrix column, in column order (constant first).
vif = []
n_columns = X_incl_const.shape[1]
for col_pos in range(n_columns):
    col_vif = variance_inflation_factor(exog=X_incl_const.values, exog_idx=col_pos)
    vif.append(col_vif)


In [44]:
# Show the collected VIFs; entries follow the order of X_incl_const.columns.
vif

[597.5487126763895,
 1.7145250443932485,
 2.3328224265597584,
 3.943448822674636,
 1.0788133385000578,
 4.410320817897635,
 1.8404053075678568,
 3.3267660823099408,
 4.222923410477865,
 7.314299817005058,
 8.508856493040817,
 1.839911632651406,
 1.3386713255364715,
 2.812544292793034]

In [46]:
# Same VIF list as the explicit loop, written as a list comprehension.
vif = [
    variance_inflation_factor(exog=X_incl_const.values, exog_idx=col_pos)
    for col_pos in range(X_incl_const.shape[1])
]

In [45]:
# Show the VIF list again to compare with the loop-built version.
# NOTE(review): this cell's execution count (45) is lower than the comprehension cell's (46),
# so the saved output predates that cell's last run — re-run top to bottom to confirm.
vif

[597.5487126763895,
 1.7145250443932485,
 2.3328224265597584,
 3.943448822674636,
 1.0788133385000578,
 4.410320817897635,
 1.8404053075678568,
 3.3267660823099408,
 4.222923410477865,
 7.314299817005058,
 8.508856493040817,
 1.839911632651406,
 1.3386713255364715,
 2.812544292793034]

In [48]:
# Pair each design-matrix column label with its VIF, rounded to two decimals, as a table.
pd.DataFrame(
    {
        'coef_name': X_incl_const.columns,
        'vif': np.around(vif, decimals=2),
    }
)

Unnamed: 0,coef_name,vif
0,const,597.55
1,CRIM,1.71
2,ZN,2.33
3,INDUS,3.94
4,CHAS,1.08
5,NOX,4.41
6,RM,1.84
7,AGE,3.33
8,DIS,4.22
9,RAD,7.31


### As a rule of thumb, any feature with a VIF over 10 is considered problematic (the `const` row is the intercept, not a feature, so its large value is ignored).