In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Variable Selection

In [2]:
df_numeric = pd.read_csv('data/df_numeric.csv')

In [3]:
df_numeric

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,...,MoSold,YrSold,SalePrice,GarageYrBlt_missing_ind,LotFrontage_missing_ind,MasVnrArea_missing_ind,1stFlrSF_log,1stFlr_2ndFlr_SF,OverallGrade,SimplGarageQual
0,60,65.0,8450,2,4,4,3,7,5,2003,...,2,2008,208500,0,0,0,6.752270,1710,35,1
1,20,80.0,9600,2,4,4,3,6,8,1976,...,5,2007,181500,0,0,0,7.140453,1262,48,1
2,60,68.0,11250,2,3,4,3,7,5,2001,...,9,2008,223500,0,0,0,6.824374,1786,35,1
3,70,60.0,9550,2,3,4,3,7,5,1915,...,2,2006,140000,0,0,0,6.867974,1717,35,1
4,60,84.0,14260,2,3,4,3,8,5,2000,...,12,2008,250000,0,0,0,7.043160,2198,40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,60,62.0,7917,2,4,4,3,6,5,1999,...,8,2007,175000,0,0,0,6.859615,1647,30,1
1454,20,85.0,13175,2,4,4,3,6,6,1978,...,2,2010,210000,0,0,0,7.636752,2073,36,1
1455,70,66.0,9042,2,4,4,3,7,9,1941,...,5,2010,266500,0,0,0,7.080026,2340,63,1
1456,20,68.0,9717,2,4,4,3,5,6,1950,...,4,2010,142125,0,0,0,6.982863,1078,30,1


In [4]:
y = df_numeric.SalePrice
df_numeric.drop("SalePrice",axis=1, inplace=True)

In [5]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_numeric)

In [6]:
# columns we have selected
# get_support() is method of VarianceThreshold and stores boolean of each variable in the numpy array.
selected_columns = df_numeric.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_transformed = pd.DataFrame(df_transformed, columns = selected_columns)

In [7]:
# step 1
df_corr = df_transformed.corr().abs()

# step 2
indices = np.where(df_corr > 0.8) 
indices = [(df_corr.index[x], df_corr.columns[y]) 
for x, y in zip(*indices)
    if x != y and x < y]

# step 3
for idx in indices: #each pair
    try:
        df_transformed.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

In [8]:
print(indices)

[('TotalBsmtSF', '1stFlrSF'), ('GrLivArea', 'TotRmsAbvGrd'), ('GrLivArea', '1stFlr_2ndFlr_SF'), ('TotRmsAbvGrd', '1stFlr_2ndFlr_SF'), ('GarageCars', 'GarageArea'), ('GarageQual', 'GarageCond')]


In [9]:
from sklearn.feature_selection import f_regression, SelectKBest
skb = SelectKBest(f_regression, k=10)
X = skb.fit_transform(df_transformed, y)

In [10]:
# this will give us the position of top 10 columns
skb.get_support()
# column names
df_transformed.columns[skb.get_support()]
X = pd.DataFrame(X,columns=df_transformed.columns[skb.get_support()])

# Linear Regression

In [11]:
X = sm.add_constant(X) # adding a constant

In [12]:
# create a Python object that represents linear regression

lin_reg = sm.OLS(y,X)

In [13]:
# fit with training data

model = lin_reg.fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     732.0
Date:                Wed, 22 Jun 2022   Prob (F-statistic):               0.00
Time:                        12:30:44   Log-Likelihood:                -17206.
No. Observations:                1458   AIC:                         3.443e+04
Df Residuals:                    1447   BIC:                         3.449e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -8.992e+05   8.93e+04    -10.069   

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
regressor = LinearRegression()
regressor.fit(X, y)

In [16]:
print(regressor.coef_)

[     0.           5507.54189138    392.2863556   14466.78601472
    920.78618122     42.13854481     66.85496149 -11218.59562134
  11469.89475761   9314.43585305   1078.19597724]


In [17]:
regressor.score(X,y)

0.8349492071391903