### 0. Data

In [1]:
import pandas as pd

In [2]:
house = pd.read_excel("http://byungwan.com/class/House_Prices.xls")

In [3]:
house.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East


In [4]:
# create dummies using pandas
house2 = pd.get_dummies(house, ["Brick","Neighborhood"], drop_first = True, dtype = int)

In [5]:
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1,114300,1790,2,2,2,0,0,0
1,2,114200,2030,4,2,3,0,0,0
2,3,114800,1740,3,2,1,0,0,0
3,4,94700,1980,3,2,3,0,0,0
4,5,119800,2130,3,3,3,0,0,0


In [6]:
# simple regression -> 1 independent variable, multiple regression -> multiple independent variable

### 1. Simple linear regression

In [7]:
y = house2.iloc[:, 1] # Price

In [8]:
x = house2.iloc[:, 2] # SqFt

In [9]:
import statsmodels.api as sm

In [10]:
reg = sm.OLS(y,x).fit()

In [11]:
print(reg.summary())

# Adj. R-square is a penalty for using multiple variables

                                 OLS Regression Results                                
Dep. Variable:                  Price   R-squared (uncentered):                   0.972
Model:                            OLS   Adj. R-squared (uncentered):              0.972
Method:                 Least Squares   F-statistic:                              4390.
Date:                Thu, 25 Sep 2025   Prob (F-statistic):                   2.30e-100
Time:                        11:04:53   Log-Likelihood:                         -1463.3
No. Observations:                 128   AIC:                                      2929.
Df Residuals:                     127   BIC:                                      2932.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
# p-value interpretation : checking our hypothesis
# estimate the beta -> find a linear funcation that can explain the relationship / minimize the SSE

# usually use 0.05 as a threshold, so if p-value is less than that, we can say the number has statistical significance
# in this case, we can trust that number -> have significant relationship

In [13]:
# There in so alpha in this reg.summary (no constant)

### 1.1 w/ constant

In [14]:
x_c = sm.add_constant(x)

In [15]:
reg2 = sm.OLS(y,x_c).fit()
print(reg2.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.306
Model:                            OLS   Adj. R-squared:                  0.300
Method:                 Least Squares   F-statistic:                     55.50
Date:                Thu, 25 Sep 2025   Prob (F-statistic):           1.30e-11
Time:                        11:04:53   Log-Likelihood:                -1463.2
No. Observations:                 128   AIC:                             2930.
Df Residuals:                     126   BIC:                             2936.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.009e+04    1.9e+04     -0.532      0.5

### 2. Multiple linear regression

In [19]:
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1,114300,1790,2,2,2,0,0,0
1,2,114200,2030,4,2,3,0,0,0
2,3,114800,1740,3,2,1,0,0,0
3,4,94700,1980,3,2,3,0,0,0
4,5,119800,2130,3,3,3,0,0,0


In [20]:
y = house2.iloc[:, 1]

In [21]:
x2 = house2.iloc[:, 2:]

In [22]:
x2.head()

Unnamed: 0,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1790,2,2,2,0,0,0
1,2030,4,2,3,0,0,0
2,1740,3,2,1,0,0,0
3,1980,3,2,3,0,0,0
4,2130,3,3,3,0,0,0


In [23]:
reg3 = sm.OLS(y, x2).fit()

In [24]:
print(reg3.summary())

                                 OLS Regression Results                                
Dep. Variable:                  Price   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.994
Method:                 Least Squares   F-statistic:                              3239.
Date:                Thu, 25 Sep 2025   Prob (F-statistic):                   2.09e-134
Time:                        11:06:55   Log-Likelihood:                         -1356.7
No. Observations:                 128   AIC:                                      2727.
Df Residuals:                     121   BIC:                                      2747.
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [31]:
# Those house as Brick_Yes is more expensive by (1.73e+04)
# Houses in North is not different from those that are from the East bc it is not significant

In [26]:
# no missing, no text data, check R-square, check p-values, interpret the coefficient

In [32]:
# Usually including more variable is better -> increase explanatory power, the accuracy will increase

In [33]:
# Based on our logic -> why some houses are more expensive than others
# Data-driven model -> let the model tell us or not / start from the scratch

In [29]:
# Logic-driven -> maybe the data is not available..
# start from data-driven -> see what is available -> include what seems to be important

In [30]:
# at least 30 obs to do statistical testing

In [34]:
# In order to run linear regression, all the assumption should be met

### 4.Multicollinearity

In [36]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [39]:
x2.values
# .values -> back to array

array([[1790,    2,    2,    2,    0,    0,    0],
       [2030,    4,    2,    3,    0,    0,    0],
       [1740,    3,    2,    1,    0,    0,    0],
       [1980,    3,    2,    3,    0,    0,    0],
       [2130,    3,    3,    3,    0,    0,    0],
       [1780,    3,    2,    2,    0,    1,    0],
       [1830,    3,    3,    3,    1,    0,    1],
       [2160,    4,    2,    2,    0,    0,    1],
       [2110,    4,    2,    3,    0,    0,    0],
       [1730,    3,    3,    3,    0,    0,    0],
       [2030,    3,    2,    3,    1,    0,    0],
       [1870,    2,    2,    2,    1,    0,    0],
       [1910,    3,    2,    4,    0,    1,    0],
       [2150,    3,    3,    5,    1,    1,    0],
       [2590,    4,    3,    4,    0,    0,    1],
       [1780,    4,    2,    1,    0,    0,    1],
       [2190,    3,    3,    4,    1,    0,    0],
       [1990,    3,    3,    4,    0,    1,    0],
       [1700,    2,    2,    1,    1,    0,    0],
       [1920,    3,    3,    2,

In [43]:
vif(x2.values, 0)

np.float64(57.60775699450441)

In [45]:
x2.shape[1]

7

In [53]:
vif_factors = [vif(x2.values, i) for i in range(0, x2.shape[1])]
vif_factors

[np.float64(57.60775699450441),
 np.float64(31.028044452273534),
 np.float64(35.632901553298055),
 np.float64(10.591655827582377),
 np.float64(1.6377236928141587),
 np.float64(2.121364879232265),
 np.float64(2.4362969178112834)]

In [54]:
x2.columns

Index(['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Brick_Yes',
       'Neighborhood_North', 'Neighborhood_West'],
      dtype='object')

In [55]:
pd.DataFrame(vif_factors)

Unnamed: 0,0
0,57.607757
1,31.028044
2,35.632902
3,10.591656
4,1.637724
5,2.121365
6,2.436297


In [56]:
pd.DataFrame({"Variable": x2.columns, "VIF Factor": vif_factors})

Unnamed: 0,Variable,VIF Factor
0,SqFt,57.607757
1,Bedrooms,31.028044
2,Bathrooms,35.632902
3,Offers,10.591656
4,Brick_Yes,1.637724
5,Neighborhood_North,2.121365
6,Neighborhood_West,2.436297
