# Linear Regression

### Content:
- Regressing House Price Change on Covid Mortality Rates
- Same Regression, Controlled for State

In [1]:
#import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Load the merged dataset from its pickle file.
# NOTE(review): this is an absolute, machine-specific path -- it must be adjusted
# before the notebook can run anywhere else.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
MERGED_PICKLE_PATH = '/Users/kimballwightman/Desktop/Projects/Covid & House Prices Analysis/02 Data/Prepared Data/df_merge.pkl'
df = pd.read_pickle(MERGED_PICKLE_PATH)

In [3]:
# Preview the first rows of the merged frame to check the available columns.
df.head()

Unnamed: 0,fips,county,state,deaths,PopTotal,zip_three,index change,state_fips,deaths_pc
0,1001.0,Autauga,Alabama,5683.0,58805,360,16.7625,1,9664.144205
1,1001.0,Autauga,Alabama,5683.0,58805,367,23.985,1,9664.144205
2,1003.0,Baldwin,Alabama,12446.0,231767,364,23.985,1,5370.048368
3,1003.0,Baldwin,Alabama,12446.0,231767,365,34.115,1,5370.048368
4,1005.0,Barbour,Alabama,2035.0,25223,360,16.7625,1,8068.033144


### Regressing House Price Change on Covid Mortality Rates

In [4]:
# Regress 'index change' on 'deaths_pc': build single-column 2-D arrays
# for the regressor (X) and the response (y).
X = df[['deaths_pc']].to_numpy()
y = df[['index change']].to_numpy()

In [5]:
# Add an intercept column to the regressor array (reported as 'const' in the summary).
X = sm.add_constant(X)

In [6]:
# Fit OLS with heteroskedasticity-robust (HC1) standard errors.
ols_model1 = sm.OLS(y, X)
reg1 = ols_model1.fit(cov_type='HC1')

In [7]:
# Show the full regression table for the simple (no state controls) model.
print(reg1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     52.26
Date:                Wed, 10 Aug 2022   Prob (F-statistic):           5.33e-13
Time:                        14:01:47   Log-Likelihood:                -29620.
No. Observations:                7546   AIC:                         5.924e+04
Df Residuals:                    7544   BIC:                         5.926e+04
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.3185      0.199    152.379      0.0

### Same Regression, Controlled for State

In [8]:
# Create one 0/1 indicator column per state FIPS code; drop_first=True omits the
# first category, which then acts as the baseline state in the regression.
# dtype=int keeps the indicators numeric: pandas >= 2.0 otherwise returns bool
# columns, and combining those with the float 'deaths_pc' column via .values
# produces an object array that the regression cannot use.
df_dummies = pd.get_dummies(df, prefix=['State'], columns=['state_fips'], drop_first=True, dtype=int)

In [9]:
# Preview the frame after 'state_fips' has been expanded into State_* indicator columns.
df_dummies.head()

Unnamed: 0,fips,county,state,deaths,PopTotal,zip_three,index change,deaths_pc,State_02,State_04,...,State_46,State_47,State_48,State_49,State_50,State_51,State_53,State_54,State_55,State_56
0,1001.0,Autauga,Alabama,5683.0,58805,360,16.7625,9664.144205,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001.0,Autauga,Alabama,5683.0,58805,367,23.985,9664.144205,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1003.0,Baldwin,Alabama,12446.0,231767,364,23.985,5370.048368,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1003.0,Baldwin,Alabama,12446.0,231767,365,34.115,5370.048368,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1005.0,Barbour,Alabama,2035.0,25223,360,16.7625,8068.033144,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# List all column names so the State_* indicator columns can be identified.
df_dummies.columns

Index(['fips', 'county', 'state', 'deaths', 'PopTotal', 'zip_three',
       'index change', 'deaths_pc', 'State_02', 'State_04', 'State_05',
       'State_06', 'State_08', 'State_09', 'State_10', 'State_11', 'State_12',
       'State_13', 'State_15', 'State_16', 'State_17', 'State_18', 'State_19',
       'State_20', 'State_21', 'State_22', 'State_23', 'State_24', 'State_25',
       'State_26', 'State_27', 'State_28', 'State_29', 'State_30', 'State_31',
       'State_32', 'State_33', 'State_34', 'State_35', 'State_36', 'State_37',
       'State_38', 'State_39', 'State_40', 'State_41', 'State_42', 'State_44',
       'State_45', 'State_46', 'State_47', 'State_48', 'State_49', 'State_50',
       'State_51', 'State_53', 'State_54', 'State_55', 'State_56'],
      dtype='object')

In [12]:
# Design matrix for the state-controlled regression: the mortality rate first,
# followed by every state indicator column.
# The indicators are collected by their 'State_' prefix rather than a hand-typed
# list, so the selection always matches the states present in df_dummies (the
# frame's column order is preserved).
# to_numpy(dtype=float) guarantees a numeric array even when the indicator
# columns are stored as bool.
state_cols = [col for col in df_dummies.columns if col.startswith('State_')]
X = df_dummies[['deaths_pc'] + state_cols].to_numpy(dtype=float)

In [13]:
# Inspect the design matrix: first column is deaths_pc, the remaining columns are the state indicators.
X

array([[9.66414421e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.66414421e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.37004837e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.45476773e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.81782694e+04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.08218777e+03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [14]:
# Add an intercept column to the state-controlled design matrix (reported as 'const' in the summary).
X = sm.add_constant(X)

In [15]:
# Fit the state-controlled OLS model, again with HC1 robust standard errors.
# y is the same response array used for the first regression.
ols_model2 = sm.OLS(y, X)
reg2 = ols_model2.fit(cov_type='HC1')

In [16]:
# Show the full regression table for the model that includes the state indicators.
print(reg2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.654
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     3157.
Date:                Wed, 10 Aug 2022   Prob (F-statistic):               0.00
Time:                        14:02:47   Log-Likelihood:                -25642.
No. Observations:                7546   AIC:                         5.139e+04
Df Residuals:                    7494   BIC:                         5.175e+04
Df Model:                          51                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         25.1518      0.423     59.487      0.0

In [17]:
# Summary statistics for the numeric columns, with every value rendered as a
# string in compact general ('g') format.
summary_stats = df.describe()
summary_stats.apply(lambda col: col.map('{:g}'.format))

Unnamed: 0,deaths,PopTotal,zip_three,index change,deaths_pc
count,7546.0,7546.0,7546.0,7546.0,7546.0
mean,35028.6,200404.0,522.897,29.3264,11405.1
std,115846.0,656575.0,224.109,12.3014,11610.3
min,0.0,64.0,100.0,-6.6175,0.0
25%,748.0,14786.5,335.0,21.715,3668.3
50%,2674.0,37548.0,508.0,25.8475,7430.69
75%,11853.0,132813.0,703.0,33.0038,14705.6
max,1331980.0,10014000.0,999.0,103.92,105261.0
