# First, reset the Python namespace and load pandas and statsmodels

In [1]:
# Clear the interactive namespace so the notebook starts from a clean state;
# -f skips the confirmation prompt.
%reset -f

In [2]:
import pandas          as pd
import statsmodels.api as sm

# Question 1

## load data into Python

In [3]:
# Read the CSV and discard every row that has a missing value in any column,
# in a single chained expression.
data = pd.read_csv('econmath.csv').dropna()

## check the first 5 observations of the data

In [4]:
# Display the first five rows (5 is the default, spelled out here).
data.head(n=5)

Unnamed: 0,age,work,study,econhs,colgpa,hsgpa,acteng,actmth,act,mathscr,male,calculus,attexc,attgood,fathcoll,mothcoll,score
0,23,15.0,10.0,0,3.4909,3.355,24.0,26.0,27.0,10,1,1,0,0,1,1,84.43
1,23,0.0,22.5,1,2.1,3.219,23.0,20.0,24.0,9,1,0,0,0,0,1,57.380001
2,21,25.0,12.0,0,3.0851,3.306,21.0,24.0,21.0,8,1,1,1,0,0,1,66.389999
3,22,30.0,40.0,0,2.6805,3.977,31.0,28.0,31.0,10,0,1,0,1,1,1,81.150002
4,22,25.0,15.0,1,3.7454,3.89,28.0,31.0,32.0,8,1,1,0,1,0,1,95.900002


## list all the variable names of the data

In [5]:
# Column labels of the cleaned frame as a plain Python list.
data.columns.tolist()

['age',
 'work',
 'study',
 'econhs',
 'colgpa',
 'hsgpa',
 'acteng',
 'actmth',
 'act',
 'mathscr',
 'male',
 'calculus',
 'attexc',
 'attgood',
 'fathcoll',
 'mothcoll',
 'score']

## compute the min, max, mean and sd of 3 specific variables

### specify which variable you want to compute the summary statistics on

In [6]:
# Columns for which the summary statistics below are computed.
var_name = [
    'actmth',
    'acteng',
    'score',
]

### mean

In [7]:
# Column-wise mean of the selected variables.
data.loc[:, var_name].mean(axis=0)

actmth    23.211302
acteng    22.594595
score     72.608734
dtype: float64

### sd

In [8]:
# Column-wise sample standard deviation (ddof=1 is the pandas default).
data.loc[:, var_name].std(ddof=1)

actmth     3.773354
acteng     3.788735
score     13.304494
dtype: float64

### min

In [9]:
# Column-wise minimum of the selected variables.
data.loc[:, var_name].min(axis=0)

actmth    12.000000
acteng    12.000000
score     20.309999
dtype: float64

### max

In [10]:
# Column-wise maximum of the selected variables.
data.loc[:, var_name].max(axis=0)

actmth    36.000000
acteng    34.000000
score     98.440002
dtype: float64

## estimate the regression model 

## $$ score = \beta_0 + \beta_1 \cdot colgpa + \beta_2 \cdot actmth + \beta_3 \cdot acteng + u$$

In [11]:
# Design matrix: the three regressors plus an intercept column added by statsmodels.
regressors = ['colgpa', 'actmth', 'acteng']
X = sm.add_constant(data[regressors])

# Dependent variable, kept as a one-column DataFrame.
Y = data.loc[:, ['score']]

In [12]:
# Specify the regression of Y on X, estimate it by OLS, and print the
# text summary of the fit.
OLS_model = sm.OLS(endog=Y, exog=X)
OLS_result = OLS_model.fit()

print(OLS_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     177.9
Date:                Mon, 24 Aug 2020   Prob (F-statistic):           1.31e-88
Time:                        16:31:54   Log-Likelihood:                -3055.2
No. Observations:                 814   AIC:                             6118.
Df Residuals:                     810   BIC:                             6137.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         16.1740      2.800      5.776      0.0

### standardize each variable

In [13]:
# z-score each regressor (subtract its mean, divide by its sample sd),
# then add the intercept column afterwards so the constant is not rescaled.
X_unscaled = data[['colgpa', 'actmth', 'acteng']]
X_st = sm.add_constant((X_unscaled - X_unscaled.mean()) / X_unscaled.std())

# Same z-score transformation for the dependent variable.
Y_unscaled = data[['score']]
Y_st = (Y_unscaled - Y_unscaled.mean()) / Y_unscaled.std()

### check the mean 

In [14]:
# Column means of the standardized design matrix.
X_st.mean(axis=0)

const     1.000000e+00
colgpa    8.188372e-15
actmth    1.421195e-16
acteng    3.397501e-16
dtype: float64

In [15]:
# Mean of the standardized dependent variable.
Y_st.mean(axis=0)

score   -6.565864e-16
dtype: float64

### check the sd

In [16]:
# Column sample standard deviations of the standardized design matrix
# (ddof=1 is the pandas default).
X_st.std(ddof=1)

const     0.0
colgpa    1.0
actmth    1.0
acteng    1.0
dtype: float64

In [17]:
# Sample standard deviation of the standardized dependent variable
# (ddof=1 is the pandas default).
Y_st.std(ddof=1)

score    1.0
dtype: float64

### re-estimate the model after standardization

In [18]:
# Re-estimate the same regression on the standardized variables and print
# the text summary of the fit.
OLS_st = sm.OLS(endog=Y_st, exog=X_st)
OLS_st_result = OLS_st.fit()

print(OLS_st_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     177.9
Date:                Mon, 24 Aug 2020   Prob (F-statistic):           1.31e-88
Time:                        16:31:56   Log-Likelihood:                -948.48
No. Observations:                 814   AIC:                             1905.
Df Residuals:                     810   BIC:                             1924.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.106e-16      0.027  -2.24e-14      1.0

# Question 2

## load data into Python

In [22]:
# Read the Question 2 dataset; unlike Question 1, no rows are dropped here.
data2 = pd.read_csv(filepath_or_buffer='hprice2.csv')

## compute the min, max, mean and sd of specific variables

In [23]:
# Columns for which the Question 2 summary statistics are computed.
var_name2 = [
    'price',
    'sqrmtr',
    'lotsize',
    'bdrms',
]

In [24]:
# Column-wise mean of the selected variables.
data2.loc[:, var_name2].mean(axis=0)

price      292649.372881
sqrmtr        233.347458
lotsize      1058.042373
bdrms           3.567797
dtype: float64

In [25]:
# Column-wise sample standard deviation (ddof=1 is the pandas default).
data2.loc[:, var_name2].std(ddof=1)

price      98449.884623
sqrmtr        64.412052
lotsize     1110.507980
bdrms          0.831833
dtype: float64

In [26]:
# Column-wise minimum of the selected variables.
data2.loc[:, var_name2].min(axis=0)

price      111000
sqrmtr        134
lotsize       125
bdrms           2
dtype: int64

In [27]:
# Column-wise maximum of the selected variables.
data2.loc[:, var_name2].max(axis=0)

price      815000
sqrmtr        446
lotsize     11585
bdrms           7
dtype: int64

## regression $$price = \beta_0 + \beta_1 \cdot sqrmtr + \beta_2 \cdot lotsize + \beta_3 \cdot bdrms + u$$

In [28]:
# Design matrix: the three house characteristics plus an intercept column.
regressors2 = ['sqrmtr', 'lotsize', 'bdrms']
X2 = sm.add_constant(data2[regressors2])

# Dependent variable, kept as a one-column DataFrame.
Y2 = data2.loc[:, ['price']]

In [29]:
# Specify the regression of price on the house characteristics.
# The unfitted model gets its own name so that `OLS2` is only ever bound to
# the fitted results (mirrors OLS_model / OLS_result in Question 1).
OLS2_model = sm.OLS(Y2, X2)

# `OLS2` holds the fitted results; later cells call OLS2.predict(...) on it.
OLS2 = OLS2_model.fit()
print(OLS2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.637
Model:                            OLS   Adj. R-squared:                  0.627
Method:                 Least Squares   F-statistic:                     66.59
Date:                Mon, 24 Aug 2020   Prob (F-statistic):           5.96e-25
Time:                        16:32:20   Log-Likelihood:                -1463.9
No. Observations:                 118   AIC:                             2936.
Df Residuals:                     114   BIC:                             2947.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.175e+04   2.64e+04     -0.445      0.6

## standardize all variables and re-estimate the model

In [30]:
# z-score each regressor (subtract its mean, divide by its sample sd),
# then add the intercept column afterwards so the constant is not rescaled.
X2_unscaled = data2[['sqrmtr', 'lotsize', 'bdrms']]
X2_st = sm.add_constant((X2_unscaled - X2_unscaled.mean()) / X2_unscaled.std())

# Same z-score transformation for the dependent variable.
Y2_unscaled = data2[['price']]
Y2_st = (Y2_unscaled - Y2_unscaled.mean()) / Y2_unscaled.std()

In [31]:
# Specify the regression on the standardized variables.
# The unfitted model gets its own name so that `OLS2_st` is only ever bound
# to the fitted results (mirrors OLS_st / OLS_st_result in Question 1).
OLS2_st_model = sm.OLS(Y2_st, X2_st)

# `OLS2_st` holds the fitted results of the standardized regression.
OLS2_st = OLS2_st_model.fit()
print(OLS2_st.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.637
Model:                            OLS   Adj. R-squared:                  0.627
Method:                 Least Squares   F-statistic:                     66.59
Date:                Mon, 24 Aug 2020   Prob (F-statistic):           5.96e-25
Time:                        16:32:21   Log-Likelihood:                -107.20
No. Observations:                 118   AIC:                             222.4
Df Residuals:                     114   BIC:                             233.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.163e-17      0.056   7.41e-16      1.0

## predict the price of a house with $sqrmtr = 280, lotsize = 776, bdrms = 6$

In [32]:
# Regressor values in the column order of X2: const, sqrmtr, lotsize, bdrms.
# NOTE(review): the heading for this cell states lotsize = 776, but the value
# passed here is 766 — confirm which one is intended before relying on the result.
house_features = [1, 280, 766, 6]
OLS2.predict(house_features)

array([365844.99255784])

## if the actual price is 320000, compute the prediction error of the house with $sqrmtr = 280, lotsize = 776, bdrms = 6$ 

In [33]:
# Prediction error = actual price minus the price predicted by the fitted model.
# NOTE(review): the heading for this cell states lotsize = 776, but the value
# passed here is 766 — confirm which one is intended before relying on the result.
actual_price = 320000
predicted_price = OLS2.predict([1, 280, 766, 6])
actual_price - predicted_price

array([-45844.99255784])