In [1]:
# Importing all required modules
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
import patsy
import seaborn as sns
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
import urllib2 as url

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the Stata data set into a DataFrame. The path is relative, so the .dta
# file has to sit in the notebook's working directory.
sales = pd.read_stata("SI Sales Old.dta")

In [3]:
# Question 1
# Reproduce the regression results seen in class.
# Simple OLS: sale price explained by unit size alone.
mod = smf.ols(
    formula='price ~ unit_size',
    data=sales,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.302
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                 1.372e+04
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:21:23   Log-Likelihood:            -4.3166e+05
No. Observations:               31680   AIC:                         8.633e+05
Df Residuals:                   31678   BIC:                         8.633e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept    1.35e+05   2542.335     53.118      0.0

In [4]:
# OLS of sale price on unit size and land size.
mod = smf.ols(
    formula='price ~ unit_size + land_size',
    data=sales,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.387
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     9985.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:21:25   Log-Likelihood:            -4.2961e+05
No. Observations:               31680   AIC:                         8.592e+05
Df Residuals:                   31677   BIC:                         8.593e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept   1.179e+05   2397.598     49.177      0.0

In [5]:
# OLS of sale price on unit size, land size and the age of the unit.
mod = smf.ols(
    formula='price ~ unit_size + land_size + age',
    data=sales,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.393
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     6840.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:21:27   Log-Likelihood:            -4.2945e+05
No. Observations:               31680   AIC:                         8.589e+05
Df Residuals:                   31676   BIC:                         8.589e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept   1.496e+05   2940.848     50.854      0.0

In [6]:
# OLS of sale price on unit size, land size, age of the unit and `todt`
# (described in the original note as "values in tod hill").
mod = smf.ols(
    formula='price ~ unit_size + land_size + age + todt',
    data=sales,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.406
Model:                            OLS   Adj. R-squared:                  0.406
Method:                 Least Squares   F-statistic:                     5421.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:21:28   Log-Likelihood:            -4.2910e+05
No. Observations:               31680   AIC:                         8.582e+05
Df Residuals:                   31675   BIC:                         8.582e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept   1.674e+05   2985.210     56.076      0.0

In [8]:
#Question 2
# Build a linear time-trend regressor from the sale year.
# x is the year just before the earliest one in the data, so the earliest
# sale year maps to trend value 1, the next one to 2, and so on.
# Series.min() is used instead of the builtin min() because it skips missing
# values rather than returning an order-dependent result.
x = sales['sales_year'].min() - 1

# Compute the trend directly on the column (no Python loop) and attach it with
# assign(): the new column is aligned with its own row whatever index `sales`
# carries, whereas merging a freshly built 0..n-1 frame on the index is only
# correct when `sales` still has its default index.
sales1 = sales.assign(sales_year_trend=sales['sales_year'] - x)

In [9]:
# OLS of sale price on unit size, land size, age of the unit, `todt` and the
# linear sales-year trend.
mod = smf.ols(
    formula='price ~ unit_size + land_size + age + todt + sales_year_trend',
    data=sales1,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.412
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     4431.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:22:08   Log-Likelihood:            -4.2896e+05
No. Observations:               31680   AIC:                         8.579e+05
Df Residuals:                   31674   BIC:                         8.580e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
Intercept         1.435e+05   3295.435  

- For this regression, we can interpret the estimated coefficient of the sales-year trend as follows: each additional year in the sale year of the unit (with 2003 as year 1) is associated with an increase in the sales price of approximately USD 6.325, holding the other regressors constant.
- The 95% confidence interval for this coefficient runs from 5.586 to 7.064. Because the interval does not include zero, the coefficient is statistically different from zero at the 5% significance level.
- Taking the R-squared as a reference, which rose from 0.406 to 0.412 (the adjusted R-squared shows the same increase), we can affirm that including the linear trend indicator improved the fit of the model in explaining the sales price.

In [10]:
# Question 3 

Since the scale (and proportion, for that matter) of the two variables (unit size and land size) is changed at the same time, I would initially expect no change in the coefficient of the age variable, but only in the coefficients of those two variables (unit size and land size), with all the others remaining unchanged.

In [11]:
# Add square-foot versions of unit size and land size as new columns
# (each size is multiplied by the conversion factor 10.76391).
sales1['unit_size_sqrft'] = sales1['unit_size'] * 10.76391
sales1['land_size_sqrft'] = sales1['land_size'] * 10.76391

In [12]:
# Same specification as the trend regression, but with unit size and land size
# expressed in square feet.
mod = smf.ols(
    formula='price ~ unit_size_sqrft + land_size_sqrft + age + todt + sales_year_trend',
    data=sales1,
).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.412
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     4431.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:23:14   Log-Likelihood:            -4.2896e+05
No. Observations:               31680   AIC:                         8.579e+05
Df Residuals:                   31674   BIC:                         8.580e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
Intercept         1.435e+05   3295.435  

As expected, the only coefficients that changed in the regression were those of the land size and unit size variables, because of the change in the measurement unit (and hence the scale) used for each of them.

In [13]:
#Question Number 4

In [14]:
# Generate the normal error term and two independent regressors (10000 draws each).
# Seed the global generator first so the draws, and every number computed from
# them, are reproduced exactly on Restart & Run All.
np.random.seed(0)
e1 = np.random.normal(0, 1, 10000)
x1 = np.random.normal(0, 1, 10000)
x2 = np.random.normal(0, 1, 10000)

In [15]:
# Data-generating process for y1: intercept 1, coefficient 1 on both x1 and
# x2, plus the error term e1
y1 = 1 + x1 + x2 + e1

In [16]:
# Correlation matrix of x1 and x2; the off-diagonal entries are the sample
# correlation between the two regressors
correlx1x2 = np.corrcoef(x1, x2)
correlx1x2  # bare last expression so the notebook displays the matrix

array([[ 1.        , -0.00369723],
       [-0.00369723,  1.        ]])

The sample correlation between the two variables is close to 0 (about -0.004).

In [17]:
# Wrap y1, x1 and x2 in DataFrames and combine them column-wise.
# Each frame gets an explicit column name: without one, all three default to a
# column called 0, the combined frame ends up with no 'y1'/'x1'/'x2' columns,
# and a formula such as 'y1 ~ x1' can then only be resolved from the notebook
# namespace instead of from `Data`.
x1pd = pd.DataFrame({'x1': x1})
x2pd = pd.DataFrame({'x2': x2})
y1pd = pd.DataFrame({'y1': y1})

# The three frames share the same default 0..n-1 index, so a column-wise
# concat lines the rows up one-to-one (column order: y1, x1, x2).
Data = pd.concat([y1pd, x1pd, x2pd], axis=1)


In [18]:
# OLS of y1 on x1 alone, i.e. deliberately leaving x2 out of the model.
# NOTE(review): patsy looks formula names up in `Data` first and then in the
# notebook namespace -- confirm `Data` really has columns named y1 and x1.
mod1 = smf.ols(
    formula='y1 ~ x1',
    data=Data,
).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.337
Method:                 Least Squares   F-statistic:                     5076.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:24:23   Log-Likelihood:                -17566.
No. Observations:               10000   AIC:                         3.514e+04
Df Residuals:                    9998   BIC:                         3.515e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.9947      0.014     70.952      0.0

When we run this regression without taking the x2 variable into account, the resulting coefficient for x1 is close to 1, which could lead to the erroneous interpretation that there is an almost perfect positive correlation between y1 and x1.

In [19]:
# Second experiment: normal error term plus three independent components
# (10000 draws each) from which two correlated regressors are built.
# Seed the global generator first so the draws are reproduced exactly on
# Restart & Run All.
np.random.seed(1)
e1 = np.random.normal(0, 1, 10000)
z1 = np.random.normal(0, 1, 10000)
w1 = np.random.normal(0, 1, 10000)
n1 = np.random.normal(0, 1, 10000)

# z1 enters x1 with a plus sign and x2 with a minus sign, which makes the two
# regressors negatively correlated (theoretical correlation -0.5).
x1 = z1 + n1
x2 = -z1 + w1


In [20]:
# Data-generating process for y1: intercept 1, coefficient 1 on both x1 and
# x2 (now the correlated regressors), plus the error term e1
y1 = 1 + x1 + x2 + e1

In [21]:
# Correlation matrix of x1 and x2; the off-diagonal entries are the sample
# correlation between the two regressors
correlx1x2 = np.corrcoef(x1, x2)
correlx1x2  # bare last expression so the notebook displays the matrix

array([[ 1.        , -0.50933165],
       [-0.50933165,  1.        ]])

In this case, because both x1 and x2 depend on the variable z1, there is a negative correlation of about -0.5 between the two variables.

In [22]:
# Wrap y1, x1 and x2 in DataFrames and combine them column-wise.
# Each frame gets an explicit column name: without one, all three default to a
# column called 0, the combined frame ends up with no 'y1'/'x1'/'x2' columns,
# and a formula such as 'y1 ~ x1' can then only be resolved from the notebook
# namespace instead of from `Data`.
x1pd = pd.DataFrame({'x1': x1})
x2pd = pd.DataFrame({'x2': x2})
y1pd = pd.DataFrame({'y1': y1})

# The three frames share the same default 0..n-1 index, so a column-wise
# concat lines the rows up one-to-one (column order: y1, x1, x2).
Data = pd.concat([y1pd, x1pd, x2pd], axis=1)

In [23]:
# OLS of y1 on x1 alone, i.e. deliberately leaving x2 out of the model.
# NOTE(review): patsy looks formula names up in `Data` first and then in the
# notebook namespace -- confirm `Data` really has columns named y1 and x1.
mod1 = smf.ols(
    formula='y1 ~ x1',
    data=Data,
).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.166
Model:                            OLS   Adj. R-squared:                  0.166
Method:                 Least Squares   F-statistic:                     1988.
Date:                Wed, 30 Sep 2015   Prob (F-statistic):               0.00
Time:                        14:25:36   Log-Likelihood:                -18721.
No. Observations:               10000   AIC:                         3.745e+04
Df Residuals:                    9998   BIC:                         3.746e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0161      0.016     64.576      0.0

In this case, when we mistakenly run a regression of y1 on x1 alone, we see that the coefficient is still positive, as in the first case, but it is now closer to 0.5, which reduces the apparent correlation between the two variables (y1 on x1).

Given the two experiments, we could probably deduce that, when simulating a multi-variable regression model and erroneously leaving out one or more explanatory variables, the variables included in the regression seem to absorb some portion of the explanatory power over the dependent variable, and that this portion is directly linked to the level of correlation between the included explanatory variable and the one left out of the regression, although we would require some additional proof to affirm this theory as a certain truth.