In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

In [2]:
import numpy as np
from scipy import stats
import statsmodels
import statsmodels.api as sm
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd

# Chapter 4

## 3

### i
You can interpret this coefficient such that a 1% increase in sales is equivalent to a .321 increase in R&D as a percentage of sales.

### ii

In [122]:
def t_stat(coefficient, se, null):
    return (coefficient - null) / se

In [123]:
t_stat(.321, .216, 0)

1.4861111111111112

Since the t-value is greater than 2.045 (the critical t value for a model with 60 dof), we cannot reject the null that R&D intensity does not change with sales.
### iii

It might be economically large even though it has a large standard error n may not be statistically significant.

### iv

In [124]:
t_stat(.050, .046, 0)

1.0869565217391306

Since critical t value for a two-sided 5% test is 1.96, we cannot reject the null that R&D intensity does not change with profit margin.


## 4

### i
$$H_0: \beta_{pctstu} = 0$$
$$H_a: \beta_{pctstu} \neq 0$$

### ii
I expect both coefficients to be positive.

### iii
They are not adjusting correctly for the fact that this is a log-log model.

### iv


In [126]:
t_stat(.0056, .0017, 0)

3.294117647058824

Since the t-value is greater than 2.66 (the critical t value for a model with 60 dof), we can say that the factor is significant.

## 10

### i

In [127]:
t_stat(.321, .201, 0), t_stat(.043, .078, 0), t_stat(-.0051, .0047, 0), t_stat(.0035, .0022, 0)

(1.5970149253731343,
 0.5512820512820512,
 -1.0851063829787235,
 1.5909090909090908)

None of the above factors have t-values greater than 1.960, so we can conclude that none are significant.

### ii

In [128]:
t_stat(.327, .203, 0), t_stat(.069, .080, 0), t_stat(-4.74, 3.39, 0), t_stat(7.24, 6.31, 0)

(1.6108374384236452, 0.8625, -1.3982300884955752, 1.1473851030110935)

My conclusions do not change.

### iii

No, that would not be possible because of firms with negative earnings, furthermore it would not be prudent to exclude them from the model either.

### iv
The evidence among these factors is weak.

## C1

### i

There is a $\beta_1$ change in $voteA$ for each percent change in $expendA$.

### ii

$H_0: \beta_1 = \beta_2$

### iii

In [85]:
vote_data = pd.read_stata('VOTE1.DTA')

In [86]:
vote_data.head()

Unnamed: 0,state,district,democA,voteA,expendA,expendB,prtystrA,lexpendA,lexpendB,shareA
0,AL,7,1,68,328.29599,8.737,41,5.793916,2.167567,97.407669
1,AK,1,0,62,626.377014,402.47699,60,6.439952,5.997638,60.881039
2,AZ,2,1,73,99.607002,3.065,55,4.601233,1.120048,97.014763
3,AZ,3,0,69,319.690002,26.281,64,5.767352,3.268846,92.403702
4,AR,3,0,75,159.220993,60.054001,66,5.070293,4.095244,72.612473


In [87]:
vote_a = vote_data['voteA']
factors = vote_data[['lexpendA', 'lexpendB', 'prtystrA']]
factors = sm.add_constant(factors)
model = sm.OLS(vote_a, factors)
results = model.fit()

In [88]:
results.summary()

0,1,2,3
Dep. Variable:,voteA,R-squared:,0.793
Model:,OLS,Adj. R-squared:,0.789
Method:,Least Squares,F-statistic:,215.2
Date:,"Mon, 14 Nov 2016",Prob (F-statistic):,1.76e-57
Time:,04:48:27,Log-Likelihood:,-596.86
No. Observations:,173,AIC:,1202.0
Df Residuals:,169,BIC:,1214.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,45.0789,3.926,11.481,0.000,37.328 52.830
lexpendA,6.0833,0.382,15.919,0.000,5.329 6.838
lexpendB,-6.6154,0.379,-17.463,0.000,-7.363 -5.868
prtystrA,0.1520,0.062,2.450,0.015,0.030 0.274

0,1,2,3
Omnibus:,8.9,Durbin-Watson:,1.604
Prob(Omnibus):,0.012,Jarque-Bera (JB):,8.832
Skew:,0.493,Prob(JB):,0.0121
Kurtosis:,3.505,Cond. No.,344.0


Both $expendA$ and $expendB$ have high t-stats and low p-values indicating they are significant factors. Furthermore their signs are opposite giving credence to the $H_0$ though not confirming because the value of the coefficient for $expendB$ has a slightly greater magnitude than $expendA$.

### iv

My model would be $\beta_1 = \beta_2$

In [89]:
results.t_test('lexpendA = lexpendB')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
c0            12.6987      0.543     23.384      0.000        11.627    13.771

The t value is highly significant and the p value is low, we can therefore reject the null hypothesis.

## C5

### i

In [90]:
mlb_data = pd.read_stata('MLB1.DTA')

In [91]:
mlb_data.head()

Unnamed: 0,salary,teamsal,nl,years,games,atbats,runs,hits,doubles,triples,...,runsyr,percwhte,percblck,perchisp,blckpb,hispph,whtepw,blckph,hisppb,lsalary
0,6329213.0,38407380.0,1,12,1705,6705,1076,1939,320,67,...,89.666664,70.277969,18.844229,10.877804,0.0,0.0,70.277969,0.0,0.0,15.660686
1,3375000.0,38407380.0,1,8,918,3333,407,863,156,38,...,50.875,70.277969,18.844229,10.877804,18.844229,0.0,0.0,10.877804,0.0,15.031906
2,3100000.0,38407380.0,1,5,751,2807,370,840,148,18,...,74.0,70.277969,18.844229,10.877804,0.0,0.0,70.277969,0.0,0.0,14.946913
3,2900000.0,38407380.0,1,8,1056,3337,405,816,143,18,...,50.625,70.277969,18.844229,10.877804,0.0,0.0,70.277969,0.0,0.0,14.880221
4,1650000.0,38407380.0,1,12,1196,3603,437,928,19,16,...,36.416668,70.277969,18.844229,10.877804,18.844229,0.0,0.0,10.877804,0.0,14.316286


In [94]:
salary = mlb_data['lsalary']
factors = mlb_data[['years', 'gamesyr', 'bavg', 'hrunsyr', 'rbisyr']]

factors = sm.add_constant(factors)
model = sm.OLS(salary, factors)
results = model.fit()

In [97]:
results.summary()

0,1,2,3
Dep. Variable:,lsalary,R-squared:,0.628
Model:,OLS,Adj. R-squared:,0.622
Method:,Least Squares,F-statistic:,117.1
Date:,"Mon, 14 Nov 2016",Prob (F-statistic):,2.94e-72
Time:,04:53:09,Log-Likelihood:,-385.11
No. Observations:,353,AIC:,782.2
Df Residuals:,347,BIC:,805.4
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,11.1924,0.289,38.752,0.000,10.624 11.760
years,0.0689,0.012,5.684,0.000,0.045 0.093
gamesyr,0.0126,0.003,4.742,0.000,0.007 0.018
bavg,0.0010,0.001,0.887,0.376,-0.001 0.003
hrunsyr,0.0144,0.016,0.899,0.369,-0.017 0.046
rbisyr,0.0108,0.007,1.500,0.134,-0.003 0.025

0,1,2,3
Omnibus:,6.816,Durbin-Watson:,1.265
Prob(Omnibus):,0.033,Jarque-Bera (JB):,10.198
Skew:,-0.068,Prob(JB):,0.0061
Kurtosis:,3.821,Cond. No.,2090.0


In [98]:
factors = mlb_data[['years', 'gamesyr', 'bavg', 'hrunsyr']]
factors = sm.add_constant(factors)
model = sm.OLS(salary, factors)
results = model.fit()

In [99]:
results.summary()

0,1,2,3
Dep. Variable:,lsalary,R-squared:,0.625
Model:,OLS,Adj. R-squared:,0.621
Method:,Least Squares,F-statistic:,145.2
Date:,"Mon, 14 Nov 2016",Prob (F-statistic):,6.980000000000001e-73
Time:,04:54:25,Log-Likelihood:,-386.25
No. Observations:,353,AIC:,782.5
Df Residuals:,348,BIC:,801.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,11.0209,0.266,41.476,0.000,10.498 11.544
years,0.0677,0.012,5.592,0.000,0.044 0.092
gamesyr,0.0158,0.002,10.079,0.000,0.013 0.019
bavg,0.0014,0.001,1.331,0.184,-0.001 0.004
hrunsyr,0.0359,0.007,4.964,0.000,0.022 0.050

0,1,2,3
Omnibus:,7.369,Durbin-Watson:,1.244
Prob(Omnibus):,0.025,Jarque-Bera (JB):,11.24
Skew:,-0.085,Prob(JB):,0.00362
Kurtosis:,3.858,Cond. No.,1900.0


The t-value and coefficient of $hrunsyr$ increase.

### ii

In [104]:
factors = mlb_data[['years', 'gamesyr', 'hrunsyr', 'rbisyr', 'runsyr', 'bavg', 'fldperc', 'sbasesyr']]
factors = sm.add_constant(factors)
model = sm.OLS(salary, factors)
results = model.fit()

In [111]:
results.summary()

0,1,2,3
Dep. Variable:,lsalary,R-squared:,0.639
Model:,OLS,Adj. R-squared:,0.631
Method:,Least Squares,F-statistic:,76.18
Date:,"Mon, 14 Nov 2016",Prob (F-statistic):,1.63e-71
Time:,05:08:04,Log-Likelihood:,-379.61
No. Observations:,353,AIC:,777.2
Df Residuals:,344,BIC:,812.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,10.4929,2.015,5.207,0.000,6.529 14.457
years,0.0702,0.012,5.850,0.000,0.047 0.094
gamesyr,0.0072,0.003,2.294,0.022,0.001 0.013
hrunsyr,0.0175,0.016,1.095,0.274,-0.014 0.049
rbisyr,0.0033,0.008,0.429,0.668,-0.012 0.019
runsyr,0.0165,0.005,3.017,0.003,0.006 0.027
bavg,0.0004,0.001,0.380,0.704,-0.002 0.003
fldperc,0.0010,0.002,0.492,0.623,-0.003 0.005
sbasesyr,-0.0056,0.006,-1.001,0.317,-0.016 0.005

0,1,2,3
Omnibus:,1.216,Durbin-Watson:,1.292
Prob(Omnibus):,0.545,Jarque-Bera (JB):,0.966
Skew:,0.084,Prob(JB):,0.617
Kurtosis:,3.194,Cond. No.,53500.0


### ii
If we take $\alpha = 0.05$ the significant factors are $const, years, gamesyr, runsyr$.

### iii

In [115]:
A = np.identity(len(results.params))[-3:,:]

In [116]:
results.f_test(A)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[ 0.43901553]]), p=0.7252331043926552, df_denom=344, df_num=3>

The factors $bavg, fldperc, sbasesyr$ are not jointly significant the p value above is greater than 0.05.

## C6

### i
$H_0: \beta_{exper} = \beta_{tenure}$

In [117]:
wage_data = pd.read_stata('WAGE2.DTA')

In [118]:
wage_data.head()

Unnamed: 0,wage,hours,IQ,KWW,educ,exper,tenure,age,married,black,south,urban,sibs,brthord,meduc,feduc,lwage
0,769,40,93,35,12,11,2,31,1,0,0,1,1,2.0,8.0,8.0,6.645091
1,808,50,119,41,18,11,16,37,1,0,0,1,1,,14.0,14.0,6.694562
2,825,40,108,46,14,11,9,33,1,0,0,1,1,2.0,14.0,14.0,6.715384
3,650,40,96,32,12,13,7,32,1,0,0,1,4,3.0,12.0,12.0,6.476973
4,562,40,74,27,11,14,5,34,1,0,0,1,10,6.0,6.0,11.0,6.331502


In [119]:
wage = wage_data['lwage']
factors = wage_data[['educ', 'exper', 'tenure']]
factors = sm.add_constant(factors)
model = sm.OLS(wage, factors)
results = model.fit()

In [120]:
results.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.155
Model:,OLS,Adj. R-squared:,0.152
Method:,Least Squares,F-statistic:,56.97
Date:,"Mon, 14 Nov 2016",Prob (F-statistic):,8.119999999999999e-34
Time:,05:14:41,Log-Likelihood:,-438.84
No. Observations:,935,AIC:,885.7
Df Residuals:,931,BIC:,905.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.4967,0.111,49.731,0.000,5.280 5.714
educ,0.0749,0.007,11.495,0.000,0.062 0.088
exper,0.0153,0.003,4.549,0.000,0.009 0.022
tenure,0.0134,0.003,5.170,0.000,0.008 0.018

0,1,2,3
Omnibus:,20.917,Durbin-Watson:,1.769
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30.558
Skew:,-0.214,Prob(JB):,2.31e-07
Kurtosis:,3.775,Cond. No.,170.0


### ii

In [121]:
results.t_test('exper = tenure')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
c0             0.0020      0.005      0.412      0.681        -0.007     0.011

While the coefficients for both factors are close to each other, a closer look at the t test of the two shows that we cannot reject the null hypothesis that the value of the coefficients is the same.