# 14. Advanced Panel Data Methods

## 14.1. Fixed Effects Estimation

$y_{it} = \beta_0 + \beta_1x_{it1} + . . .  + \beta_kx_{itk} + a_i + u_{it}$

$\bar{y_i} = \beta_0 + \beta_1\bar{x}_{i1} + . . . + \beta_k\bar{x}_{ik} + a_i + \bar{u_i}$

$\ddot{y} = y_{it} - \bar{y_i} $

$\ddot{y} = \beta_1\ddot{x}_{it1} + . . . +  \beta_k\ddot{x}_{itk} + \ddot{u}_{it}$


### Exapmle 14.2: Has the Return to Education Changed over Time

In [1]:
import wooldridge as woo
import pandas as pd
import linearmodels as plm
import scipy.stats as stats
from supplementaryFunctions import *

In [2]:
def LM_summary(results, round_dig = 4):
    summary = {"$\\beta $":results.params, 
           "$t$": results.tstats,
           "$$P>|t|$$":results.pvalues,
           # calculate standard errors by taking the square root of the variance values 
           # along the diagonal of the covariance matrix 
          "$SE$":results.std_errors}
    summary = pd.DataFrame(summary)
    # add r^2 using index name
    summary.index.name = "$$r^2: "+str(round(results.rsquared,round_dig)) + "$$"
    return summary

#### The authors use *drop_absorbed* quite freely. This is problematic since it makes us lazy in identifying source of multicolinearity.

In [3]:
wagepan = woo.dataWoo("wagepan")
wagepan.set_index(["nr","year"], drop = False, inplace = True)
formula = "lwage ~ married + union + C(year)*educ + EntityEffects"
reg = plm.PanelOLS.from_formula(formula = formula,
                               data = wagepan, drop_absorbed=True)
results = reg.fit()
LM_summary(results)

Variables have been fully absorbed and have removed from the regression:

educ

  results = reg.fit()


Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1708$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C(year)[T.1980],1.362459,83.90311,0.0,0.016238
C(year)[T.1981],1.340043,9.230656,0.0,0.145173
C(year)[T.1982],1.356698,9.348067,0.0,0.145131
C(year)[T.1983],1.372888,9.456084,0.0,0.145186
C(year)[T.1984],1.446833,9.961677,0.0,0.14524
C(year)[T.1985],1.412184,9.731511,0.0,0.145115
C(year)[T.1986],1.428065,9.840444,0.0,0.145122
C(year)[T.1987],1.452904,10.006096,0.0,0.145202
married,0.05482,2.97734,0.002926,0.018413
union,0.082978,4.267104,2e-05,0.019446


In [4]:
# names = ["lwage", "married","union" , "educ"]
# X, y = build_X_y_matrices(wagepan, names, log_vars = None)#, constant = True)

# reg = plm.PanelOLS(y, X, entity_effects = True, time_effects = True, drop_absorbed = True)
# results = reg.fit()
# LM_summary(results)

## 14.2. Random Effects Models

$\dot{y}_{it} = y_{it} - \theta \bar{y}_i$

$\dot{y}_{it} = \beta_0(1-\theta) + \beta_1\dot{x}_{it1} + . . . + \beta_k\dot{x}_{itk} + \dot{\nu}_{it}$

$\nu_{it} = a_i + u_{it}$

$\theta = 1 - \sqrt{\frac{\theta_u^2}{\theta_u^2 + T\theta_a^2}}$

In [5]:
wagepan = woo.dataWoo("wagepan")
wagepan.groupby("nr").var()

Unnamed: 0_level_0,year,agric,black,bus,construc,ent,exper,fin,hisp,poorhlth,...,union,lwage,d81,d82,d83,d84,d85,d86,d87,expersq
nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,6.0,0.0,0.0,0.214286,0.000000,0.000,6.0,0.000000,0.0,0.0,...,0.125000,0.681418,0.125,0.125,0.125,0.125,0.125,0.125,0.125,510.0
17,6.0,0.0,0.0,0.000000,0.267857,0.000,6.0,0.000000,0.0,0.0,...,0.000000,0.009753,0.125,0.125,0.125,0.125,0.125,0.125,0.125,1374.0
18,6.0,0.0,0.0,0.000000,0.000000,0.000,6.0,0.000000,0.0,0.0,...,0.000000,0.186106,0.125,0.125,0.125,0.125,0.125,0.125,0.125,1374.0
45,6.0,0.0,0.0,0.125000,0.125000,0.000,6.0,0.000000,0.0,0.0,...,0.214286,0.049972,0.125,0.125,0.125,0.125,0.125,0.125,0.125,750.0
110,6.0,0.0,0.0,0.125000,0.000000,0.000,6.0,0.214286,0.0,0.0,...,0.125000,0.009836,0.125,0.125,0.125,0.125,0.125,0.125,0.125,1758.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12451,6.0,0.0,0.0,0.000000,0.000000,0.125,6.0,0.000000,0.0,0.0,...,0.000000,0.137637,0.125,0.125,0.125,0.125,0.125,0.125,0.125,750.0
12477,6.0,0.0,0.0,0.000000,0.000000,0.000,6.0,0.000000,0.0,0.0,...,0.000000,0.001519,0.125,0.125,0.125,0.125,0.125,0.125,0.125,1374.0
12500,6.0,0.0,0.0,0.267857,0.000000,0.000,6.0,0.000000,0.0,0.0,...,0.000000,0.198548,0.125,0.125,0.125,0.125,0.125,0.125,0.125,1374.0
12534,6.0,0.0,0.0,0.000000,0.000000,0.000,6.0,0.000000,0.0,0.0,...,0.000000,0.026982,0.125,0.125,0.125,0.125,0.125,0.125,0.125,750.0


In [6]:
isv_nr = (wagepan.groupby("nr").var() == 0)
noVar_nr = isv_nr.all(axis = 0)
# noVar_nr,
isv_nr.columns[noVar_nr]

Index(['black', 'hisp', 'educ'], dtype='object')

In [7]:
isv_t = wagepan.groupby("nr").var() == 0
noVar_t = isv_t.all(axis = 0)
# isv_t, 
isv_t.columns[noVar_t]

Index(['black', 'hisp', 'educ'], dtype='object')

In [8]:
wagepan.set_index(["nr", "year"], drop = False, inplace = True)

In [9]:
formula = "lwage ~ educ + black + hisp + exper + I(exper**2) + married + union + C(year)"

reg_pooled =plm.PooledOLS.from_formula(formula = formula,
                                    data = wagepan)
results_pooled = reg_pooled.fit()
LM_summary(results_pooled)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1893$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C(year)[T.1980],0.092056,1.17613,0.2396076,0.07827
C(year)[T.1981],0.150376,1.793474,0.07296674,0.083846
C(year)[T.1982],0.15483,1.733458,0.08308526,0.089319
C(year)[T.1983],0.154068,1.632282,0.1026926,0.094388
C(year)[T.1984],0.182523,1.843739,0.0652892,0.098996
C(year)[T.1985],0.201302,1.952284,0.05096858,0.103111
C(year)[T.1986],0.234015,2.191982,0.02843369,0.10676
C(year)[T.1987],0.265889,2.416628,0.01570597,0.110025
I(exper**2),-0.002412,-2.941264,0.003286019,0.00082
black,-0.139234,-5.904869,3.798544e-09,0.02358


In [10]:
# just time effects same as pooled with time effects
reg_te = plm.PanelOLS.from_formula(formula = formula,
                                    data = wagepan)
results_te =reg_te.fit()
LM_summary(results_te)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1893$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C(year)[T.1980],0.092056,1.17613,0.2396076,0.07827
C(year)[T.1981],0.150376,1.793474,0.07296674,0.083846
C(year)[T.1982],0.15483,1.733458,0.08308526,0.089319
C(year)[T.1983],0.154068,1.632282,0.1026926,0.094388
C(year)[T.1984],0.182523,1.843739,0.0652892,0.098996
C(year)[T.1985],0.201302,1.952284,0.05096858,0.103111
C(year)[T.1986],0.234015,2.191982,0.02843369,0.10676
C(year)[T.1987],0.265889,2.416628,0.01570597,0.110025
I(exper**2),-0.002412,-2.941264,0.003286019,0.00082
black,-0.139234,-5.904869,3.798544e-09,0.02358


In [11]:
formula = "lwage ~ I(exper**2) + married + union + C(year)"
# Time and Entity Effects
reg_fe = plm.PanelOLS.from_formula(formula = formula + " + EntityEffects",
                                    data = wagepan)
results_fe = reg_fe.fit()
LM_summary(results_fe)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1806$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C(year)[T.1980],1.426019,77.748354,0.0,0.018341
C(year)[T.1981],1.57721,72.965647,0.0,0.021616
C(year)[T.1982],1.678989,63.258282,0.0,0.026542
C(year)[T.1983],1.780462,53.439178,0.0,0.033318
C(year)[T.1984],1.916133,45.98163,0.0,0.041672
C(year)[T.1985],2.043501,39.646046,0.0,0.051544
C(year)[T.1986],2.191515,34.771405,0.0,0.063026
C(year)[T.1987],2.351043,30.866912,0.0,0.076167
I(exper**2),-0.005185,-7.361196,2.220446e-13,0.000704
married,0.04668,2.549386,0.01083019,0.01831


In [18]:
formula = "lwage ~ educ + black + hisp + exper + I(exper**2) + married + union + C(year)"
reg_re = plm.RandomEffects.from_formula(formula = formula,
                                    data = wagepan)

results_re = reg_re.fit()
LM_summary(results_re)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1806$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C(year)[T.1980],0.023414,0.154606,0.8771388,0.151441
C(year)[T.1981],0.063837,0.398786,0.6900706,0.160079
C(year)[T.1982],0.054265,0.321115,0.7481389,0.168989
C(year)[T.1983],0.043602,0.244962,0.8064974,0.177995
C(year)[T.1984],0.066427,0.355113,0.7225219,0.187057
C(year)[T.1985],0.081109,0.413565,0.6792131,0.196122
C(year)[T.1986],0.115241,0.561652,0.5743819,0.205182
C(year)[T.1987],0.15825,0.73862,0.4601778,0.214251
I(exper**2),-0.004729,-6.862314,7.731593e-12,0.000689
black,-0.139379,-2.9054,0.003686359,0.047972


#### We can compare all results in a single table

In [20]:
from linearmodels.panel import compare

compare({"FE": results_fe, "RE": results_re, "Pooled": results_pooled})

0,1,2,3
,FE,RE,Pooled
Dep. Variable,lwage,lwage,lwage
Estimator,PanelOLS,RandomEffects,PooledOLS
No. Observations,4360,4360,4360
Cov. Est.,Unadjusted,Unadjusted,Unadjusted
R-squared,0.1806,0.1806,0.1893
R-Squared (Within),0.1806,0.1799,0.1692
R-Squared (Between),-0.0052,0.1853,0.2066
R-Squared (Overall),0.0807,0.1828,0.1893
F-statistic,83.851,68.409,72.459


### Hausman Test 
#### From Appendix: Script 14.4: Example-HausmTest.py

In [13]:
b_fe = results_fe.params
b_fe_cov = results_fe.cov
b_re = results_re.params
b_re_cov = results_re.cov

common_coef = set(results_fe.params.index).intersection(results_re.params.index)
common_coef

{'C(year)[T.1980]',
 'C(year)[T.1981]',
 'C(year)[T.1982]',
 'C(year)[T.1983]',
 'C(year)[T.1984]',
 'C(year)[T.1985]',
 'C(year)[T.1986]',
 'C(year)[T.1987]',
 'I(exper**2)',
 'married',
 'union'}

In [14]:
b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])
df = len(b_diff)
b_diff

  b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])
  b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])


C(year)[T.1980]    1.402605
C(year)[T.1986]    2.076274
C(year)[T.1981]    1.513373
C(year)[T.1985]    1.962392
C(year)[T.1982]    1.624725
union             -0.025865
C(year)[T.1983]    1.736860
C(year)[T.1987]    2.192793
I(exper**2)       -0.000456
married           -0.017134
C(year)[T.1984]    1.849707
Name: parameter, dtype: float64

In [15]:
b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])
b_cov_diff

  b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])
  b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])


Unnamed: 0,C(year)[T.1980],C(year)[T.1986],C(year)[T.1981],C(year)[T.1985],C(year)[T.1982],union,C(year)[T.1983],C(year)[T.1987],I(exper**2),married,C(year)[T.1984]
C(year)[T.1980],-0.022598,-0.029378,-0.023814,-0.028336,-0.024998,-7.180325e-05,-0.026143,-0.030377,-1.882321e-05,-0.0001981415,-0.027256
C(year)[T.1986],-0.029378,-0.038127,-0.031141,-0.036976,-0.032788,-0.0001188561,-0.0343,-0.03914,-6.415442e-05,-0.000411616,-0.035698
C(year)[T.1981],-0.023814,-0.031141,-0.025158,-0.030043,-0.026455,-8.043469e-05,-0.027697,-0.03218,-2.627847e-05,-0.0002348176,-0.028894
C(year)[T.1985],-0.028336,-0.036976,-0.030043,-0.035807,-0.03165,-0.0001118637,-0.033137,-0.038024,-5.64978e-05,-0.0003767402,-0.034524
C(year)[T.1982],-0.024998,-0.032788,-0.026455,-0.03165,-0.027853,-8.933812e-05,-0.029178,-0.033852,-3.376926e-05,-0.0002700965,-0.030444
union,-7.2e-05,-0.000119,-8e-05,-0.000112,-8.9e-05,5.40463e-05,-9.7e-05,-0.000129,-9.915058e-08,7.736228e-07,-0.000105
C(year)[T.1983],-0.026143,-0.0343,-0.027697,-0.033137,-0.029178,-9.706718e-05,-0.030572,-0.035373,-4.130821e-05,-0.0003068184,-0.031892
C(year)[T.1987],-0.030377,-0.03914,-0.03218,-0.038024,-0.033852,-0.0001291239,-0.035373,-0.040102,-7.184795e-05,-0.0004471107,-0.036766
I(exper**2),-1.9e-05,-6.4e-05,-2.6e-05,-5.6e-05,-3.4e-05,-9.915058e-08,-4.1e-05,-7.2e-05,2.128958e-08,1.85887e-07,-4.9e-05
married,-0.000198,-0.000412,-0.000235,-0.000377,-0.00027,7.736228e-07,-0.000307,-0.000447,1.85887e-07,5.377188e-05,-0.000342


In [16]:
b_var_diff = pd.Series(np.diag(b_cov_diff), index=[b_cov_diff.index])
b_var_diff

C(year)[T.1980]   -2.259812e-02
C(year)[T.1986]   -3.812734e-02
C(year)[T.1981]   -2.515791e-02
C(year)[T.1985]   -3.580718e-02
C(year)[T.1982]   -2.785277e-02
union              5.404630e-05
C(year)[T.1983]   -3.057219e-02
C(year)[T.1987]   -4.010217e-02
I(exper**2)        2.128958e-08
married            5.377188e-05
C(year)[T.1984]   -3.325389e-02
dtype: float64

In [21]:
stat = abs(b_diff.T @ np.linalg.inv(b_cov_diff) @ b_diff)
p_val = 1 - stats.chi2.cdf(stat, df)
stat, p_val

(43.427071175710296, 9.150613851205414e-06)

In [33]:
def hausman_test(results_fe, results_re):
    b_fe = results_fe.params
    b_fe_cov = results_fe.cov
    b_re = results_re.params
    b_re_cov = results_re.cov
    common_coef = set(results_fe.params.index).intersection(results_re.params.index)    
    b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])
    df = len(b_diff)
    b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])
    b_var_diff = pd.Series(np.diag(b_cov_diff), index=[b_cov_diff.index])
    h_results = {"Hausman":{}}
    h_results["Hausman"]["t"] = abs(b_diff.T @ np.linalg.inv(b_cov_diff) @ b_diff)
    h_results["Hausman"]["p"] = 1 - stats.chi2.cdf(stat, df)
    return pd.DataFrame(h_results)
hausman_test(results_fe, results_re).round(3)

  b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])
  b_diff = results_fe.params[common_coef].sub(results_re.params[common_coef])
  b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])
  b_cov_diff = b_fe_cov.loc[common_coef, common_coef].sub(b_re_cov.loc[common_coef, common_coef])


Unnamed: 0,Hausman
p,0.0
t,43.427
