# Problem Set 5 (Jugal Marfatia)

## Question 1.  

## Import data and description

In [1]:
from linearmodels.datasets import wage_panel
import statsmodels.api as sm
import pandas as pd
import numpy as np

print(wage_panel.DESCR)


F. Vella and M. Verbeek (1998), "Whose Wages Do Unions Raise? A Dynamic Model
of Unionism and Wage Rate Determination for Young Men," Journal of Applied
Econometrics 13, 163-183.

nr                       person identifier
year                     1980 to 1987
black                    =1 if black
exper                    labor market experience
hisp                     =1 if Hispanic
hours                    annual hours worked
married                  =1 if married
educ                     years of schooling
union                    =1 if in union
lwage                    log(wage)
expersq                  exper^2
occupation               Occupation code



## Set dataframe index

In [2]:
data = wage_panel.load()
year = pd.Categorical(data.year)
nr = pd.Categorical(data.nr)

data = data.set_index(['nr', 'year'])
data['year'] = year
data['nr'] = nr

data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation,year,nr
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13,1980,0,1,0,2672,0,14,0,1.19754,1,9,1980,13
13,1981,0,2,0,2320,0,14,1,1.85306,4,9,1981,13
13,1982,0,3,0,2940,0,14,0,1.344462,9,9,1982,13
13,1983,0,4,0,2960,0,14,0,1.433213,16,9,1983,13
13,1984,0,5,0,3071,0,14,0,1.568125,25,5,1984,13


## 1. A. Fixed efffects regression with clutered standard errors and unadjusted

In [3]:
from linearmodels import PanelOLS
exog_vars = ['expersq','union','married']
exog = sm.add_constant(data[exog_vars])

mod = PanelOLS(data.lwage, exog, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_entity=True) # Clustered Std. Errors
print(res)

res = mod.fit()
print(res)# Non- Clustered Std. Errors

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.1365
Estimator:                   PanelOLS   R-squared (Between):             -0.0674
No. Observations:                4360   R-squared (Within):               0.1365
Date:                Thu, Oct 04 2018   R-squared (Overall):              0.0270
Time:                        13:01:00   Log-likelihood                   -1439.0
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      200.87
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(3,3812)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             114.78
                            

## 1. B. Create Y hat variable as decribed in the problem set.

In [4]:
data = data.join(data.groupby(['nr'])['lwage', 'expersq', 'union', 'married'].mean(), rsuffix='_mean')

data['lwage_hat'] = data['lwage'] - data['lwage_mean']
data['expersq_hat'] = data['expersq'] - data['expersq_mean']
data['union_hat'] = data['union'] - data['union_mean']
data['married_hat'] = data['married'] - data['married_mean']

data.head()

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation,year,nr,lwage_mean,expersq_mean,union_mean,married_mean,lwage_hat,expersq_hat,union_hat,married_hat
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,1980,0,1,0,2672,0,14,0,1.19754,1,9,1980,13,1.255652,25.5,0.125,0.0,-0.058112,-24.5,-0.125,0.0
13,1981,0,2,0,2320,0,14,1,1.85306,4,9,1981,13,1.255652,25.5,0.125,0.0,0.597408,-21.5,0.875,0.0
13,1982,0,3,0,2940,0,14,0,1.344462,9,9,1982,13,1.255652,25.5,0.125,0.0,0.08881,-16.5,-0.125,0.0
13,1983,0,4,0,2960,0,14,0,1.433213,16,9,1983,13,1.255652,25.5,0.125,0.0,0.177561,-9.5,-0.125,0.0
13,1984,0,5,0,3071,0,14,0,1.568125,25,5,1984,13,1.255652,25.5,0.125,0.0,0.312473,-0.5,-0.125,0.0


## 1. B. Regression of the variables with clustered errors and unadjusted.

In [5]:
exog_vars = ['expersq_hat','union_hat','married_hat']
exog = sm.add_constant(data[exog_vars])

mod = PanelOLS(data.lwage_hat, exog)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:              lwage_hat   R-squared:                        0.1365
Estimator:                   PanelOLS   R-squared (Between):             -0.3905
No. Observations:                4360   R-squared (Within):               0.1365
Date:                Thu, Oct 04 2018   R-squared (Overall):              0.1365
Time:                        13:01:00   Log-likelihood                   -1439.0
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      229.54
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(3,4356)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             114.78
                            

## 1. C. Regression with individual dummy variables for each "i" with clustered errors and unadjusted.


In [6]:
from linearmodels.panel import PooledOLS

exog_vars = ['expersq','union','married', 'nr']
exog = sm.add_constant(data[exog_vars])
mod = PooledOLS(data.lwage, exog)


pooled_res = mod.fit(cov_type='clustered', cluster_entity=True)
print(pooled_res.params[0:4])
print(" ")

print("Unadjusted Standard Errors")
pooled_res = mod.fit()
print(pooled_res.params[0:4])

const      1.150980
expersq    0.003699
union      0.082762
married    0.107343
Name: parameter, dtype: float64
 
Unadjusted Standard Errors
const      1.150980
expersq    0.003699
union      0.082762
married    0.107343
Name: parameter, dtype: float64


###  *In the above I have included the dummy for each individual but have not included in the results output as they are too many. 

## We get the same parameter estimates for 
### *** expersq, union, married***
## from all 3 techniques because they are theoretically equivalent and the clustered standard errors have little effect on the confidence interval. 