# CPS1988 dataset

Recommendation: Use `statsmodels` and `pandas` for this exercise.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

## Exercise a)

Briefly describe the data set:

- Name the dependent variable and the independent variables.
- Which scales of measurement do the variables belong to (e.g., nominal, ordinal, interval or ratio)?
- Does the data set consist of cross-sectional, time-series or panel data?

In [None]:
# Load the CPS1988 CSV from the working directory and preview it
csv_file = "CPS1988.csv"
data = pd.read_csv(csv_file)
preview = data.head()
print(preview)

## Exercise b)

Plot the dependent variable against each independent variable and transform the variables if necessary.

In [None]:
# Every column other than the response ("wage") is treated as a regressor
independent_variables = data.columns.drop("wage")

# One scatter panel per regressor on a 3x2 grid. zip() stops at the shorter
# of the two sequences, so surplus columns or surplus panels are left out.
fig, axs = plt.subplots(nrows=3, ncols=2)
for column, panel in zip(independent_variables, axs.flat):
    panel.scatter(data[column], data["wage"])
    panel.set_xlabel(column)

# A single shared y-axis label for the whole figure
fig.supylabel("Wage")
fig.tight_layout()


Which transformations would you carry out and why? 

Estimate the following model:

$$
  \ln( \widehat{\text{wage}}_i ) = \hat\beta_0 + \hat\beta_1 \cdot \text{education}_i + \hat\beta_2 \cdot \text{ethnicity}_i + \hat\beta_3 \cdot \text{experience}_i\\
  \phantom{=} + \hat\beta_4 \cdot \text{experience}_i^2.
$$

In [None]:
# Build the design matrix for the log-wage regression.
# Take an explicit copy of the selected columns: assigning into a bare
# selection of `data` triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether `data` itself is affected.
X = data[["education", "ethnicity", "experience"]].copy()
# Encode ethnicity as a dummy: 1.0 where the value is "afam", 0.0 otherwise
X["ethnicity"] = (X["ethnicity"] == "afam").astype(float)
# Quadratic experience term
X["experience^2"] = np.square(data["experience"])
# Add the intercept column
X = sm.add_constant(X)

print(X.head())

In [None]:
# Regress log(wage) on the design matrix X by ordinary least squares
log_wage = np.log(data["wage"])
model = sm.OLS(log_wage, X)
results = model.fit()
summary_table = results.summary()
print(summary_table)

## Exercise c)

Interpret the model.
- Which variables are statistically significant?
- Is the entire model statistically significant?
- What is the explanatory power of the model and why?
- Interpret each regression coefficient.

## Exercise d)

Now consider the following alternative model:

$$
    \ln( \widehat{\text{wage}}_i ) = \hat\beta_0 + \hat\beta_1 \cdot \text{education}_i + \hat\beta_2 \cdot \text{ethnicity}_i + \hat\beta_3 \cdot \text{education}_i \cdot \text{ethnicity}_i\\
    \phantom{=} + \hat\beta_4 \cdot \text{experience}_i + \hat\beta_5 \cdot \text{experience}_i^2.
$$

What is the difference between the two models?

## Exercise e)

Repeat c) with the alternative model.

In [None]:
# Build the design matrix for the alternative model with an
# education x ethnicity interaction.
# Take an explicit copy of the selected columns: assigning into a bare
# selection of `data` triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether `data` itself is affected.
X = data[["education", "ethnicity", "experience"]].copy()
# Encode ethnicity as a dummy: 1.0 where the value is "afam", 0.0 otherwise
X["ethnicity"] = (X["ethnicity"] == "afam").astype(float)
# Interaction term; computed after the dummy encoding so it is numeric
X["ed*eth"] = X["education"] * X["ethnicity"]
# Quadratic experience term
X["experience^2"] = np.square(data["experience"])
# Add the intercept column
X = sm.add_constant(X)

print(X.head())

In [None]:
# Regress log(wage) on the design matrix X by ordinary least squares
log_wage = np.log(data["wage"])
model = sm.OLS(log_wage, X)
results = model.fit()
summary_table = results.summary()
print(summary_table)