In [4]:
library(data.table)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(plm)
library(ivreg)
library(ggeffects)
library(haven)

In [5]:
# Read the Stata file and hand it straight to data.table, so the `:=`
# by-reference column updates used further down work on `data`.
data <- as.data.table(read_dta("soep_lebensz_en.dta"))

In [6]:
# Derived regression variables, added to `data` by reference in one pass.
#   has_kids    : 1 when no_kids > 0, 0 when it is not, NA when no_kids is NA
#   health_cat  : health_org as a factor (one dummy per level in regressions)
#   year_factor : year as a factor (one dummy per survey year)
#   sex_factor  : sex as a factor; the labels are attached to the sorted codes,
#                 so this assumes the lower code means male -- TODO confirm
#                 against the codebook
data[, `:=`(
  has_kids = as.numeric(no_kids > 0),
  health_cat = factor(health_org),
  year_factor = factor(year),
  sex_factor = factor(sex, labels = c("male", "female"))
)]

# Frequency check of the new indicator, with missing values shown explicitly
table(data[["has_kids"]], useNA = "ifany")


   0    1 <NA> 
7428 4588  906 

In [10]:
# Load additional packages for regression analysis
library(lmtest)
library(sandwich)

# Estimation sample: drop every row where the outcome, the regressor of
# interest, education or self-rated health is missing.
data_clean <- data[
  !(is.na(has_kids) | is.na(satisf_std) | is.na(education) |
    is.na(health_org))
]

# ============================================================================
# POOLED OLS REGRESSION
# ============================================================================

# Standardized life satisfaction on the parenthood indicator plus controls
# (sex, education, health dummies, year dummies), pooling all person-years.
ols_model <- lm(
  formula = satisf_std ~ has_kids + sex_factor + education + health_cat +
    year_factor,
  data = data_clean
)

# Repeated observations of the same person are not independent, so inference
# uses a covariance matrix clustered on the person identifier `id`.
ols_vcov_by_id <- vcovCL(ols_model, cluster = data_clean$id)
ols_coef <- coeftest(ols_model, vcov. = ols_vcov_by_id)

cat("POOLED OLS REGRESSION\n")
ols_coef

POOLED OLS REGRESSION



t test of coefficients:

                   Estimate Std. Error  t value  Pr(>|t|)    
(Intercept)      -1.4696226  0.1005546 -14.6152 < 2.2e-16 ***
has_kids         -0.1468156  0.0273686  -5.3644 8.293e-08 ***
sex_factorfemale  0.0614782  0.0274062   2.2432 0.0249029 *  
education         0.0238177  0.0052391   4.5461 5.524e-06 ***
health_cat2       0.7241917  0.0838475   8.6370 < 2.2e-16 ***
health_cat3       1.1849977  0.0831723  14.2475 < 2.2e-16 ***
health_cat4       1.5169296  0.0834787  18.1715 < 2.2e-16 ***
health_cat5       1.8118611  0.0885232  20.4676 < 2.2e-16 ***
year_factor2001  -0.0271567  0.0207891  -1.3063 0.1914810    
year_factor2002  -0.0827616  0.0221974  -3.7284 0.0001937 ***
year_factor2003  -0.1387162  0.0238563  -5.8147 6.251e-09 ***
year_factor2004  -0.2424405  0.0249236  -9.7273 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [12]:
# ============================================================================
# FIXED EFFECTS REGRESSION
# ============================================================================

# Declare the panel structure: individuals (`id`) observed across `year`.
pdata <- pdata.frame(data_clean, index = c("id", "year"))

# Within (individual fixed effects) estimator.
# sex_factor is deliberately left out of the formula: it does not vary within
# a person, so the within transformation removes it and plm drops it without
# any message. Omitting it leaves the remaining estimates unchanged and makes
# the specification say what is actually being estimated.
fe_model <- plm(
  satisf_std ~ has_kids + education + health_cat + year_factor,
  data = pdata,
  model = "within",
  effect = "individual"
)

cat("\nFIXED EFFECTS REGRESSION (Within Transformation)\n")
summary(fe_model)

# summary() above reports plm's default (non-robust) standard errors. The
# pooled OLS table is clustered by individual, so report the FE coefficients
# with individual-clustered (Arellano) standard errors as well; otherwise the
# significance levels of the two tables are not comparable.
cat("\nFE coefficients, standard errors clustered by individual\n")
lmtest::coeftest(
  fe_model,
  vcov. = plm::vcovHC(fe_model, method = "arellano", cluster = "group")
)


FIXED EFFECTS REGRESSION (Within Transformation)


Oneway (individual) effect Within Model

Call:
plm(formula = satisf_std ~ has_kids + sex_factor + education + 
    health_cat + year_factor, data = pdata, effect = "individual", 
    model = "within")

Unbalanced Panel: n = 3289, T = 1-5, N = 10659

Residuals:
    Min.  1st Qu.   Median  3rd Qu.     Max. 
-3.96902 -0.26311  0.00000  0.30716  3.47311 

Coefficients:
                 Estimate Std. Error  t-value  Pr(>|t|)    
has_kids         0.042085   0.047069   0.8941   0.37129    
education       -0.017705   0.032337  -0.5475   0.58404    
health_cat2      0.388783   0.057805   6.7258 1.877e-11 ***
health_cat3      0.720040   0.059953  12.0101 < 2.2e-16 ***
health_cat4      0.909834   0.062198  14.6280 < 2.2e-16 ***
health_cat5      1.058753   0.068856  15.3763 < 2.2e-16 ***
year_factor2001 -0.051938   0.020384  -2.5480   0.01085 *  
year_factor2002 -0.126262   0.021739  -5.8081 6.580e-09 ***
year_factor2003 -0.184022   0.022783  -8.0773 7.682e-16 ***
year_factor2004 -0.295234   0.02

In [None]:
# ============================================================================
# INTERPRETATION: COMPARISON OF OLS vs FE
# ============================================================================
# Prints the has_kids coefficient from the pooled OLS and the fixed-effects
# model, their difference, and a short narrative. Every number and every
# direction word in the narrative is derived from the estimates, so the text
# stays correct when the models are re-estimated on other data or with another
# specification.

# Look the coefficient up by name: a positional row index silently points at
# a different regressor as soon as the model formula changes.
ols_has_kids <- ols_coef["has_kids", "Estimate"]
fe_has_kids <- coef(fe_model)["has_kids"]
difference <- ols_has_kids - fe_has_kids

# p-value of the FE estimate (plm's default standard errors), used below to
# avoid presenting an imprecise estimate as an established effect.
fe_p_value <- summary(fe_model)$coefficients["has_kids", "Pr(>|t|)"]

# Small formatting helpers for the narrative.
signed <- function(x) sprintf("%+.3f", x)
sign_label <- function(x) if (x < 0) "NEGATIVE" else "POSITIVE"
level_label <- function(x) if (x < 0) "lower" else "higher"
rule <- strrep("=", 70)

cat("\n\nCOMPARISON OF COEFFICIENTS\n")
cat(rule, "\n\n")
cat("Pooled OLS coefficient on has_kids:    ", round(ols_has_kids, 6), "\n")
cat("Fixed Effects coefficient on has_kids: ", round(fe_has_kids, 6), "\n")
cat("Difference (OLS - FE):                  ", round(difference, 6), "\n\n")

cat("INTERPRETATION:\n")
cat(rule, "\n")
cat("The difference between OLS and FE coefficients reveals information about\n")
cat("the unobserved effect f_i and its relationship with having kids.\n\n")

cat(sprintf(
  "In the OLS regression, has_kids has a %s coefficient (%s):\n",
  sign_label(ols_has_kids), signed(ols_has_kids)
))
cat(sprintf(
  "- People with children have %.3f %s standardized life satisfaction\n",
  abs(ols_has_kids), level_label(ols_has_kids)
))
cat("- This reflects both the TRUE causal effect AND potential selection bias\n\n")

cat(sprintf(
  "In the FE regression, has_kids has a %s coefficient (%s):\n",
  sign_label(fe_has_kids), signed(fe_has_kids)
))
cat(sprintf(
  "- Within individuals, having children is associated with %.3f %s\n",
  abs(fe_has_kids), level_label(fe_has_kids)
))
cat("  standardized life satisfaction\n")
cat("- This estimate controls for time-invariant individual characteristics\n")
cat("  (f_i)\n")
if (fe_p_value < 0.05) {
  cat(sprintf(
    "- It is statistically different from zero (p = %.3f)\n\n", fe_p_value
  ))
} else {
  cat(sprintf(
    "- It is NOT statistically different from zero (p = %.3f), so its sign\n",
    fe_p_value
  ))
  cat("  should not be over-interpreted\n\n")
}

cat("KEY INSIGHT:\n")
cat(sprintf(
  "The difference of %s points to %s SELECTION on unobservables:\n",
  signed(difference), sign_label(difference)
))
cat(sprintf(
  "- People who have children tend to have %s unobserved life satisfaction\n",
  toupper(level_label(difference))
))
cat(sprintf(
  "  (%s f_i) compared to those without children\n", level_label(difference)
))
cat(sprintf(
  "- This %s correlation between f_i and has_kids biases the OLS\n",
  tolower(sign_label(difference))
))
cat(sprintf(
  "  coefficient %s\n", if (difference < 0) "downward" else "upward"
))
cat("- Without controlling for f_i, OLS picks up both selection bias and\n")
cat("  the causal effect\n\n")

cat("Mathematically:\n")
cat("OLS coeff = FE estimate + Bias due to Cov(has_kids, f_i)\n")
cat(sprintf(
  "%s    ≈        %s         +        %s\n",
  signed(ols_has_kids), signed(fe_has_kids), signed(difference)
))



COMPARISON OF COEFFICIENTS


: [1m[33mError[39m in `"=" * 70`:[22m
[33m![39m non-numeric argument to binary operator