# Basics of Panel Data: FE vs FD (Gapminder)

*(Empirical-only notebook: implements all code chunks from the slide deck in both R and Python.)*


In [None]:
!pip -q install rpy2
%load_ext rpy2.ipython

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

# For panel fixed effects in Python
from linearmodels.panel import PanelOLS


In [None]:
%%R
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, lubridate, causaldata, fixest, plm, modelsummary, lmtest)


## Load data (Python)


In [None]:
import plotly.express as px

# Gapminder country-year panel shipped with plotly, plus the natural log of
# GDP per capita that serves as the regressor in the models below.
reg_dat_py = px.data.gapminder().assign(
    lgdp_pc=lambda d: np.log(d["gdpPercap"])
)

# Preview the first rows
reg_dat_py.head()


## Load data (R)


In [None]:
%%R
library(causaldata)

# Gapminder panel from causaldata with log GDP per capita added as `lgdp_pc`.
reg.dat <- mutate(causaldata::gapminder, lgdp_pc = log(gdpPercap))

# Preview the first rows
head(reg.dat)


## Fixed effects (Python: country FE via PanelOLS)


In [None]:
# linearmodels identifies the panel through a two-level index: (entity, time).
panel = reg_dat_py.set_index(["country", "year"])

# Outcome and regressors; a constant is included alongside the entity effects.
y = panel["lifeExp"]
X = sm.add_constant(panel[["lgdp_pc"]])

# Country fixed effects, standard errors clustered by entity (country).
fe_py = PanelOLS(y, X, entity_effects=True)
fe_res_py = fe_py.fit(cov_type="clustered", cluster_entity=True)

fe_res_py.summary


## Fixed effects (R: fixest)


In [None]:
%%R
library(fixest)
library(modelsummary)

# Life expectancy on log GDP per capita, with country fixed effects absorbed
# by fixest (everything after `|` is a fixed effect).
m1 <- feols(lifeExp ~ lgdp_pc | country, data = reg.dat)

# One column per model; coefficient and its statistic stacked in rows;
# goodness-of-fit rows suppressed.
modelsummary(
  list("Default FE" = m1),
  shape = term + statistic ~ model,
  gof_map = NA,
  coef_rename = c("lgdp_pc" = "Log GDP per Capita")
)


## Manual within estimator (Python: demean within country, no intercept)


In [None]:
# Within transformation: subtract each country's own mean from the outcome
# and from the regressor. `assign` returns a new frame, so reg_dat_py is untouched.
reg_dm = reg_dat_py.assign(
    lifeExp_dm=lambda d: d["lifeExp"] - d.groupby("country")["lifeExp"].transform("mean"),
    lgdp_pc_dm=lambda d: d["lgdp_pc"] - d.groupby("country")["lgdp_pc"].transform("mean"),
)

# OLS on the demeaned variables without an intercept (demeaned data have zero
# mean within every country), with standard errors clustered by country.
m2_py = smf.ols("lifeExp_dm ~ 0 + lgdp_pc_dm", data=reg_dm).fit(
    cov_type="cluster", cov_kwds={"groups": reg_dm["country"]}
)
m2_py.summary()


## Manual within estimator (R: demean within country, cluster by country)


In [None]:
%%R
library(lmtest)

# Within transformation: replace both variables by their deviation from the
# country-specific mean. The column names are kept, so the coefficient is still
# called `lgdp_pc` and lines up with the fixest model in the table.
reg.dat_dm <- reg.dat %>%
  group_by(country) %>%
  mutate(across(c(lgdp_pc, lifeExp), function(v) v - mean(v, na.rm = TRUE))) %>%
  ungroup()

# No intercept: the demeaned variables average to zero within each country.
m2 <- lm(lifeExp ~ 0 + lgdp_pc, data = reg.dat_dm)

# `vcov = ~country` asks modelsummary for country-clustered standard errors
# for every model in the list.
modelsummary(
  list("Default FE" = m1, "Manual FE" = m2),
  shape = term + statistic ~ model,
  gof_map = NA,
  coef_rename = c("lgdp_pc" = "Log GDP per Capita"),
  vcov = ~country
)


## First differences (Python: manual FD, no intercept)


In [None]:
# First differences within country: order by (country, year), difference each
# series against the previous observed row of the same country, and drop the
# rows that have no predecessor (the first observation of each country).
reg_fd = (
    reg_dat_py.sort_values(["country", "year"])
    .assign(
        lifeExp_fd=lambda d: d.groupby("country")["lifeExp"].diff(),
        lgdp_pc_fd=lambda d: d.groupby("country")["lgdp_pc"].diff(),
    )
    .dropna(subset=["lifeExp_fd", "lgdp_pc_fd"])
)

# Regression in differences without an intercept, clustered by country.
m_fd_py = smf.ols("lifeExp_fd ~ 0 + lgdp_pc_fd", data=reg_fd).fit(
    cov_type="cluster", cov_kwds={"groups": reg_fd["country"]}
)
m_fd_py.summary()


## First differences (R: plm FD)


In [None]:
%%R
library(plm)

# First-difference estimator from plm; `index` names the entity and time
# columns, and `0 +` removes the intercept from the differenced equation.
m3 <- plm(
  lifeExp ~ 0 + lgdp_pc,
  data = reg.dat,
  index = c("country", "year"),
  model = "fd"
)

# No `vcov` argument is passed here, so each model is shown with whatever
# standard errors it reports by default.
modelsummary(
  list("Default FE" = m1, "Manual FE" = m2, "Default FD" = m3),
  shape = term + statistic ~ model,
  gof_map = NA,
  coef_rename = c("lgdp_pc" = "Log GDP per Capita")
)


## Manual first differences (R: explicit lag)


In [None]:
%%R
# Hand-built first differences: sort, then subtract the previous row of the
# same country. `lgdp_pc` is overwritten with its difference so the coefficient
# keeps the name used by the other models; the outcome gets a new column.
reg.dat_fd <- reg.dat %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    fd_lifeexp = lifeExp - dplyr::lag(lifeExp),
    lgdp_pc = lgdp_pc - dplyr::lag(lgdp_pc)
  ) %>%
  na.omit() %>%  # removes every row containing an NA, incl. each country's first row
  ungroup()

# Differenced regression without an intercept.
m4 <- lm(fd_lifeexp ~ 0 + lgdp_pc, data = reg.dat_fd)

modelsummary(
  list(
    "Default FE" = m1,
    "Manual FE" = m2,
    "Default FD" = m3,
    "Manual FD" = m4
  ),
  shape = term + statistic ~ model,
  gof_map = NA,
  coef_rename = c("lgdp_pc" = "Log GDP per Capita")
)


## FE and FD on the same time period (Python)


In [None]:
# Estimate FE and FD on one common set of country-years.
#
# The differences are taken on the FULL panel and the sample is trimmed only
# afterwards. Differencing a panel whose first year was already removed would
# cost a second year per country (the new first row has no predecessor), so FD
# would end up with one period fewer than FE and the samples would not match.
reg_dat2_py = reg_dat_py.sort_values(["country", "year"]).copy()
reg_dat2_py["lifeExp_fd"] = reg_dat2_py.groupby("country")["lifeExp"].diff()
reg_dat2_py["lgdp_pc_fd"] = reg_dat2_py.groupby("country")["lgdp_pc"].diff()

# A row belongs to the common sample when both differences exist, i.e. the
# country has a previous observation for it (drops each country's first year).
reg_dat2_py["has_lag"] = reg_dat2_py[["lifeExp_fd", "lgdp_pc_fd"]].notna().all(axis=1)
reg_dat2_py = reg_dat2_py.loc[reg_dat2_py["has_lag"]].copy()

# FE in levels on exactly those rows; PanelOLS needs an (entity, time) index.
panel2 = reg_dat2_py.set_index(["country", "year"])
y2 = panel2["lifeExp"]
X2 = sm.add_constant(panel2[["lgdp_pc"]])

fe_py_same = PanelOLS(y2, X2, entity_effects=True).fit(cov_type="clustered", cluster_entity=True)

# FD on the very same rows, using the differences computed before trimming.
reg_fd2 = reg_dat2_py.copy()

fd_py_same = smf.ols("lifeExp_fd ~ 0 + lgdp_pc_fd", data=reg_fd2).fit(
    cov_type="cluster",
    cov_kwds={"groups": reg_fd2["country"]}
)

print("Python FE (same sample):")
print(fe_py_same.summary.tables[1])
print("\nPython FD (same sample):")
print(fd_py_same.summary().tables[1])


## FE and FD on the same time period (R)


In [None]:
%%R
# Restrict the level data to the country-years that survive in the manual
# first-difference sample, so FE and FD are estimated on the same rows.
reg.dat2 <- inner_join(
  reg.dat,
  select(reg.dat_fd, country, year),
  by = c("country", "year")
)

# Country fixed effects on the restricted sample.
m5 <- feols(lifeExp ~ lgdp_pc | country, data = reg.dat2)

modelsummary(
  list(
    "Default FE (same sample)" = m5,
    "Default FD" = m3,
    "Manual FD" = m4
  ),
  shape = term + statistic ~ model,
  gof_map = NA,
  coef_rename = c("lgdp_pc" = "Log GDP per Capita")
)
