Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,51 @@ results = did.fit(
)
```

### Fixed Effects

Use `fixed_effects` for low-dimensional categorical controls (creates dummy variables):

```python
# State and industry fixed effects
results = did.fit(
data,
outcome='sales',
treatment='treated',
time='post',
fixed_effects=['state', 'industry']
)

# Access fixed effect coefficients
state_coefs = {k: v for k, v in results.coefficients.items() if k.startswith('state_')}
```

Use `absorb` for high-dimensional fixed effects (more efficient, uses within-transformation):

```python
# Absorb firm-level fixed effects (efficient for many firms)
results = did.fit(
data,
outcome='sales',
treatment='treated',
time='post',
absorb=['firm_id']
)
```

Combine covariates with fixed effects:

```python
results = did.fit(
data,
outcome='sales',
treatment='treated',
time='post',
covariates=['size', 'age'], # Linear controls
fixed_effects=['industry'], # Low-dimensional FE (dummies)
absorb=['firm_id'] # High-dimensional FE (absorbed)
)
```

### Cluster-Robust Standard Errors

```python
Expand Down Expand Up @@ -222,12 +267,25 @@ DifferenceInDifferences(

| Method | Description |
|--------|-------------|
| `fit(data, outcome, treatment, time, formula, covariates)` | Fit the DiD model |
| `fit(data, outcome, treatment, time, ...)` | Fit the DiD model |
| `summary()` | Get formatted summary string |
| `print_summary()` | Print summary to stdout |
| `get_params()` | Get estimator parameters (sklearn-compatible) |
| `set_params(**params)` | Set estimator parameters (sklearn-compatible) |

**fit() Parameters:**

| Parameter | Type | Description |
|-----------|------|-------------|
| `data` | DataFrame | Input data |
| `outcome` | str | Outcome variable column name |
| `treatment` | str | Treatment indicator column (0/1) |
| `time` | str | Post-treatment indicator column (0/1) |
| `formula` | str | R-style formula, e.g. `outcome ~ treated * post`; if provided, overrides `outcome`, `treatment`, and `time` |
| `covariates` | list | Linear control variables |
| `fixed_effects` | list | Categorical FE columns (creates dummies) |
| `absorb` | list | Categorical columns for high-dimensional FE (absorbed via within-transformation, i.e. group demeaning, instead of dummies) |

### DiDResults

**Attributes:**
Expand Down
72 changes: 64 additions & 8 deletions diff_diff/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ def fit(
treatment: str = None,
time: str = None,
formula: str = None,
covariates: list = None
covariates: list = None,
fixed_effects: list = None,
absorb: list = None
) -> DiDResults:
"""
Fit the Difference-in-Differences model.
Expand All @@ -124,7 +126,15 @@ def fit(
R-style formula (e.g., "outcome ~ treated * post").
If provided, overrides outcome, treatment, and time parameters.
covariates : list, optional
List of covariate column names to include in the regression.
List of covariate column names to include as linear controls.
fixed_effects : list, optional
List of categorical column names to include as fixed effects.
Creates dummy variables for each category (drops first level).
Use for low-dimensional fixed effects (e.g., industry, region).
absorb : list, optional
List of categorical column names for high-dimensional fixed effects.
Uses within-transformation (demeaning) instead of dummy variables.
More efficient for large numbers of categories (e.g., firm, individual).

Returns
-------
Expand All @@ -135,6 +145,18 @@ def fit(
------
ValueError
If required parameters are missing or data validation fails.

Examples
--------
Using fixed effects (dummy variables):

>>> did.fit(data, outcome='sales', treatment='treated', time='post',
... fixed_effects=['state', 'industry'])

Using absorbed fixed effects (within-transformation):

>>> did.fit(data, outcome='sales', treatment='treated', time='post',
... absorb=['firm_id'])
"""
# Parse formula if provided
if formula is not None:
Expand All @@ -147,10 +169,35 @@ def fit(
# Validate inputs
self._validate_data(data, outcome, treatment, time, covariates)

# Validate fixed effects and absorb columns
if fixed_effects:
for fe in fixed_effects:
if fe not in data.columns:
raise ValueError(f"Fixed effect column '{fe}' not found in data")
if absorb:
for ab in absorb:
if ab not in data.columns:
raise ValueError(f"Absorb column '{ab}' not found in data")

# Handle absorbed fixed effects (within-transformation)
working_data = data.copy()
absorbed_vars = []
n_absorbed_effects = 0

if absorb:
# Apply within-transformation for each absorbed variable
vars_to_demean = [outcome] + (covariates or [])
for ab_var in absorb:
n_absorbed_effects += working_data[ab_var].nunique() - 1
for var in vars_to_demean:
group_means = working_data.groupby(ab_var)[var].transform("mean")
working_data[var] = working_data[var] - group_means
absorbed_vars.append(ab_var)

# Extract variables
y = data[outcome].values.astype(float)
d = data[treatment].values.astype(float)
t = data[time].values.astype(float)
y = working_data[outcome].values.astype(float)
d = working_data[treatment].values.astype(float)
t = working_data[time].values.astype(float)

# Validate binary variables
validate_binary(d, "treatment")
Expand All @@ -166,9 +213,18 @@ def fit(
# Add covariates if provided
if covariates:
for cov in covariates:
X = np.column_stack([X, data[cov].values.astype(float)])
X = np.column_stack([X, working_data[cov].values.astype(float)])
var_names.append(cov)

# Add fixed effects as dummy variables
if fixed_effects:
for fe in fixed_effects:
# Create dummies, drop first category to avoid multicollinearity
dummies = pd.get_dummies(data[fe], prefix=fe, drop_first=True)
for col in dummies.columns:
X = np.column_stack([X, dummies[col].values.astype(float)])
var_names.append(col)

# Fit OLS
coefficients, residuals, fitted, r_squared = self._fit_ols(X, y)

Expand All @@ -190,8 +246,8 @@ def fit(
att = coefficients[att_idx]
se = np.sqrt(vcov[att_idx, att_idx])

# Compute test statistics
df = len(y) - X.shape[1]
# Compute test statistics (adjust df for absorbed fixed effects)
df = len(y) - X.shape[1] - n_absorbed_effects
t_stat = att / se
p_value = compute_p_value(t_stat, df=df)
conf_int = compute_confidence_interval(att, se, self.alpha, df=df)
Expand Down
176 changes: 176 additions & 0 deletions tests/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,179 @@ def test_is_significant_property(self, simple_did_data):
assert isinstance(results.is_significant, bool)
# With true effect, should be significant
assert results.is_significant


class TestFixedEffects:
    """Exercise the ``fixed_effects`` and ``absorb`` options of ``fit``."""

    @staticmethod
    def _fit(frame, **options):
        """Fit a fresh estimator on ``frame`` using the column names shared by these tests.

        Returns the estimator together with the results object so callers
        can inspect both.
        """
        estimator = DifferenceInDifferences()
        fitted = estimator.fit(
            frame,
            outcome="outcome",
            treatment="treated",
            time="post",
            **options,
        )
        return estimator, fitted

    @pytest.fixture
    def panel_data_with_fe(self):
        """Build a synthetic panel: 50 units, 4 periods, 5 states.

        The first half of the units is treated, periods 2 and 3 are
        post-treatment, and treated post-period outcomes are raised by 3.0.
        """
        np.random.seed(42)
        n_units, n_periods, n_states = 50, 4, 5

        rows = []
        for unit in range(n_units):
            state = unit % n_states
            is_treated = unit < n_units // 2
            # Outcome shift shared by every unit in the same state
            state_effect = state * 2.0

            for period in range(n_periods):
                post = 1 if period >= 2 else 0

                baseline = 10.0 + state_effect + period * 0.5
                # True ATT applies only to treated units after treatment
                att = 3.0 if is_treated and post else 0.0
                noise = np.random.normal(0, 0.5)

                rows.append(
                    dict(
                        unit=unit,
                        state=f"state_{state}",
                        period=period,
                        treated=int(is_treated),
                        post=post,
                        outcome=baseline + att + noise,
                    )
                )

        return pd.DataFrame(rows)

    def test_fixed_effects_dummy(self, panel_data_with_fe):
        """Dummy-variable fixed effects still recover an ATT near 3.0."""
        estimator, fitted = self._fit(panel_data_with_fe, fixed_effects=["state"])

        assert fitted is not None
        assert estimator.is_fitted_
        # Estimate should stay within 1.0 of the simulated effect
        assert abs(fitted.att - 3.0) < 1.0

    def test_fixed_effects_coefficients_include_dummies(self, panel_data_with_fe):
        """Coefficients of the state dummies are exposed on the results."""
        _, fitted = self._fit(panel_data_with_fe, fixed_effects=["state"])

        state_terms = [
            name for name in fitted.coefficients.keys() if name.startswith("state_")
        ]
        # 5 states minus the dropped first level
        assert len(state_terms) == 4

    def test_absorb_fixed_effects(self, panel_data_with_fe):
        """Absorbing unit effects still recovers an ATT near 3.0."""
        estimator, fitted = self._fit(panel_data_with_fe, absorb=["unit"])

        assert fitted is not None
        assert estimator.is_fitted_
        # Estimate should stay within 1.0 of the simulated effect
        assert abs(fitted.att - 3.0) < 1.0

    def test_fixed_effects_vs_no_fe(self, panel_data_with_fe):
        """Adding state fixed effects keeps the ATT positive and does not lower R-squared."""
        _, plain = self._fit(panel_data_with_fe)
        _, with_state_fe = self._fit(panel_data_with_fe, fixed_effects=["state"])

        # Both specifications should find a positive effect
        assert plain.att > 0
        assert with_state_fe.att > 0

        # The richer model should explain at least as much variance
        assert with_state_fe.r_squared >= plain.r_squared

    def test_invalid_fixed_effects_column(self, panel_data_with_fe):
        """An unknown ``fixed_effects`` column raises ``ValueError``."""
        estimator = DifferenceInDifferences()
        with pytest.raises(ValueError, match="not found"):
            estimator.fit(
                panel_data_with_fe,
                outcome="outcome",
                treatment="treated",
                time="post",
                fixed_effects=["nonexistent_column"],
            )

    def test_invalid_absorb_column(self, panel_data_with_fe):
        """An unknown ``absorb`` column raises ``ValueError``."""
        estimator = DifferenceInDifferences()
        with pytest.raises(ValueError, match="not found"):
            estimator.fit(
                panel_data_with_fe,
                outcome="outcome",
                treatment="treated",
                time="post",
                absorb=["nonexistent_column"],
            )

    def test_multiple_fixed_effects(self, panel_data_with_fe):
        """Two fixed-effect columns each contribute dummy coefficients."""
        # Second categorical variable derived from the unit id
        panel_data_with_fe["industry"] = panel_data_with_fe["unit"] % 3

        _, fitted = self._fit(
            panel_data_with_fe, fixed_effects=["state", "industry"]
        )

        assert fitted is not None
        names = fitted.coefficients.keys()
        state_terms = [name for name in names if name.startswith("state_")]
        industry_terms = [name for name in names if name.startswith("industry_")]
        # Each fixed effect must yield at least one dummy
        assert state_terms
        assert industry_terms

    def test_covariates_with_fixed_effects(self, panel_data_with_fe):
        """A linear covariate can be combined with dummy fixed effects."""
        # Continuous control drawn independently of the outcome
        panel_data_with_fe["size"] = np.random.normal(
            100, 10, len(panel_data_with_fe)
        )

        _, fitted = self._fit(
            panel_data_with_fe, covariates=["size"], fixed_effects=["state"]
        )

        assert fitted is not None
        assert "size" in fitted.coefficients