diff --git a/README.md b/README.md
index 365738ef..d478be52 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,51 @@ results = did.fit(
 )
 ```
 
+### Fixed Effects
+
+Use `fixed_effects` for low-dimensional categorical controls (creates dummy variables):
+
+```python
+# State and industry fixed effects
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    fixed_effects=['state', 'industry']
+)
+
+# Access fixed effect coefficients
+state_coefs = {k: v for k, v in results.coefficients.items() if k.startswith('state_')}
+```
+
+Use `absorb` for high-dimensional fixed effects (more efficient, uses within-transformation):
+
+```python
+# Absorb firm-level fixed effects (efficient for many firms)
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    absorb=['firm_id']
+)
+```
+
+Combine covariates with fixed effects:
+
+```python
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    covariates=['size', 'age'],      # Linear controls
+    fixed_effects=['industry'],      # Low-dimensional FE (dummies)
+    absorb=['firm_id']               # High-dimensional FE (absorbed)
+)
+```
+
 ### Cluster-Robust Standard Errors
 
 ```python
@@ -222,12 +267,25 @@ DifferenceInDifferences(
 
 | Method | Description |
 |--------|-------------|
-| `fit(data, outcome, treatment, time, formula, covariates)` | Fit the DiD model |
+| `fit(data, outcome, treatment, time, ...)` | Fit the DiD model |
 | `summary()` | Get formatted summary string |
 | `print_summary()` | Print summary to stdout |
 | `get_params()` | Get estimator parameters (sklearn-compatible) |
 | `set_params(**params)` | Set estimator parameters (sklearn-compatible) |
 
+**fit() Parameters:**
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `data` | DataFrame | Input data |
+| `outcome` | str | Outcome variable column name |
+| `treatment` | str | Treatment indicator column (0/1) |
+| `time` | str | Post-treatment indicator column (0/1) |
+| `formula` | str | R-style formula (alternative to column names) |
+| `covariates` | list | Linear control variables |
+| `fixed_effects` | list | Categorical FE columns (creates dummies) |
+| `absorb` | list | High-dimensional FE (within-transformation) |
+
 ### DiDResults
 
 **Attributes:**
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
index ea459d37..cf1099fa 100644
--- a/diff_diff/estimators.py
+++ b/diff_diff/estimators.py
@@ -105,7 +105,9 @@ def fit(
         treatment: str = None,
         time: str = None,
         formula: str = None,
-        covariates: list = None
+        covariates: list = None,
+        fixed_effects: list = None,
+        absorb: list = None
     ) -> DiDResults:
         """
         Fit the Difference-in-Differences model.
@@ -124,7 +126,17 @@ def fit(
             R-style formula (e.g., "outcome ~ treated * post").
             If provided, overrides outcome, treatment, and time parameters.
         covariates : list, optional
-            List of covariate column names to include in the regression.
+            List of covariate column names to include as linear controls.
+        fixed_effects : list, optional
+            List of categorical column names to include as fixed effects.
+            Creates dummy variables for each category (drops first level).
+            Use for low-dimensional fixed effects (e.g., industry, region).
+        absorb : list, optional
+            List of categorical column names for high-dimensional fixed effects.
+            Uses within-transformation (demeaning) instead of dummy variables.
+            More efficient for large numbers of categories (e.g., firm, individual).
+            Only the outcome and covariates are demeaned; the treatment and time
+            indicators and any ``fixed_effects`` dummies are left as-is.
 
         Returns
         -------
@@ -135,6 +147,18 @@ def fit(
         ------
         ValueError
             If required parameters are missing or data validation fails.
+
+        Examples
+        --------
+        Using fixed effects (dummy variables):
+
+        >>> did.fit(data, outcome='sales', treatment='treated', time='post',
+        ...         fixed_effects=['state', 'industry'])
+
+        Using absorbed fixed effects (within-transformation):
+
+        >>> did.fit(data, outcome='sales', treatment='treated', time='post',
+        ...         absorb=['firm_id'])
         """
         # Parse formula if provided
         if formula is not None:
@@ -147,10 +171,34 @@ def fit(
         # Validate inputs
         self._validate_data(data, outcome, treatment, time, covariates)
 
+        # Validate fixed effects and absorb columns
+        if fixed_effects:
+            for fe in fixed_effects:
+                if fe not in data.columns:
+                    raise ValueError(f"Fixed effect column '{fe}' not found in data")
+        if absorb:
+            for ab in absorb:
+                if ab not in data.columns:
+                    raise ValueError(f"Absorb column '{ab}' not found in data")
+
+        # Handle absorbed fixed effects (within-transformation).
+        # Copy only when columns are about to be overwritten.
+        working_data = data.copy() if absorb else data
+        n_absorbed_effects = 0
+
+        if absorb:
+            # Demean the outcome and covariates within each absorbed group
+            vars_to_demean = [outcome] + (covariates or [])
+            for ab_var in absorb:
+                n_absorbed_effects += working_data[ab_var].nunique() - 1
+                for var in vars_to_demean:
+                    group_means = working_data.groupby(ab_var)[var].transform("mean")
+                    working_data[var] = working_data[var] - group_means
+
         # Extract variables
-        y = data[outcome].values.astype(float)
-        d = data[treatment].values.astype(float)
-        t = data[time].values.astype(float)
+        y = working_data[outcome].values.astype(float)
+        d = working_data[treatment].values.astype(float)
+        t = working_data[time].values.astype(float)
 
         # Validate binary variables
         validate_binary(d, "treatment")
@@ -166,9 +214,18 @@ def fit(
         # Add covariates if provided
         if covariates:
             for cov in covariates:
-                X = np.column_stack([X, data[cov].values.astype(float)])
+                X = np.column_stack([X, working_data[cov].values.astype(float)])
                 var_names.append(cov)
 
+        # Add fixed effects as dummy variables
+        if fixed_effects:
+            for fe in fixed_effects:
+                # Create dummies, drop first category to avoid multicollinearity
+                dummies = pd.get_dummies(data[fe], prefix=fe, drop_first=True)
+                # Append the whole dummy block at once instead of re-copying X per column
+                X = np.column_stack([X, dummies.to_numpy(dtype=float)])
+                var_names.extend(dummies.columns)
+
         # Fit OLS
         coefficients, residuals, fitted, r_squared = self._fit_ols(X, y)
 
@@ -190,8 +247,8 @@ def fit(
         att = coefficients[att_idx]
         se = np.sqrt(vcov[att_idx, att_idx])
 
-        # Compute test statistics
-        df = len(y) - X.shape[1]
+        # Compute test statistics (adjust df for absorbed fixed effects)
+        df = len(y) - X.shape[1] - n_absorbed_effects
         t_stat = att / se
         p_value = compute_p_value(t_stat, df=df)
         conf_int = compute_confidence_interval(att, se, self.alpha, df=df)
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index e155b408..d3348823 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -276,3 +276,179 @@ def test_is_significant_property(self, simple_did_data):
         assert isinstance(results.is_significant, bool)
         # With true effect, should be significant
         assert results.is_significant
+
+
+class TestFixedEffects:
+    """Tests for fixed effects functionality."""
+
+    @pytest.fixture
+    def panel_data_with_fe(self):
+        """Create panel data with fixed effects."""
+        np.random.seed(42)
+        n_units = 50
+        n_periods = 4
+        n_states = 5
+
+        data = []
+        for unit in range(n_units):
+            state = unit % n_states
+            is_treated = unit < n_units // 2
+            # State-level effect
+            state_effect = state * 2.0
+
+            for period in range(n_periods):
+                post = 1 if period >= 2 else 0
+
+                y = 10.0 + state_effect + period * 0.5
+                if is_treated and post:
+                    y += 3.0  # True ATT
+
+                y += np.random.normal(0, 0.5)
+
+                data.append({
+                    "unit": unit,
+                    "state": f"state_{state}",
+                    "period": period,
+                    "treated": int(is_treated),
+                    "post": post,
+                    "outcome": y,
+                })
+
+        return pd.DataFrame(data)
+
+    def test_fixed_effects_dummy(self, panel_data_with_fe):
+        """Test fixed effects using dummy variables."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        assert results is not None
+        assert did.is_fitted_
+        # ATT should still be close to 3.0
+        assert abs(results.att - 3.0) < 1.0
+
+    def test_fixed_effects_coefficients_include_dummies(self, panel_data_with_fe):
+        """Test that dummy coefficients are included in results."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        # Should have state dummy coefficients
+        state_coefs = [k for k in results.coefficients.keys() if k.startswith("state_")]
+        assert len(state_coefs) == 4  # 5 states - 1 (dropped first)
+
+    def test_absorb_fixed_effects(self, panel_data_with_fe):
+        """Test absorbed (within-transformed) fixed effects."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            absorb=["unit"]
+        )
+
+        assert results is not None
+        assert did.is_fitted_
+        # ATT should still be close to 3.0
+        assert abs(results.att - 3.0) < 1.0
+
+    def test_fixed_effects_vs_no_fe(self, panel_data_with_fe):
+        """Test that FE produces different (usually better) estimates."""
+        did_no_fe = DifferenceInDifferences()
+        did_with_fe = DifferenceInDifferences()
+
+        results_no_fe = did_no_fe.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post"
+        )
+
+        results_with_fe = did_with_fe.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        # Both should estimate positive ATT
+        assert results_no_fe.att > 0
+        assert results_with_fe.att > 0
+
+        # FE model should have higher R-squared (explains more variance)
+        assert results_with_fe.r_squared >= results_no_fe.r_squared
+
+    def test_invalid_fixed_effects_column(self, panel_data_with_fe):
+        """Test error when fixed effects column doesn't exist."""
+        did = DifferenceInDifferences()
+        with pytest.raises(ValueError, match="not found"):
+            did.fit(
+                panel_data_with_fe,
+                outcome="outcome",
+                treatment="treated",
+                time="post",
+                fixed_effects=["nonexistent_column"]
+            )
+
+    def test_invalid_absorb_column(self, panel_data_with_fe):
+        """Test error when absorb column doesn't exist."""
+        did = DifferenceInDifferences()
+        with pytest.raises(ValueError, match="not found"):
+            did.fit(
+                panel_data_with_fe,
+                outcome="outcome",
+                treatment="treated",
+                time="post",
+                absorb=["nonexistent_column"]
+            )
+
+    def test_multiple_fixed_effects(self, panel_data_with_fe):
+        """Test multiple fixed effects."""
+        # Add another categorical variable
+        panel_data_with_fe["industry"] = panel_data_with_fe["unit"] % 3
+
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state", "industry"]
+        )
+
+        assert results is not None
+        # Should have both state and industry dummies
+        state_coefs = [k for k in results.coefficients.keys() if k.startswith("state_")]
+        industry_coefs = [k for k in results.coefficients.keys() if k.startswith("industry_")]
+        assert len(state_coefs) > 0
+        assert len(industry_coefs) > 0
+
+    def test_covariates_with_fixed_effects(self, panel_data_with_fe):
+        """Test combining covariates with fixed effects."""
+        # Add a continuous covariate
+        panel_data_with_fe["size"] = np.random.normal(100, 10, len(panel_data_with_fe))
+
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            covariates=["size"],
+            fixed_effects=["state"]
+        )
+
+        assert results is not None
+        assert "size" in results.coefficients