# Gregg (2020) Replication

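This notebook replicates the main results from Gregg (2020) on factory productivity and incorporation in late Imperial Russia. It loads `AG_Corp_Prod_Database.dta`, checks the pooled OLS coefficient on `Form` for Table 3, constructs TFP as the residual of a Cobb-Douglas regression, and then runs the Table 5 regressions.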

In [36]:
# === IMPORTS ===
# Libraries used by the analysis cells, imported together in one place.
import pandas as pd  # data loading (read_stata) and cleaning (dropna, to_numeric)
import statsmodels.formula.api as smf  # formula-interface OLS (smf.ols)
import statsmodels.api as sm  # array-interface OLS (sm.OLS) and sm.add_constant

# Reaching this line means none of the imports above raised.
print("All packages imported successfully!")

All packages imported successfully!


In [37]:
# === LOAD DATA ===
# The Stata file is read into `df`, the frame the later cells work from.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_FILE = "AG_Corp_Prod_Database.dta"
df = pd.read_stata(DATA_FILE)

# Report the frame's dimensions and the full list of column names.
column_names = list(df.columns)
print(f"Data loaded successfully! Shape: {df.shape}")
print(f"Columns: {column_names}")

Data loaded successfully! Shape: (40288, 65)
Columns: ['id', 'Form', 'PSZ', 'PSZ1900', 'FoundingYear', 'Province', 'Region', 'Industry', 'OntheSide', 'Age', 'TaxedActivity', 'YEAR', 'PSZLastYear', 'PSZ1908', 'SubindustryCode', 'STCAP', 'Revenue', 'TotalWorkers', 'TotalPower', 'GrandTotalWorkers', 'RevperWorker', 'PowerperWorker', 'RevperGrandWorker', 'PowerperGrandWorker', 'logRevperWorker', 'logPowerperWorker', 'logRevperGrandWorker', 'logPowerperGrandWorker', 'logRev', 'logWorkers', 'logPower', 'RegIndGroup', 'RegIndYearGroup', 'ProvIndGroup', 'ProvIndYearGroup', 'IndYearGroup', 'IndustryFactor', 'ProvinceFactor', 'YearFactor', 'AKTS', 'PAI', 'factory_id', 'FormNextYear', 'FormNextNextYear', 'FactoryisCorpin1894', 'FormNextYearin1894', 'FactoryisCorpin1900', 'FormNextYearin1900', 'FactoryisCorpin1908', 'NEWDEV', 'SHARES', 'STPRICE', 'BONDS', 'Silk', 'Flax', 'Animal', 'Wool', 'Cotton', 'MixedMaterials', 'Wood', 'Paper', 'MetalsandMachines', 'Foods', 'Chemical', 'Mineral']


In [38]:
# === TABLE 3 CHECK: Pooled OLS ===
# Regress `logRevperWorker` on `Form` with Industry, Province and YEAR entered
# as categorical dummies, and report HC1 heteroskedasticity-robust errors.
table3_columns = ["logRevperWorker", "Form", "Industry", "Province", "YEAR"]
table3_formula = "logRevperWorker ~ Form + C(Industry) + C(Province) + C(YEAR)"

# Keep only rows in which every variable used by the regression is present.
df_check = df.dropna(subset=table3_columns)

pooled_ols = smf.ols(formula=table3_formula, data=df_check)
model_check = pooled_ols.fit(cov_type="HC1")

# Pull out the estimate of interest before printing.
form_coef = model_check.params["Form"]
form_se = model_check.bse["Form"]

print("\n=== Table 3 Coefficient on Form (pooled OLS) ===")
print(f"Coefficient: {form_coef:.4f}")
print(f"Std Error: {form_se:.4f}")


=== Table 3 Coefficient on Form (pooled OLS) ===
Coefficient: 0.4900
Std Error: 0.0195


In [39]:
# === STEP 1: Construct TFP as residuals (Stata-style: no fixed effects) ===
#
# TFP is the residual from an OLS regression of logRev on logWorkers and
# logPower plus a constant, with no fixed effects. The residuals are then
# attached to `df` as a new column named "TFP".

tfp_inputs = ["logRev", "logWorkers", "logPower"]

# Filter for valid observations (all regression inputs and `id` present).
# A TFP column left behind by an earlier run of this cell is dropped first, so
# re-running the cell starts from the same columns as the first run.
df_tfp = (
    df.drop(columns="TFP", errors="ignore")
    .dropna(subset=tfp_inputs + ["id"])
    .copy()
)

# Ensure numeric data types; values that cannot be parsed become NaN.
for column in tfp_inputs:
    df_tfp[column] = pd.to_numeric(df_tfp[column], errors="coerce")

# Drop any new NaNs introduced by the coercion above.
df_tfp = df_tfp.dropna(subset=tfp_inputs)

print(f"TFP calculation dataset shape: {df_tfp.shape}")

# Run Cobb-Douglas regression: logRev ~ logWorkers + logPower (NO FIXED EFFECTS)
X = sm.add_constant(df_tfp[["logWorkers", "logPower"]]).astype(float)
y = df_tfp["logRev"].astype(float)

model = sm.OLS(y, X).fit()
df_tfp["TFP"] = model.resid

# Attach TFP to the main dataframe by row label rather than by merging on `id`.
# A merge would duplicate rows if `id` were ever repeated, and on a second run
# it would produce TFP_x / TFP_y instead of TFP. Index-aligned assignment gives
# each row of `df` its own residual, leaves rows outside the regression sample
# as NaN, and simply overwrites TFP when the cell is run again.
assert df.index.is_unique, "df needs a unique index to align TFP row by row"
df["TFP"] = df_tfp["TFP"]

print("TFP calculated successfully (Stata-style, no fixed effects)!")



TFP calculation dataset shape: (15435, 65)
TFP calculated successfully (Stata-style, no fixed effects)!


In [40]:
# === STEP 2: FIXED EFFECTS REGRESSIONS (Table 5) ===
# Each outcome is regressed on Form plus YEAR dummies, with standard errors
# clustered on factory_id.
# NOTE(review): the formula contains no factory-level term; factory_id is used
# only as the clustering variable, although the heading says "fixed effects".
# Confirm that this is the intended Table 5 specification.
outcomes = ["logRevperWorker", "logPowerperWorker", "TFP"]
required_columns = outcomes + ["Form", "YEAR", "factory_id"]

# Fail early with a readable message. The dropna below would raise a KeyError
# for a missing column anyway, so a per-outcome "column not found" branch
# further down could never be reached.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Missing columns in df: {missing_columns} "
        "(TFP is created by the STEP 1 cell)"
    )

df_fe = df.dropna(subset=required_columns).copy()

# Convert outcome variables to numeric; unparseable values become NaN.
for outcome in outcomes:
    df_fe[outcome] = pd.to_numeric(df_fe[outcome], errors="coerce")

# Drop rows with NaN in outcome variables
df_fe = df_fe.dropna(subset=outcomes)

print(f"Fixed effects dataset shape: {df_fe.shape}")
print("\n=== Table 5 Fixed Effects Results ===")

for outcome in outcomes:
    # `model` is rebound on every pass, so after the loop it holds the fit for
    # the last outcome and no longer the Step 1 regression.
    model = smf.ols(
        formula=f"{outcome} ~ Form + C(YEAR)",
        data=df_fe,
    ).fit(cov_type="cluster", cov_kwds={"groups": df_fe["factory_id"]})

    # Statistics for the Form coefficient only.
    coef = model.params["Form"]
    se = model.bse["Form"]
    pval = model.pvalues["Form"]
    ci = model.conf_int().loc["Form"].tolist()  # [lower, upper]; printed as the 95% CI

    print(f"\nOutcome: {outcome}")
    print(f"  Coefficient: {coef:.3f}")
    print(f"  Std. Error:  {se:.3f}")
    print(f"  P-value:     {pval:.4f}")
    print(f"  95% CI:      [{ci[0]:.3f}, {ci[1]:.3f}]")

Fixed effects dataset shape: (15435, 66)

=== Table 5 Fixed Effects Results ===

Outcome: logRevperWorker
  Coefficient: 0.322
  Std. Error:  0.032
  P-value:     0.0000
  95% CI:      [0.260, 0.385]

Outcome: logPowerperWorker
  Coefficient: -0.010
  Std. Error:  0.036
  P-value:     0.7816
  95% CI:      [-0.081, 0.061]

Outcome: TFP
  Coefficient: 0.056
  Std. Error:  0.030
  P-value:     0.0652
  95% CI:      [-0.004, 0.116]
