# Table 1 Replication - Leverage Analysis (1965-2003)

This notebook replicates **Table 1** from the paper, showing descriptive statistics for **All Firms** and **Survivors** using Compustat data.

---


In [72]:
# STEP 0 — Environment Setup
# Install required packages: wrds, pandas, numpy, scipy, jupyter, ipykernel
%pip install -r ../requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [73]:
# Import required libraries
import wrds
import pandas as pd
import numpy as np
from scipy import stats
import os

# Display settings for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 3)
pd.set_option('display.width', 120)


## STEP 1 — Load Data


In [74]:
# Load data: CSV if available, otherwise WRDS
DATA_FILE = 'data/01_raw_data.csv'
db = None

if os.path.exists(DATA_FILE):
    df = pd.read_csv(DATA_FILE)
    print(f"Loaded {len(df):,} obs from CSV")
else:
    try:
        db = wrds.Connection()
    except Exception as e:
        raise SystemExit(f"No CSV & WRDS failed: {e}\nDownload from wrds-www.wharton.upenn.edu → {DATA_FILE}")


Loaded 296,633 obs from CSV


In [75]:
# Sanity check (only if connected to WRDS)
if db:
    libraries = db.list_libraries()
    print(f"Available libraries: {len(libraries)}, 'comp' available: {'comp' in libraries}")
else:
    print("Skipped (using local CSV)")


Skipped (using local CSV)


## STEP 2 — Pull Compustat Data

Pull annual Compustat data matching Section I of the paper:
- **Nonfinancial firms** (SIC codes outside 6000-6999)
- 1965–2003
- Consolidated, domestic, INDL format


In [76]:
# Skip if already loaded from CSV above
if 'df' not in dir() or df is None:
    if db:
        sql = """
        SELECT gvkey, fyear, sic, at, dlc, dltt, sale, oibdp, ppent, prcc_f, csho, pstkl, txditc, intan, dvc
        FROM comp.funda
        WHERE indfmt = 'INDL' AND datafmt = 'STD' AND popsrc = 'D' AND consol = 'C'
          AND fyear BETWEEN 1965 AND 2003 AND sic IS NOT NULL AND (sic < 6000 OR sic > 6999)
        """
        df = db.raw_sql(sql)
        print(f"Downloaded {len(df):,} obs from WRDS")
else:
    print(f"Using pre-loaded data: {len(df):,} obs")


Using pre-loaded data: 296,633 obs


In [77]:
# Additional Python-side filter for financials (ensures exclusion even when loading from CSV)
df = df[df['sic'].notna()]
df = df[(df['sic'] < 6000) | (df['sic'] > 6999)]
print(f"After excluding financials (SIC 6000-6999): {len(df):,} observations")
print(f"Unique firms: {df['gvkey'].nunique():,}")


After excluding financials (SIC 6000-6999): 241,927 observations
Unique firms: 20,170


In [78]:
# Save raw downloaded data
os.makedirs('data', exist_ok=True)
df.to_csv('data/01_raw_data.csv', index=False)
print(f"✓ Saved raw data: {len(df):,} observations to data/01_raw_data.csv")


✓ Saved raw data: 241,927 observations to data/01_raw_data.csv


In [79]:
# Preview the raw data
df.head()


Unnamed: 0,costat,curcd,datafmt,indfmt,consol,gvkey,fyear,sic,at,dlc,dltt,intan,ppent,pstkl,txditc,dvc,oibdp,sale,csho,prcc_f
0,I,USD,STD,INDL,C,1000,1965,3089,2.31,0.3,1.154,0.0,1.397,0.0,0.0,0.0,-0.16,1.688,0.206,
1,I,USD,STD,INDL,C,1002,1965,3825,,,0.8,0.0,,0.0,0.0,0.0,,11.7,0.803,
2,A,USD,STD,INDL,C,1004,1965,5080,2.519,0.347,0.153,0.0,0.41,0.0,0.015,0.0,0.706,3.821,0.42,
3,I,USD,STD,INDL,C,1010,1965,3743,328.7,0.0,93.526,0.0,188.9,0.0,23.176,10.698,60.78,323.2,5.921,47.75
4,I,USD,STD,INDL,C,1040,1965,3949,451.9,6.5,186.0,8.838,124.8,5.745,19.957,15.44,62.38,385.8,17.15,19.625


## STEP 3 — Basic Cleaning

Apply paper's cleaning rules:
1. Require non-missing, positive assets
2. Fill missing debt components with zero
3. Calculate total debt


In [80]:
print(f"Before cleaning: {len(df):,} observations")

# Require non-missing, positive assets
df = df[df['at'].notna() & (df['at'] > 0)]
print(f"After asset filter: {len(df):,} observations")

# Replace missing debt components with 0
df['dlc'] = df['dlc'].fillna(0)
df['dltt'] = df['dltt'].fillna(0)

# Total debt
df['debt'] = df['dlc'] + df['dltt']

print(f"Debt computed for all {len(df):,} observations")


Before cleaning: 241,927 observations
After asset filter: 225,614 observations
Debt computed for all 225,614 observations


In [81]:
# Save cleaned data
df.to_csv('data/02_cleaned_data.csv', index=False)
print(f"✓ Saved cleaned data: {len(df):,} observations to data/02_cleaned_data.csv")


✓ Saved cleaned data: 225,614 observations to data/02_cleaned_data.csv


## STEP 4 — Construct Leverage Measures

Calculate book and market leverage as defined in the Appendix:
- **Book leverage** = Total Debt / Total Assets
- **Market leverage** = Total Debt / (Total Debt + Market Equity)


In [82]:
# Book leverage
df['book_lev'] = df['debt'] / df['at']

# Market equity
df['me'] = df['prcc_f'] * df['csho']

# Market leverage
df['market_lev'] = df['debt'] / (df['debt'] + df['me'])

print(f"Before leverage filter: {len(df):,} observations")

# Keep leverage in [0,1]
df = df[
    (df['book_lev'].between(0, 1)) &
    (df['market_lev'].between(0, 1))
]

print(f"After leverage filter [0,1]: {len(df):,} observations")


Before leverage filter: 225,614 observations
After leverage filter [0,1]: 176,817 observations


In [83]:
# Save leverage data
df.to_csv('data/03_leverage_data.csv', index=False)
print(f"✓ Saved leverage data: {len(df):,} observations to data/03_leverage_data.csv")


✓ Saved leverage data: 176,817 observations to data/03_leverage_data.csv


## STEP 5 — Construct Table 1 Variables

Create all variables that appear in Table 1:
- Log sales (firm size) — set to NaN for non-positive sales
- Market-to-book ratio
- Profitability
- Tangibility
- Intangibles
- Dividend payer dummy


In [84]:
# Log sales (proxy for firm size)
# Set to NaN for non-positive sales (don't clip to 1, that creates fake values)
df['log_sales'] = np.where(df['sale'] > 0, np.log(df['sale']), np.nan)

# Market-to-book ratio
df['mtb'] = (
    df['me']
    + df['debt']
    + df['pstkl'].fillna(0)
    - df['txditc'].fillna(0)
) / df['at']

# Profitability (EBITDA / Assets)
df['profitability'] = df['oibdp'] / df['at']

# Tangibility (PPE / Assets)
df['tangibility'] = df['ppent'] / df['at']

# Intangibles (Intangible Assets / Assets)
df['intangibles'] = df['intan'] / df['at']

# Dividend payer (binary: 1 if pays dividend, 0 otherwise)
df['div_payer'] = (df['dvc'].fillna(0) > 0).astype(int)

print("All Table 1 variables constructed")
print(f"log_sales non-missing: {df['log_sales'].notna().sum():,} ({100*df['log_sales'].notna().mean():.1f}%)")


All Table 1 variables constructed
log_sales non-missing: 173,745 (98.3%)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [85]:
# Save variables data
df.to_csv('data/04_variables_data.csv', index=False)
print(f"✓ Saved variables data: {len(df):,} observations to data/04_variables_data.csv")


✓ Saved variables data: 176,817 observations to data/04_variables_data.csv


## STEP 6 — Cash-Flow Volatility

Calculate rolling 3-year standard deviation of **scaled** operating income (oibdp/at).

Using raw oibdp leads to scale issues (large firms have large volatility in dollar terms). Scaling by assets makes this a profitability-type measure.


In [86]:
# Sort by firm and year
df = df.sort_values(['gvkey', 'fyear'])

# Scaled cash flow (oibdp / assets)
df['cf_base'] = df['oibdp'] / df['at']

# Rolling 3-year standard deviation of SCALED operating income
df['cf_vol'] = (
    df.groupby('gvkey')['cf_base']
      .rolling(window=3, min_periods=3)
      .std()
      .reset_index(level=0, drop=True)
)

print(f"Cash-flow volatility computed for {df['cf_vol'].notna().sum():,} observations")
print(f"cf_vol summary: mean={df['cf_vol'].mean():.4f}, median={df['cf_vol'].median():.4f}, std={df['cf_vol'].std():.4f}")


Cash-flow volatility computed for 141,888 observations
cf_vol summary: mean=0.1510, median=0.0322, std=10.5683


## STEP 7 — Industry Median Book Leverage

Calculate industry median leverage using 2-digit SIC codes.

**Note:** This uses SIC-2 classification. The paper may use Fama-French 38 industry classification, which could lead to minor differences in `ind_med_lev`.


In [87]:
# Create 2-digit SIC code
df['sic2'] = df['sic'] // 100

# Industry median leverage (by SIC-2 and year)
df['ind_med_lev'] = (
    df.groupby(['sic2', 'fyear'])['book_lev']
      .transform('median')
)

print(f"Industry median leverage computed")
print(f"Number of unique industries (SIC-2): {df['sic2'].nunique()}")


Industry median leverage computed
Number of unique industries (SIC-2): 67


In [88]:
# Save final data (no trimming applied to rows - trimming done in summary stats)
df.to_csv('data/05_final_data.csv', index=False)
print(f"✓ Saved final data: {len(df):,} observations to data/05_final_data.csv")


✓ Saved final data: 176,817 observations to data/05_final_data.csv


## STEP 8 — Define Survivors

**Survivors** are defined as firms with ≥20 years of book leverage data in the sample.


In [89]:
# Count years of book leverage data per firm
lev_counts = df.groupby('gvkey')['book_lev'].count()

# Survivors: firms with ≥20 years
survivors = lev_counts[lev_counts >= 20].index

total_firms = df['gvkey'].nunique()
print(f"Total unique firms: {total_firms:,}")
print(f"Survivors (≥20 years): {len(survivors):,}")
if total_firms > 0:
    print(f"Survivor rate: {100 * len(survivors) / total_firms:.1f}%")
else:
    print("Survivor rate: N/A (no firms in dataset)")

# Create two datasets
df_all = df.copy()
df_surv = df[df['gvkey'].isin(survivors)].copy()

print(f"\nAll Firms dataset: {len(df_all):,} observations")
print(f"Survivors dataset: {len(df_surv):,} observations")


Total unique firms: 17,304
Survivors (≥20 years): 2,471
Survivor rate: 14.3%

All Firms dataset: 176,817 observations
Survivors dataset: 71,453 observations


## STEP 9 — Replicate Table 1 Statistics

Generate descriptive statistics (Mean, Median, SD) for both **All Firms** and **Survivors**.

**Trimming approach:** Rather than dropping rows from the dataset, we compute trimmed statistics (1st/99th percentile) per variable within the summary function. This preserves sample size while handling outliers.


In [90]:
# Variables to include in Table 1
vars_table1 = [
    'book_lev', 'market_lev', 'log_sales', 'mtb', 'profitability',
    'tangibility', 'cf_vol', 'ind_med_lev', 'div_payer', 'intangibles'
]

def trimmed_stats(s, q=0.01):
    """
    Compute mean, median, std after trimming tails at q and 1-q quantiles.
    This is applied per variable, not across the entire dataset.
    """
    s = s.dropna()
    if s.empty or len(s) < 10:
        return (np.nan, np.nan, np.nan)
    lo, hi = s.quantile([q, 1-q])
    s_trimmed = s[(s >= lo) & (s <= hi)]
    return (s_trimmed.mean(), s_trimmed.median(), s_trimmed.std())

def summary_table(data, q=0.01):
    """
    Generate summary statistics table with trimmed moments.
    Each variable is trimmed independently at 1%/99% by default.
    """
    out = {}
    for v in vars_table1:
        mean, med, sd = trimmed_stats(data[v], q=q)
        out[v] = {'Mean': mean, 'Median': med, 'SD': sd}
    return pd.DataFrame(out).T

# Generate tables
table_all = summary_table(df_all)
table_surv = summary_table(df_surv)


In [91]:
print("="*80)
print("TABLE 1 REPLICATION: ALL FIRMS")
print("="*80)
print(table_all.round(3))
print(f"\nNumber of observations: {len(df_all):,}")
print(f"Number of unique firms: {df_all['gvkey'].nunique():,}")


TABLE 1 REPLICATION: ALL FIRMS
                Mean  Median     SD
book_lev       0.246   0.228  0.195
market_lev     0.272   0.215  0.247
log_sales      4.370   4.360  2.182
mtb            1.558   1.001  1.651
profitability  0.068   0.116  0.213
tangibility    0.330   0.275  0.236
cf_vol         0.061   0.032  0.088
ind_med_lev    0.222   0.221  0.098
div_payer      0.413   0.000  0.492
intangibles    0.056   0.003  0.105

Number of observations: 176,817
Number of unique firms: 17,304


In [92]:
print("="*80)
print("TABLE 1 REPLICATION: SURVIVORS")
print("="*80)
print(table_surv.round(3))
print(f"\nNumber of observations: {len(df_surv):,}")
print(f"Number of unique firms: {df_surv['gvkey'].nunique():,}")


TABLE 1 REPLICATION: SURVIVORS
                Mean  Median     SD
book_lev       0.257   0.251  0.171
market_lev     0.305   0.273  0.235
log_sales      5.320   5.272  1.985
mtb            1.152   0.891  0.828
profitability  0.133   0.134  0.090
tangibility    0.377   0.321  0.232
cf_vol         0.035   0.023  0.036
ind_med_lev    0.245   0.232  0.092
div_payer      0.655   1.000  0.476
intangibles    0.041   0.002  0.075

Number of observations: 71,453
Number of unique firms: 2,471


## STEP 10 — Comparison: All Firms vs Survivors

Compare the two groups side-by-side to highlight differences.


In [93]:
# Create side-by-side comparison
comparison = pd.DataFrame({
    'All_Mean': table_all['Mean'],
    'All_Median': table_all['Median'],
    'Surv_Mean': table_surv['Mean'],
    'Surv_Median': table_surv['Median'],
    'Diff_Mean': table_surv['Mean'] - table_all['Mean']
})

print("="*80)
print("COMPARISON: ALL FIRMS vs SURVIVORS")
print("="*80)
print(comparison.round(3))
print("\nKey Observations:")
print("- Survivors are LARGER (higher log_sales)")
print("- Survivors are MORE PROFITABLE (higher profitability)")
print("- Survivors are MORE TANGIBLE (higher tangibility)")
print("- Survivors have LOWER GROWTH (lower mtb)")
print("- Survivors are MORE LEVERED (higher book & market leverage)")


COMPARISON: ALL FIRMS vs SURVIVORS
               All_Mean  All_Median  Surv_Mean  Surv_Median  Diff_Mean
book_lev          0.246       0.228      0.257        0.251      0.011
market_lev        0.272       0.215      0.305        0.273      0.033
log_sales         4.370       4.360      5.320        5.272      0.950
mtb               1.558       1.001      1.152        0.891     -0.406
profitability     0.068       0.116      0.133        0.134      0.065
tangibility       0.330       0.275      0.377        0.321      0.047
cf_vol            0.061       0.032      0.035        0.023     -0.026
ind_med_lev       0.222       0.221      0.245        0.232      0.023
div_payer         0.413       0.000      0.655        1.000      0.241
intangibles       0.056       0.003      0.041        0.002     -0.015

Key Observations:
- Survivors are LARGER (higher log_sales)
- Survivors are MORE PROFITABLE (higher profitability)
- Survivors are MORE TANGIBLE (higher tangibility)
- Survivors have 

## Export Results

Save the results to CSV files for further analysis or reporting.


In [94]:
# Export summary tables
table_all.to_csv('table1_all_firms.csv')
table_surv.to_csv('table1_survivors.csv')
comparison.to_csv('table1_comparison.csv')

print("Results exported to CSV files!")
print("  - table1_all_firms.csv")
print("  - table1_survivors.csv")
print("  - table1_comparison.csv")


Results exported to CSV files!
  - table1_all_firms.csv
  - table1_survivors.csv
  - table1_comparison.csv


In [95]:
# Close the database connection
db.close()
print("WRDS connection closed.")


AttributeError: 'NoneType' object has no attribute 'close'