<a href="https://colab.research.google.com/github/helenlu-vbs/TechStackPhD/blob/main/Predict_Seasoned_Equity_Offerings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
### import packages ###

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

In [51]:
### Load the data ###

# Stata file fetched directly from GitHub
url = "https://github.com/pgeertsema/AIForCorporateFinance/raw/refs/heads/main/jkp_seo.dta"

# Copy straight after read_stata() to reduce frame fragmentation
df = pd.read_stata(url).copy()

In [52]:
# ============================================================
# Variable selection: rank candidates by |corr| with the target
# ============================================================
target = "seo_next_yr"

# Candidate regressors: all variables from "bidask" to "debt_at"
# (label-based slice, so both endpoint columns are included).
candidates = df.loc[:, "bidask":"debt_at"].columns.tolist()

# Complete-case sample used for the ranking: any row with a missing value in
# the target or in ANY candidate is dropped.
# (No industry FE here; only pick among numeric candidates)
df_corr = df[[target] + candidates].dropna()

# Enforce the "numeric only" rule explicitly, and skip columns that are
# constant within df_corr: their correlation with the target is undefined
# (NaN) and computing it makes NumPy emit a divide RuntimeWarning.
features = df_corr[candidates].select_dtypes(include="number")
features = features.loc[:, features.nunique() > 1]

# `corr` stays indexed by the full candidate list; skipped columns are NaN.
corr = features.corrwith(df_corr[target]).reindex(candidates)

# Rank by absolute correlation using only defined values, so an undefined
# correlation can never be selected as a "top" variable.
ranked = corr.dropna().sort_values(key=lambda s: s.abs(), ascending=False)
if ranked.empty:
    raise ValueError(
        f"No candidate has a defined correlation with {target!r} "
        f"({len(df_corr)} complete rows available)."
    )
top6 = ranked.head(6).index.tolist()

print("Top 6 (abs) correlations with seo_next_yr:")
print(ranked.head(6).to_frame("corr"))

# Keep only target + selected vars.
# Note: this requires completeness on the selected variables only, so dfr can
# contain more rows than the df_corr sample the ranking was computed on.
dfr = df[[target] + top6].dropna()

Top 6 (abs) correlations with seo_next_yr:
               corr
eqnpo_12m -0.214647
ebit_at   -0.211538
ret_9_1    0.205856
noa_gr1a   0.205053
fincf_at   0.203093
gp_at     -0.197239


  c /= stddev[:, None]
  c /= stddev[None, :]


In [60]:

# ------------------------------------------------------------
# Descriptive statistics (target + selected vars), exported
# as a LaTeX table
# ------------------------------------------------------------

# describe() label -> table header; the dict order is also the column order
stat_labels = {
    "count": "N",
    "mean": "Mean",
    "std": "SD",
    "min": "Min",
    "25%": "P25",
    "50%": "P50",
    "75%": "P75",
    "max": "Max",
}

sumtab = dfr.describe(percentiles=[0.25, 0.50, 0.75]).transpose()
sumtab = sumtab.rename(columns=stat_labels)[list(stat_labels.values())]

# Counts are shown as integers; every other statistic is rounded to 2 decimals
sumtab = sumtab.astype({"N": int}).round(2)

latex = sumtab.to_latex(
    index=True,
    escape=True,
    caption="Summary statistics",
    label="tab:sumstats",
    float_format="%.2f",
)

# Swap each booktabs rule for a plain \hline
for rule in (r"\toprule", r"\midrule", r"\bottomrule"):
    latex = latex.replace(rule, r"\hline")

with open("summary_stats.tex", "w") as f:
    f.write(latex)

print("Wrote: summary_stats.tex")

Wrote: summary_stats.tex


In [59]:
# ------------------------------------------------------------
# Probit of the target on the selected top-6 variables,
# exported as a LaTeX regression table
# ------------------------------------------------------------
y = dfr[target].astype(float)
X = sm.add_constant(dfr[top6].astype(float))  # regressors plus a constant term

probit = sm.Probit(y, X).fit(disp=False)

# Extra model statistics passed to summary_col via info_dict
extra_stats = {
    "N": lambda res: f"{int(res.nobs)}",
    "Pseudo $ R^2 $": lambda res: f"{res.prsquared:.2f}",
}

regtab = summary_col(
    results=[probit],
    model_names=["Probit"],
    stars=True,
    float_format="%0.2f",
    info_dict=extra_stats,
)

latex = regtab.as_latex()

# Post-process the generated LaTeX, in this order:
#   1-2. fill in the empty caption and label
#   3.   restore math mode for the pseudo R^2 row label (its $ signs come out escaped)
latex_fixes = (
    (r"\caption{}", r"\caption{Determinants of seasoned equity offerings}"),
    (r"\label{}", r"\label{tab:probit}"),
    (r"Pseudo \$ R^2 \$", r"Pseudo $ R^2 $"),
)
for old, new in latex_fixes:
    latex = latex.replace(old, new)

with open("probit.tex", "w") as f:
    f.write(latex)

print("Wrote: probit.tex")



Wrote: probit.tex
