In [1]:
import pandas as pd
import numpy as np
import sqlite3
import statsmodels.formula.api as smf
import wrds
from regtabletotext import prettify_result
from dotenv import load_dotenv

from plotnine import *
from mizani.formatters import comma_format, percent_format
from datetime import datetime

In [2]:
from urllib.parse import quote

from sqlalchemy import create_engine
import os

# Load WRDS_USER / WRDS_PASSWORD from a local .env file (if present) into the
# process environment.
load_dotenv()

wrds_user = os.getenv("WRDS_USER")
wrds_password = os.getenv("WRDS_PASSWORD")

# Fail here with a clear message instead of building a URL that contains the
# literal text "None" and only failing later, at query time.
if not wrds_user or not wrds_password:
  raise RuntimeError(
    "WRDS_USER and WRDS_PASSWORD must be set (e.g. in a .env file) "
    "before connecting to WRDS."
  )

# Percent-encode the credentials: characters such as '@', ':', '/' or '%' in a
# user name or password would otherwise be parsed as URL syntax.
connection_string = (
  "postgresql+psycopg2://"
 f"{quote(wrds_user, safe='')}:{quote(wrds_password, safe='')}"
  "@wrds-pgdata.wharton.upenn.edu:9737/wrds"
)

# NOTE: this rebinds `wrds`, shadowing the `wrds` package imported in the first
# cell. The name is kept because the query cell passes `con=wrds`.
# pool_pre_ping=True checks each pooled connection before it is used.
wrds = create_engine(connection_string, pool_pre_ping=True)

In [3]:
# Inclusive bounds of the sample window, written as MM/DD/YYYY strings; they
# are spliced into the Compustat SQL query's BETWEEN clause.
start_date, end_date = "01/01/1960", "12/31/2023"

In [4]:
# Query comp.funda for the accounting columns used below, restricted by the
# indfmt / datafmt / consol / curcd filters and the sample window.
# start_date and end_date are spliced straight into the SQL text, so they must
# remain trusted, notebook-defined constants (never user input).
compustat_query = (
  "SELECT gvkey, datadate, seq, ceq, csho, at, act, che, dlc, dp, ib, lct, lt, txditc, txdb, txp, itcb,  pstkrv, "
         "pstkl, pstk, capx, oancf, sale, cogs, xint, xsga, ebit, fyr "
    "FROM comp.funda "
    "WHERE indfmt = 'INDL' "
          "AND datafmt = 'STD' "
          "AND consol = 'C' "
          "AND curcd = 'USD' "
         f"AND datadate BETWEEN '{start_date}' AND '{end_date}'"
)

# gvkey is read as str (presumably to keep leading zeros in the identifier --
# confirm) and datadate is parsed to datetime.
# parse_dates is passed as a list: the documented forms are list or dict, and a
# set literal is not one of them.
compustat = pd.read_sql_query(
  sql=compustat_query,
  con=wrds,
  dtype={"gvkey": str},
  parse_dates=["datadate"]
)

In [5]:
# Add two derived columns to `compustat`:
#   be -- presumably book equity (going by the column name): the first
#         non-missing of seq, ceq + pstk, at - lt;
#         plus txditc (falling back to txdb + itcb, else 0);
#         minus the first non-missing of pstkrv, pstkl, pstk (else 0).
#   op -- (sale - cogs - xsga - xint) / be, with missing cogs / xsga / xint
#         treated as 0.
compustat = (compustat
  .assign(
    be=lambda x: 
      (x["seq"].combine_first(x["ceq"]+x["pstk"])
       .combine_first(x["at"]-x["lt"])+
       x["txditc"].combine_first(x["txdb"]+x["itcb"]).fillna(0)-
       x["pstkrv"].combine_first(x["pstkl"])
       .combine_first(x["pstk"]).fillna(0))
  )
  .assign(
    # Non-positive values become NaN. Series.where keeps entries where the
    # condition holds and fills NaN elsewhere; NaN > 0 is False, so rows that
    # are already missing stay missing. This is vectorized, unlike a
    # per-row Python lambda.
    be=lambda x: x["be"].where(x["be"] > 0)
  )
  .assign(
    # Divides by the cleaned be, so op is NaN wherever be was set to NaN.
    op=lambda x: 
      ((x["sale"]-x["cogs"].fillna(0)- 
        x["xsga"].fillna(0)-x["xint"].fillna(0))/x["be"])
  )
)

In [6]:
# Keep a single row per (gvkey, year): after sorting by datadate, tail(1)
# retains the row with the latest datadate inside each group.
compustat = (compustat
  .assign(year=lambda x: pd.DatetimeIndex(x["datadate"]).year)
  .sort_values("datadate")
  .groupby(["gvkey", "year"])
  .tail(1)
  # drop=True discards the old row labels instead of inserting them as an
  # "index" data column. That column would otherwise be written to the SQLite
  # table, and it would make this cell fail on re-run, because reset_index
  # cannot insert a column whose name already exists.
  .reset_index(drop=True)
)

In [7]:
# Lookup table of prior-year `at`: shifting `year` forward by one means a row
# labelled year t carries the `at` value reported for year t-1.
compustat_lag = (
  compustat
  .get(["gvkey", "year", "at"])
  .rename(columns={"at": "at_lag"})
  .assign(year=lambda lagged: lagged["year"] + 1)
)

# Attach the lagged value to every firm-year; rows without a prior-year match
# get a missing at_lag from the left join.
compustat = compustat.merge(compustat_lag, on=["gvkey", "year"], how="left")

# inv = at / at_lag - 1, set to NaN wherever at_lag is zero or negative.
asset_growth = compustat["at"] / compustat["at_lag"] - 1
compustat = compustat.assign(
  inv=np.where(compustat["at_lag"] <= 0, np.nan, asset_growth)
)

In [8]:
# Open a connection to the SQLite database file that receives the prepared
# data; the path is relative to the working directory.
tidy_finance = sqlite3.connect(database='data/compustat.db.sqlite')

In [9]:
# Write the frame to table "compustat", replacing any existing table of that
# name; the DataFrame index is not stored. Left as the cell's final expression
# so the value returned by to_sql is displayed.
compustat.to_sql(
  name="compustat", con=tidy_finance, if_exists="replace", index=False
)

495358

In [10]:
# Release the SQLite connection opened earlier in the notebook.
tidy_finance.close()