# Import Packages

In [1]:
import warnings
import pandas as pd

# Silence every warning category for the rest of the session. Note that this also
# hides pandas deprecation / chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

import pandas_datareader.data as web
import polars as pl
import wrds
from datetime import date

from functions.utils.func import *

### Establish First Connection to WRDS

In [2]:
# First-time WRDS setup, left commented out.
# NOTE(review): presumably run once so that credentials are stored in a pgpass file
# and later wrds.Connection(...) calls do not prompt -- confirm against the wrds docs.
# db = wrds.Connection(wrds_username='jofan23')
# db.create_pgpass_file()

# Present Date

In [3]:
# Run configuration shared by the cells below:
#   current_date - today's date as 'YYYY-MM-DD'; upper bound of the SQL date filters.
#   live         - flag handed to get_parquet_dir() when reading/writing parquet files.
current_date = date.today().isoformat()
live = True

# Link Table

In [4]:
print("-" * 60)
# Build the gvkey <-> permno link table: Compustat company names joined to the
# CRSP/Compustat link history, restricted to link types 'LC'/'LU' and link
# primacy flags 'P'/'C'. The query has no placeholders, so it is a plain string.
sql_link = """
    SELECT a.gvkey, a.conm, a.tic, a.cusip, a.cik, a.sic, a.naics, b.linkprim,
           b.linktype, b.liid, b.lpermno, b.lpermco, b.linkdt, b.linkenddt
    FROM comp.names as a
    INNER JOIN crsp.ccmxpf_lnkhist as b
    ON a.gvkey = b.gvkey
    WHERE b.linktype in ('LC', 'LU')
    AND b.linkprim in ('P', 'C')
    ORDER BY a.gvkey;
"""

# Read in linking table
print("Read in linking table...")
db = wrds.Connection(wrds_username='jofan23')
try:
    link = db.raw_sql(sql_link)
finally:
    # Release the WRDS connection even when the query raises.
    db.close()

# Rename columns
print("Rename columns...")
link = link.rename(columns={
    'linkdt': 'timeLinkStart_d',
    'linkenddt': 'timeLinkEnd_d',
    'lpermno': 'permno',
    'tic': 'ticker'
})

# permno is used as an integer key by the downstream cells
link['permno'] = link['permno'].astype(int)

# Export link table
print("Export link table...")
link.to_parquet(get_parquet_dir(live) / 'data_link.parquet.brotli', compression='brotli')

------------------------------------------------------------
Read in linking table
Loading library list...
Done
Rename columns
Export link table


# Compustat Annual

In [5]:
print("-" * 60)
# Annual fundamentals from COMP.FUNDA (consolidated, domestic population source,
# standardized format, USD, industrial format) with datadate between 2005-01-01
# and `current_date`.
sql_compustat_annual = f"""
    SELECT a.gvkey, a.datadate, a.conm, a.fyear, a.tic, a.cusip, a.naicsh, a.sich, 
    a.aco, a.act, a.ajex, a.am, a.ao, a.ap, a.at, a.capx, a.ceq, a.ceqt, a.che, a.cogs,
    a.csho, a.cshrc, a.dcpstk, a.dcvt, a.dlc, a.dlcch, a.dltis, a.dltr,
    a.dltt, a.dm, a.dp, a.drc, a.drlt, a.dv,a.dvc,a.dvp,a.dvpa,a.dvpd,
    a.dvpsx_c, a.dvt, a.ebit, a.ebitda, a.emp, a.epspi, a.epspx, a.fatb, a.fatl,
    a.ffo, a.fincf, a.fopt, a.gdwl, a.gdwlia, a.gdwlip, a.gwo, a.ib, a.ibcom,
    a.intan, a.invt, a.ivao, a.ivncf, a.ivst, a.lco, a.lct, a.lo ,a.lt, a.mib,
    a.msa, a.ni, a.nopi, a.oancf, a.ob, a.oiadp, a.oibdp, a.pi, a.ppenb, a.ppegt,
    a.ppenls, a.ppent, a.prcc_c, a.prcc_f, a.prstkc, a.prstkcc, a.pstk, a.pstkl, a.pstkrv,
    a.re, a.rect, a.recta, a.revt, a.sale, a.scstkc, a.seq, a.spi, a.sstk,
    a.tstkp, a.txdb, a.txdi, a.txditc, a.txfo, a.txfed, a.txp, a.txt,
    a.wcap, a.wcapch, a.xacc, a.xad, a.xint, a.xrd, a.xpp, a.xsga
    FROM COMP.FUNDA as a
    WHERE a.consol = 'C'
    AND a.popsrc = 'D'
    AND a.datafmt = 'STD'
    AND a.curcd = 'USD'
    AND a.indfmt = 'INDL'
    AND a.datadate BETWEEN '2005-01-01' AND '{current_date}'
"""

# Read in Compustat Annual
print("Read in Compustat Annual...")
db = wrds.Connection(wrds_username='jofan23')
try:
    compustat_annual = db.raw_sql(sql_compustat_annual)
finally:
    # Release the WRDS connection even when the query raises.
    db.close()

# Read in link table
print("Read in link table...")
link_table = pd.read_parquet(get_parquet_dir(live) / 'data_link.parquet.brotli')
# cusip/conm already come from COMP.FUNDA; drop them to avoid _x/_y suffixes.
link_table = link_table.drop(['cusip', 'conm'], axis=1)

# Merge link table and Compustat Annual
# NOTE(review): the join is on gvkey only; the link validity window
# (timeLinkStart_d / timeLinkEnd_d) is not applied, so a gvkey with several link
# rows yields several rows per fiscal year -- confirm this is intended.
print("Merge link table and Compustat Annual...")
annual = compustat_annual.merge(link_table, on='gvkey', how='inner')

# Drop rows based on condition
print("Drop rows based on condition...")
annual = annual.dropna(subset=['at', 'prcc_c', 'ni'])

# Extract 6 digits from CUSIP
print("Extract 6 digits from CUSIP...")
annual['cnum'] = annual['cusip'].str[:6]

# Replacing missing values
print("Replacing missing values...")
# dr = drc + drlt, falling back to whichever one is present; NaN only when both
# are missing (min_count=1 keeps the all-missing case as NaN instead of 0).
annual['dr'] = annual[['drc', 'drlt']].sum(axis=1, min_count=1)

# dc: convertible debt. When dcvt is missing, derive it from dcpstk (net of pstk
# when pstk is present and smaller); otherwise fall back to dcvt.
dcvt_missing = annual['dcvt'].isna()
dcpstk_exceeds_pstk = (
    (annual['dcpstk'] > annual['pstk'])
    & annual['dcpstk'].notna()
    & annual['pstk'].notna()
    & dcvt_missing
)
dcpstk_only = annual['pstk'].isna() & annual['dcpstk'].notna() & dcvt_missing
annual.loc[dcpstk_exceeds_pstk, 'dc'] = annual['dcpstk'] - annual['pstk']
annual.loc[dcpstk_only, 'dc'] = annual['dcpstk']
annual.loc[annual['dc'].isna(), 'dc'] = annual['dcvt']

annual['xint0'] = annual['xint'].fillna(0)
annual['xsga0'] = annual['xsga'].fillna(0)
# Negative advertising expense is floored at 0; missing values stay missing.
annual['xad0'] = annual['xad'].mask(annual['xad'] < 0, 0)

vars_list = ['nopi', 'dvt', 'ob', 'dm', 'dc', 'aco', 'ap', 'intan', 'ao', 'lco', 'lo', 'rect', 'invt', 'drc', 'spi', 'gdwl', 'che', 'dp', 'act', 'lct', 'tstkp', 'dvpa', 'scstkc', 'sstk', 'mib', 'ivao', 'prstkc', 'prstkcc', 'txditc', 'ivst']
# Assign the filled columns back explicitly: `annual[var].fillna(0, inplace=True)`
# is a chained in-place call that silently does nothing under pandas Copy-on-Write.
annual[vars_list] = annual[vars_list].fillna(0)

# Shift data forward by 6 months
print("Shift data forward by 6 months...")
annual['date'] = pd.to_datetime(annual['datadate']).dt.to_period('M') + 6

# Convert index from annually to monthly
print("Convert index from annually to monthly...")
# Each source row is repeated 12 times and the copies get month offsets 0..11.
# The counter is taken per source row (index label) rather than per (gvkey, date):
# after the link merge several rows can share gvkey/date, and a per-group counter
# would push the duplicates 12..23 months forward.
annual = annual.reindex(annual.index.repeat(12))
annual['tempTime'] = annual.groupby(level=0).cumcount()
annual['date'] += annual['tempTime']
annual = annual.drop(columns=['tempTime'])

# Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
print("Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)")
annual['date'] = annual['date'].dt.to_timestamp("M")
annual = annual.drop('datadate', axis=1)

# Set index and remove duplicate indices
print("Set index and remove duplicate indices...")
annual = annual.set_index(['permno', 'date'])
annual = annual.sort_index(level=['permno', 'date'])
annual = annual[~annual.index.duplicated(keep='first')]

# Export data
print("Export data")
annual.to_parquet(get_parquet_dir(live) / 'data_fund_raw_a.parquet.brotli', compression='brotli')

------------------------------------------------------------
Read in Compustat Annual
Loading library list...
Done
Read in link table
Merge link table and Compustat Annual
Drop rows based on condition
Extract 6 digits from CUSIP
Replacing missing values
Shift data forward by 6 months
Convert index from annually to monthly
Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
Set index and remove duplicate indices
Export data


# Compustat Quarterly

In [6]:
print("-" * 60)
# Quarterly fundamentals from COMP.FUNDQ (consolidated, domestic population source,
# standardized format, USD, industrial format) with datadate between 2005-01-01
# and `current_date`. Columns ending in 'y' are fiscal year-to-date items.
sql_compustat_quarterly = f"""
    SELECT a.gvkey, a.datadate, a.fyearq, a.fqtr, a.datacqtr, a.datafqtr, a.acoq,
	a.actq, a.ajexq, a.apq, a.atq, a.ceqq, a.cheq, a.cogsq, a.cshoq, a.cshprq,
	a.dlcq, a.dlttq, a.dpq, a.drcq, a.drltq, a.dvpsxq, a.dvpq, a.dvy, a.epspiq, a.epspxq, a.fopty,
	a.gdwlq, a.ibq, a.invtq, a.intanq, a.ivaoq, a.lcoq, a.lctq, a.loq, a.ltq, a.mibq,
	a.niq, a.oancfy, a.oiadpq, a.oibdpq, a.piq, a.ppentq, a.ppegtq, a.prstkcy, a.prccq,
	a.pstkq, a.rdq, a.req, a.rectq, a.revtq, a.saleq, a.seqq, a.sstky, a.txdiq,
	a.txditcq, a.txpq, a.txtq, a.xaccq, a.xintq, a.xsgaq, a.xrdq, a.capxy
    FROM COMP.FUNDQ as a
	WHERE a.consol = 'C'
	AND a.popsrc = 'D'
	AND a.datafmt = 'STD'
	AND a.curcdq = 'USD'
	AND a.indfmt = 'INDL'
    AND a.datadate BETWEEN '2005-01-01' AND '{current_date}'
"""

# Read in Compustat Quarterly
print("Read In Compustat Quarterly...")
db = wrds.Connection(wrds_username='jofan23')
try:
    compustat_quarterly = db.raw_sql(sql_compustat_quarterly)
finally:
    # Release the WRDS connection even when the query raises.
    db.close()

# Read in link table
print("Read in link table...")
link_table = pd.read_parquet(get_parquet_dir(live) / 'data_link.parquet.brotli')
link_table = link_table.drop(['cusip', 'conm'], axis=1)

# Merge link table and Compustat Quarterly
print("Merge link table and Compustat Quarterly...")
quarterly = compustat_quarterly.merge(link_table, on='gvkey', how='inner')


# Keep only the most recent data for each fiscal quarter
# NOTE(review): GroupBy.last() returns the last non-null value per column, so a
# group with several rows can blend values from different rows -- confirm intended.
print("Keep only the most recent data for each fiscal quarter...")
quarterly = quarterly.sort_values(by=['gvkey', 'fyearq', 'fqtr', 'datadate'])
quarterly = quarterly.groupby(['gvkey', 'fyearq', 'fqtr']).last().reset_index()

# Convert to datetime
print("Convert to datetime...")
quarterly['datadate'] = pd.to_datetime(quarterly['datadate'])
quarterly['rdq'] = pd.to_datetime(quarterly['rdq'])

# Shift data 3 months forward
print("Shift data 3 months forward...")
# Availability month = datadate + 3 months, or the rdq month when rdq is later.
quarterly['time_avail_m'] = (quarterly['datadate'] + pd.DateOffset(months=3)).dt.to_period('M')
rdq_month = quarterly['rdq'].dt.to_period('M')
rdq_is_later = quarterly['rdq'].notna() & (rdq_month > quarterly['time_avail_m'])
quarterly.loc[rdq_is_later, 'time_avail_m'] = rdq_month

# Compute month difference
print("Compute month difference...")
# Drop records whose rdq is more than ~6 months (30-day units) after datadate.
month_diff = (quarterly['rdq'] - quarterly['datadate']).dt.days // 30
quarterly = quarterly.drop(quarterly[(month_diff > 6) & quarterly['rdq'].notna()].index)
quarterly = quarterly.sort_values(by=['gvkey', 'time_avail_m', 'datadate'])

# Keep most recent data
print("Keep most recent data...")
quarterly = quarterly.groupby(['gvkey', 'time_avail_m']).last().reset_index()

# Create extra yearly columns
print("Create extra yearly columns...")
# Turn year-to-date items into quarterly flows: Q1 keeps its value, later quarters
# take the difference to the previous row of the same gvkey / fiscal year.
# Series.where is used so the cell does not depend on `np` arriving via a star import.
is_first_quarter = quarterly['fqtr'] == 1
for col in ['sstky', 'prstkcy', 'oancfy', 'fopty']:
    previous_ytd = quarterly.groupby(['gvkey', 'fyearq'])[col].shift(1)
    quarterly[col + 'q'] = quarterly[col].where(is_first_quarter, quarterly[col] - previous_ytd)

# Convert index from quarterly to monthly
print("Convert index from quarterly to monthly...")
# Every record is repeated 3 times and the copies get month offsets 0, 1, 2.
quarterly = quarterly.loc[quarterly.index.repeat(3)]
quarterly['tempTimeAvailM'] = quarterly['time_avail_m']
quarterly = quarterly.sort_values(by=['gvkey', 'tempTimeAvailM'])
quarterly['time_avail_m'] = quarterly.groupby(['gvkey', 'tempTimeAvailM']).cumcount() + quarterly['time_avail_m']

# Sort values
print("Sort values and keep most recent data...")
quarterly = quarterly.sort_values(by=['gvkey', 'time_avail_m', 'datadate'])
# Keep most recent data
quarterly = quarterly.groupby(['gvkey', 'time_avail_m']).last().reset_index()
quarterly = quarterly.drop(columns=['tempTimeAvailM'])
quarterly = quarterly.rename(columns={'datadate': 'datadateq', 'time_avail_m':'date'})

# Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
print("Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)...")
quarterly['date'] = quarterly['date'].dt.to_timestamp("M")
quarterly = quarterly.set_index(['permno', 'date'])

# Convert data to numerical format (exclude columns that are not numerical format)
print("Convert data to numerical format (exclude columns that are not numerical format)...")
numeric_cols = quarterly.select_dtypes(include=['number']).columns
quarterly[numeric_cols] = quarterly[numeric_cols].astype(float)
non_numeric_cols = quarterly.select_dtypes(exclude=['number']).columns
quarterly_numeric = quarterly[numeric_cols].sort_index(level=['permno', 'date'])

# Forward fill yearly data
print("Forward fill yearly data...")
cols_to_fill = [col for col in quarterly_numeric.columns if col.endswith('y')]
# Fill within each permno only: the frame is sorted by permno, so an ungrouped
# ffill would carry the last value of one security into the next one.
quarterly_numeric[cols_to_fill] = quarterly_numeric.groupby(level='permno')[cols_to_fill].ffill()

# Export data
print("Export data...")
quarterly_numeric.to_parquet(get_parquet_dir(live) / 'data_fund_raw_q.parquet.brotli', compression='brotli')

------------------------------------------------------------
Read In Compustat Quarterly
Loading library list...
Done
Read in link table
Merge link table and Compustat Annual
Keep only the most recent data for each fiscal quarter
Convert to datetime
Shift data 3 months forward
Compute month difference
Keep most recent data
Create extra yearly columns
Convert index from quarterly to monthly
Sort values and keep most recent data
Convert from YY-MM to YY-MM-DD (2012-01 to 2012-01-31)
Convert data to numerical format (exclude columns that are not numerical format)
Forward fill yearly data
Export data


# Compustat Pension

In [7]:
print("-" * 60)
# Annual pension items from COMP.ACO_PNFNDA (consolidated, domestic population
# source, standardized format, industrial format) with datadate between
# 2005-01-01 and `current_date`.
sql_compustat_pension = f"""
    SELECT a.gvkey, a.datadate, a.paddml, a.pbnaa, a.pbnvv, a.pbpro, 
	       a.pbpru, a.pcupsu, a.pplao, a.pplau
    FROM COMP.ACO_PNFNDA as a
	WHERE a.consol = 'C'
	AND a.popsrc = 'D'
	AND a.datafmt = 'STD'
	AND a.indfmt = 'INDL'
    AND a.datadate BETWEEN '2005-01-01' AND '{current_date}'
"""

# Read in Pension Annual
print("Read in Pension Annual...")
db = wrds.Connection(wrds_username='jofan23')
try:
    pension = db.raw_sql(sql_compustat_pension)
finally:
    # Release the WRDS connection even when the query raises.
    db.close()

# Drop duplicate indices
print("Drop duplicate indices...")
pension = pension.sort_values(by=['gvkey', 'datadate'])
pension = pension.groupby(['gvkey', 'datadate']).last().reset_index()

# Convert to datetime and set index
print("Convert to datetime and set index...")
pension['datadate'] = pd.to_datetime(pension['datadate'])
# The query above does not select `tic`, so only the datadate -> date rename applies.
pension = pension.rename(columns = {'datadate': 'date', 'tic': 'ticker'})
pension = pension.set_index('date')

# Shift everything 1 year forward
print("Shift everything 1 year forward...")
# Identifier columns must NOT be shifted. The previous guard chained `!=` tests
# with `or`, which is always True, so gvkey itself was shifted and every later
# groupby('gvkey') ran on a corrupted key. shift(1) moves each value to the next
# record of the same gvkey (records are sorted by datadate above).
id_cols = ('gvkey', 'indfmt', 'datafmt', 'consol', 'popsrc', 'ticker')
value_cols = [col for col in pension.columns if col not in id_cols]
pension[value_cols] = pension.groupby('gvkey')[value_cols].shift(1)

# Export data
print("Export data")
pension.to_parquet(get_parquet_dir(live) / 'data_pension.parquet.brotli', compression='brotli')

------------------------------------------------------------
Read in Pension Annual
Loading library list...
Done
Drop duplicate indices
Convert to datetime and set index
Shift everything 1 year forward
Export data


In [None]:
# Display the `pension` frame built by the Compustat Pension cell for inspection.
pension