In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import wrds
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats
import matplotlib.dates as mdates
import statsmodels.api as sm
from scipy.stats import pearsonr
import itertools
from joblib import Parallel, delayed
import multiprocessing
import datetime

In [2]:
conn = wrds.Connection()

Enter your WRDS username [komalniraula]: kn2505
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  n


You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [9]:
years = list(range(1970, 2025))
df_list = []

for i, year in enumerate(years):
    print(f"Loading {year} ({i+1}/{len(years)})...")

    start_date = f"{year}-01-01"
    end_date = f"{year}-12-31"

    query = f"""
        SELECT 
            dsf.date,
            dsf.permno,
            dsf.permco,
            dsf.cusip,
            names.ticker,
            dsf.prc,
            dsf.vol,
            dsf.VWRETD,
            dsf.vwretd,
            dsf.vwretx,
            
            dsf.ret,
            dsf.retx,
            dsf.bidlo,
            dsf.askhi,
            dsf.openprc,
            dsf.shrout,
            dsf.cfacpr,
            dsf.cfacshr
        FROM crsp.dsf AS dsf
        LEFT JOIN crsp.msenames AS names
            ON dsf.permno = names.permno
            AND dsf.date BETWEEN names.namedt AND names.nameendt
        WHERE dsf.date BETWEEN '{start_date}' AND '{end_date}'
    """

    df_year = conn.raw_sql(query, date_cols=['date'])
    df_list.append(df_year)

df_crsp_daily = pd.concat(df_list, ignore_index=True)
df_crsp_daily

Loading 1970 (1/55)...


ProgrammingError: (psycopg2.errors.UndefinedColumn) column dsf.vwretd does not exist
LINE 10:             dsf.VWRETD,
                     ^
HINT:  Perhaps you meant to reference the column "dsf.ret" or the column "dsf.retx".

[SQL: 
        SELECT 
            dsf.date,
            dsf.permno,
            dsf.permco,
            dsf.cusip,
            names.ticker,
            dsf.prc,
            dsf.vol,
            dsf.VWRETD,
            dsf.vwretd,
            dsf.vwretx,
            
            dsf.ret,
            dsf.retx,
            dsf.bidlo,
            dsf.askhi,
            dsf.openprc,
            dsf.shrout,
            dsf.cfacpr,
            dsf.cfacshr
        FROM crsp.dsf AS dsf
        LEFT JOIN crsp.msenames AS names
            ON dsf.permno = names.permno
            AND dsf.date BETWEEN names.namedt AND names.nameendt
        WHERE dsf.date BETWEEN '1970-01-01' AND '1970-12-31'
    ]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
df_crsp_daily.isnull().sum()

In [None]:
df_crsp_daily.to_csv('all_crsp.csv', index = False)
df = df_crsp_daily.copy()

In [10]:
df = pd.read_csv('all_crsp.csv')

In [11]:
df = df.drop(columns=['openprc'])

required_cols = ['prc', 'ret', 'shrout']
df = df.dropna(subset=required_cols)
df.isnull().sum()

date             0
permno           0
permco           0
cusip            0
ticker      260994
prc              0
vol        6350503
ret              0
retx             0
bidlo         1527
askhi         1750
shrout           0
cfacpr           0
cfacshr          0
dtype: int64

In [12]:
cols_to_keep = ['date', 'permno', 'ticker', 'prc', 'retx', 'shrout', 'cfacpr', 'vol']
df_final = df[cols_to_keep].copy()
df_final

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol
0,1970-01-05,20538,WHL,44.62500,-0.055556,1270.0,1.00,15200.0
1,1970-01-05,20562,ML,20.50000,-0.012048,19780.0,6.75,17300.0
2,1970-01-05,20570,NG,23.75000,0.010638,14644.0,1.50,5100.0
3,1970-01-05,20589,CYL,35.87500,-0.003472,2034.0,1.50,300.0
4,1970-01-05,20618,CRS,27.25000,0.004608,4293.0,8.00,3200.0
...,...,...,...,...,...,...,...,...
94731329,2024-12-31,92396,ECH,25.04000,0.000000,18950.0,1.00,67015.0
94731330,2024-12-31,92397,BKF,36.49050,-0.002550,1850.0,1.00,2732.0
94731331,2024-12-31,92398,AIA,67.83000,-0.005571,10500.0,1.00,38260.0
94731332,2024-12-31,92402,MSCI,600.01001,0.000600,78371.0,1.00,223964.0


In [14]:
df_final['date'] = pd.to_datetime(df_final['date'])
df_final = df_final[df_final['date'] > datetime.datetime(1974, 12, 31)] 

In [15]:
query_dsi = """
    SELECT date, vwretd, vwretx
    FROM crsp.dsi
    WHERE date BETWEEN '2010-01-01' AND '2024-12-31'
"""

df_market = conn.raw_sql(query_dsi, date_cols=['date'])

df_final = df_final.merge(df_market, on='date', how='left')

In [16]:
df_final

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx
0,1975-01-02,23924,CFSR,-11.62500,0.000000,1591.0,2.0000,,0.02524,0.025212
1,1975-01-02,23931,NSP,17.12500,0.070313,23233.0,4.0000,12300.0,0.02524,0.025212
2,1975-01-02,23975,CLRK,-21.00000,0.000000,1279.0,45.5625,,0.02524,0.025212
3,1975-01-02,23990,ROH,47.87500,0.035135,12871.0,18.0000,4600.0,0.02524,0.025212
4,1975-01-02,24002,DEW,9.25000,0.000000,16487.0,1.5000,12200.0,0.02524,0.025212
...,...,...,...,...,...,...,...,...,...,...
88359665,2024-12-31,92396,ECH,25.04000,0.000000,18950.0,1.0000,67015.0,-0.003392,-0.003541
88359666,2024-12-31,92397,BKF,36.49050,-0.002550,1850.0,1.0000,2732.0,-0.003392,-0.003541
88359667,2024-12-31,92398,AIA,67.83000,-0.005571,10500.0,1.0000,38260.0,-0.003392,-0.003541
88359668,2024-12-31,92402,MSCI,600.01001,0.000600,78371.0,1.0000,223964.0,-0.003392,-0.003541


In [17]:
df_final.to_csv('stock_daily.csv', index = False)