In [1]:
import pandas as pd
import numpy as np
import wrds
import datetime

In [2]:
# Open a WRDS session. As the recorded output of this cell shows, the call
# prompts interactively for the WRDS username and password and offers to
# create a .pgpass file. Every later query cell goes through this `conn`.
conn = wrds.Connection()

Enter your WRDS username [komalniraula]: kn2505
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  n


You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [3]:
# Pull CRSP daily stock data (crsp.dsf) one calendar year per query, 2014 through 2024.
# The study window is 2015-2024; 2014 is fetched as well so that features that
# depend on past data (e.g. rolling volatility) have history to start from.
years = list(range(2014, 2025))  # range() excludes its stop value, so 2024 is the last year
df_list = []
total_years = len(years)

for position, year in enumerate(years, start=1):
    print(f"Loading {year} ({position}/{total_years})...")

    start_date, end_date = f"{year}-01-01", f"{year}-12-31"

    # Tickers come from crsp.msenames: the join picks the name record whose
    # [namedt, nameendt] range covers the trading date. Because it is a LEFT
    # JOIN, dsf rows with no matching name record are kept with a null ticker.
    # NOTE(review): CRSP documents prc as negative when it is a bid/ask average
    # rather than a trade price - confirm whether downstream code takes abs(prc).
    query = f"""
        SELECT 
            dsf.date,
            dsf.permno,
            names.ticker,
            dsf.prc,
            dsf.vol,
            
            dsf.ret,
            dsf.retx,
            dsf.bidlo,
            dsf.askhi,
            dsf.openprc,
            dsf.shrout,
            dsf.cfacpr,
            dsf.cfacshr
        FROM crsp.dsf AS dsf
        LEFT JOIN crsp.msenames AS names
            ON dsf.permno = names.permno
            AND dsf.date BETWEEN names.namedt AND names.nameendt
        WHERE dsf.date BETWEEN '{start_date}' AND '{end_date}'
    """

    # Collect each year's frame; they are stitched together once after the loop.
    df_list.append(conn.raw_sql(query, date_cols=['date']))

# Single concat over all yearly frames, with a fresh 0..n-1 index.
df_crsp_daily = pd.concat(df_list, ignore_index=True)
df_crsp_daily

Loading 2014 (1/11)...
Loading 2015 (2/11)...
Loading 2016 (3/11)...
Loading 2017 (4/11)...
Loading 2018 (5/11)...
Loading 2019 (6/11)...
Loading 2020 (7/11)...
Loading 2021 (8/11)...
Loading 2022 (9/11)...
Loading 2023 (10/11)...
Loading 2024 (11/11)...


Unnamed: 0,date,permno,ticker,prc,vol,ret,retx,bidlo,askhi,openprc,shrout,cfacpr,cfacshr
0,2014-01-02,10001,EGAS,8.04,72900.0,0.001245,0.001245,7.9225,8.12,8.0,10452.0,1.0,1.0
1,2014-01-02,10025,AEPI,53.04,66074.0,0.003975,0.003975,51.46,53.18,52.68,5601.0,1.0,1.0
2,2014-01-02,10026,JJSF,87.15,50930.0,-0.016255,-0.016255,86.48,88.3,88.23,18681.0,1.0,1.0
3,2014-01-02,10028,DGSE,2.15,11300.0,-0.035874,-0.035874,2.06,2.21,2.2,12176.0,1.0,1.0
4,2014-01-02,10032,PLXS,42.78,145207.0,-0.011781,-0.011781,42.52,43.26,43.22,33787.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22310665,2024-12-31,86778,MKSI,104.39,321621.0,0.003075,0.003075,103.64,106.005,104.67,67299.0,1.0,1.0
22310666,2024-12-31,86783,BKNG,4968.41992,104869.0,-0.004498,-0.004498,4963.25977,5007.22998,4990.0,33097.0,1.0,1.0
22310667,2024-12-31,86799,CNX,36.67,1269648.0,-0.010256,-0.010256,36.64,37.1,36.96,149271.0,1.0,1.0
22310668,2024-12-31,86810,TREX,69.03,536832.0,-0.002889,-0.002889,69.01,70.25,69.67,107144.0,1.0,1.0


In [4]:
# Backup: write the complete, unfiltered pull to disk before any cleaning.
df_crsp_daily.to_csv('all_crsp.csv', index=False)

# Clean on an independent copy so the raw frame stays intact in memory.
df = df_crsp_daily.copy(deep=True)

In [5]:
# Number of missing values in each column
df.isna().sum()

date            0
permno          0
ticker     101888
prc        107721
vol        107734
ret        116690
retx       116690
bidlo      107721
askhi      107721
openprc    714460
shrout          1
cfacpr          1
cfacshr         1
dtype: int64

In [6]:
# Drop the open price entirely: the null count above shows it has far more
# missing values than any other column.
# errors='ignore' keeps this cell re-runnable - on a second run `openprc` is
# already gone from `df`, and a plain drop would raise KeyError.
df = df.drop(columns=['openprc'], errors='ignore')

# A row is only usable if it has a price, a return and shares outstanding
# (needed for closing price, return and b/m calculations).
required_cols = ['prc', 'ret', 'shrout']
df = df.dropna(subset=required_cols)

# Re-check what is still missing after the filter.
df.isnull().sum()

date        0
permno      0
ticker      7
prc         0
vol        18
ret         0
retx        0
bidlo       0
askhi       0
shrout      0
cfacpr      0
cfacshr     0
dtype: int64

In [7]:
# Columns carried forward into the final dataset; all others are left behind.
# Note that only `retx` is kept here - `ret` is not part of the selection.
cols_to_keep = ['date', 'permno', 'ticker', 'prc', 'retx', 'shrout', 'cfacpr', 'vol']

# .copy() gives df_final its own data, so later column assignments on it
# do not touch `df`.
df_final = df.loc[:, cols_to_keep].copy()
df_final

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol
0,2014-01-02,10001,EGAS,8.04,0.001245,10452.0,1.0,72900.0
1,2014-01-02,10025,AEPI,53.04,0.003975,5601.0,1.0,66074.0
2,2014-01-02,10026,JJSF,87.15,-0.016255,18681.0,1.0,50930.0
3,2014-01-02,10028,DGSE,2.15,-0.035874,12176.0,1.0,11300.0
4,2014-01-02,10032,PLXS,42.78,-0.011781,33787.0,1.0,145207.0
...,...,...,...,...,...,...,...,...
22310665,2024-12-31,86778,MKSI,104.39,0.003075,67299.0,1.0,321621.0
22310666,2024-12-31,86783,BKNG,4968.41992,-0.004498,33097.0,1.0,104869.0
22310667,2024-12-31,86799,CNX,36.67,-0.010256,149271.0,1.0,1269648.0
22310668,2024-12-31,86810,TREX,69.03,-0.002889,107144.0,1.0,536832.0


In [8]:
# Make sure `date` is a pandas datetime column before it is used as a merge key.
df_final['date'] = df_final['date'].pipe(pd.to_datetime)

Value-weighted market returns: add the CRSP value-weighted index returns to the stock data so they can serve as the market return.

In [9]:
# Pull the daily market index series from crsp.dsi.
# NOTE(review): per CRSP naming, vwretd / vwretx are presumably the
# value-weighted index return including / excluding distributions - confirm
# against the CRSP data dictionary.
# The query starts in 2010 while the stock data starts in 2014; the extra
# index rows are simply not matched by the left merge below.
query_dsi = """
    SELECT date, vwretd, vwretx
    FROM crsp.dsi
    WHERE date BETWEEN '2010-01-01' AND '2024-12-31'
"""

df_market = conn.raw_sql(query_dsi, date_cols=['date'])

# Make the cell safe to re-run: if the market columns were already merged in
# by an earlier run, remove them first so the merge does not create
# vwretd_x / vwretd_y style duplicates.
market_cols = [col for col in df_market.columns if col != 'date']
stale_cols = [col for col in market_cols if col in df_final.columns]
stock_side = df_final.drop(columns=stale_cols) if stale_cols else df_final

# Left merge keeps every stock row. validate='many_to_one' raises a MergeError
# if the index table ever holds more than one row per date, which would
# otherwise silently duplicate stock rows.
df_final = stock_side.merge(df_market, on='date', how='left', validate='many_to_one')
df_final

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx
0,2014-01-02,10001,EGAS,8.04,0.001245,10452.0,1.0,72900.0,-0.008757,-0.008884
1,2014-01-02,10025,AEPI,53.04,0.003975,5601.0,1.0,66074.0,-0.008757,-0.008884
2,2014-01-02,10026,JJSF,87.15,-0.016255,18681.0,1.0,50930.0,-0.008757,-0.008884
3,2014-01-02,10028,DGSE,2.15,-0.035874,12176.0,1.0,11300.0,-0.008757,-0.008884
4,2014-01-02,10032,PLXS,42.78,-0.011781,33787.0,1.0,145207.0,-0.008757,-0.008884
...,...,...,...,...,...,...,...,...,...,...
22193975,2024-12-31,86778,MKSI,104.39,0.003075,67299.0,1.0,321621.0,-0.003392,-0.003541
22193976,2024-12-31,86783,BKNG,4968.41992,-0.004498,33097.0,1.0,104869.0,-0.003392,-0.003541
22193977,2024-12-31,86799,CNX,36.67,-0.010256,149271.0,1.0,1269648.0,-0.003392,-0.003541
22193978,2024-12-31,86810,TREX,69.03,-0.002889,107144.0,1.0,536832.0,-0.003392,-0.003541


In [10]:
# Write the final stock-level frame (with the merged market return columns)
# to stock_daily.csv in the working directory, without the row index.
df_final.to_csv('stock_daily.csv', index = False)