# Full data

In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

CRSP from 199801 to 201912.

In [2]:
# Load the CRSP extract. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks, so each column gets one consistently inferred dtype.
data = pd.read_csv(
    'crsp_9819.csv',
    low_memory=False,
)

## Adjustments

In [3]:
# Show the dtype pandas inferred for every column of the freshly loaded frame.
data.dtypes

PERMNO       int64
date         int64
SHRCD      float64
SICCD       object
TICKER      object
COMNAM      object
TRDSTAT     object
SECSTAT     object
PERMCO       int64
HSICCD      object
CUSIP       object
HSICMG     float64
HSICIG     float64
DLRETX      object
DLRET       object
PRC        float64
VOL        float64
RET         object
SHROUT     float64
SPREAD     float64
RETX        object
vwretd     float64
ewretd     float64
dtype: object

### Returns

* $RET$ == C excluded
* $RET$ == B excluded
* $RET$ == NaN set to zero

In [4]:
# Number of observations before any filtering.
len(data['RET'])

1950751

In [5]:
# Count how often each letter code (and NaN) appears in RET.
# A boolean mask summed directly gives the same count as filtering and taking .size.
ret_col = data['RET']
print('RET == E observations: {}'.format((ret_col == 'E').sum()))
print('RET == D observations: {}'.format((ret_col == 'D').sum()))
print('RET == C observations: {} (No valid previous price)'.format((ret_col == 'C').sum()))
print('RET == B observations: {} (Off-exchange)'.format((ret_col == 'B').sum()))
print('RET == A observations: {}'.format((ret_col == 'A').sum()))
print('RET == NaN observations: {} (No valid price)'.format(ret_col.isna().sum()))

RET == E observations: 0
RET == D observations: 0
RET == C observations: 11791 (No valid previous price)
RET == B observations: 26446 (Off-exchange)
RET == A observations: 0
RET == NaN observations: 24556 (No valid price)


In [6]:
# Split the frame on whether RET is missing; the two parts must add up to the full row count.
ret_is_missing = data['RET'].isna()
data_ret_nan = data.loc[ret_is_missing]
data_ret_non_nan = data.loc[~ret_is_missing]
len(data_ret_non_nan) + len(data_ret_nan)

1950751

In [7]:
# Missing returns become zero; rows flagged 'C' or 'B' are dropped.
# The zero-filled rows are placed first in the recombined frame, as before.
data_ret_nan = data_ret_nan.assign(RET=0)
has_excluded_code = data_ret_non_nan['RET'].isin(['C', 'B'])
data_ret_non_nan = data_ret_non_nan.loc[~has_excluded_code]
data_ret_adj = pd.concat([data_ret_nan, data_ret_non_nan])

In [8]:
# Row count after the return adjustments.
len(data_ret_adj['RET'])

1912514

In [9]:
# From here on `data` refers to the return-adjusted frame (same object, not a copy);
# the frame as originally loaded is no longer reachable under this name.
data = data_ret_adj

In [10]:
# Sanity check: rows removed (before minus after) equal the 'C' plus 'B' counts printed above.
(1950751 - 1912514) == (11791 + 26446)

True

### Prices

Prices are first converted to absolute values; observations with $|PRC| < 1$ are then excluded.

In [11]:
# Work with the price magnitude only.
# NOTE(review): presumably needed because CRSP marks bid/ask-average prices with a
# negative sign — confirm against the CRSP documentation.
data['PRC'] = np.abs(data['PRC'])

In [12]:
# Keep observations priced at 1 or above (rows with a missing price fail the test and drop out too).
data = data.loc[data['PRC'].ge(1)]

In [13]:
# Row count after the price filter.
len(data['RET'])

1822680

### Sharecodes

Only sharecodes $10$ and $11$ are of interest (ordinary common shares of US stocks).

In [14]:
# Restrict the sample to share codes 10 and 11.
data = data.loc[data['SHRCD'].isin([10, 11])]

In [15]:
# Row count after the share-code filter.
len(data['RET'])

1164202

### Delisting returns

Delisting returns are adjusted following the Shumway (1997) procedure. $RETX$ is adjusted as well (for robustness testing).

In [16]:
# Count how often each letter code (and NaN) appears in DLRET.
dlret_col = data['DLRET']
print('DLRET == S observations: {} (CRSP has no source to establish a value after delisting)'.format((dlret_col == 'S').sum()))
print('DLRET == T observations: {}'.format((dlret_col == 'T').sum()))
print('DLRET == A observations: {} (Security is still active)'.format((dlret_col == 'A').sum()))
print('DLRET == P observations: {}'.format((dlret_col == 'P').sum()))
print('DLRET == NaN observations: {}'.format(dlret_col.isna().sum()))

DLRET == S observations: 3 (CRSP has no source to establish a value after delisting)
DLRET == T observations: 0
DLRET == A observations: 3415 (Security is still active)
DLRET == P observations: 0
DLRET == NaN observations: 1159746


In [17]:
# Rows that carry an actual delisting return: DLRET is present and is neither 'S' nor 'A'.
# Computed from the frame itself rather than from counts copied out of the previous
# cell's output, so the figure stays correct if the input file or an upstream filter changes.
n_numeric_dlret = (data['DLRET'].notna() & ~data['DLRET'].isin(['S', 'A'])).sum()
print('Hence {} DLRET entries'.format(n_numeric_dlret))

Hence 1038 DLRET entries


In [18]:
# Rows with no DLRET entry at all: passed through without any adjustment.
dlret_absent = data['DLRET'].isna()
non_dl_data = data.loc[dlret_absent]
len(non_dl_data['RET'])

1159746

In [19]:
# Rows that do have a DLRET entry (a value or a letter code): candidates for adjustment.
dlret_present = data['DLRET'].notna()
dl_rets = data.loc[dlret_present]
len(dl_rets['RET'])

4456

In [20]:
# Code 'A' rows are left untouched; everything that is neither 'A' nor 'S'
# is treated as carrying a usable delisting return.
dlret_values = dl_rets['DLRET']
df_rets_no_adjust = dl_rets.loc[dlret_values.eq('A')]
dl_rets_has_dlret = dl_rets.loc[~dlret_values.isin(['A', 'S'])]

In [21]:
# Replace the returns with the delisting returns for rows that carry one:
# RET takes DLRET, and RETX takes DLRETX (its ex-dividend counterpart).
# Previously RETX was also filled from DLRET, which mixed a with-dividend figure
# into the ex-dividend series.
# Where DLRETX is missing or is not a number, fall back to DLRET so that the later
# float conversion of RETX behaves as it did before.
dlretx_numeric = pd.to_numeric(dl_rets_has_dlret['DLRETX'], errors='coerce')
dl_rets_has_dlret = dl_rets_has_dlret.assign(
    RET=dl_rets_has_dlret['DLRET'],
    RETX=dlretx_numeric.where(dlretx_numeric.notna(), dl_rets_has_dlret['DLRET']),
)

In [22]:
# Rows flagged 'S' have no delisting return available; both return series are set to -1.
# NOTE(review): the text cites Shumway (1997); confirm that -1 (a total loss) is the
# intended replacement value for these rows.
dl_rets_has_nodl = dl_rets.loc[dl_rets['DLRET'].eq('S')].assign(RET=-1, RETX=-1)

In [23]:
# Stitch the four disjoint pieces back together and restore the original row order via the index.
delisting_parts = [non_dl_data, df_rets_no_adjust, dl_rets_has_dlret, dl_rets_has_nodl]
data = pd.concat(delisting_parts).sort_index()

In [24]:
# Row count after recombining; it should match the count before the delisting split.
len(data['RET'])

1164202

### ME variable

Market value of equity

In [25]:
# Market equity = price x shares outstanding, scaled down by 1000.
# NOTE(review): assumes SHROUT is reported in thousands, so ME ends up in millions — confirm.
data["ME"] = data["PRC"].mul(data["SHROUT"]).div(1000)

### Date conversion

In [26]:
def date_conversion(date):
    """Return the first six characters of ``date`` (shorter inputs come back unchanged)."""
    return date[0:6]


# Element-wise version of date_conversion for array-like inputs.
v_date_conversion = np.vectorize(date_conversion)

In [27]:
# Render the integer dates as text and keep only the first six characters.
# The pandas .str accessor does the slicing in one vectorized call; unlike a
# np.vectorize wrapper it does not loop in Python and also copes with an empty frame.
data["date"] = data["date"].astype(str).str[:6]

In [28]:
# Turn the truncated six-character date strings back into integers.
data['date'] = data['date'].astype(int)

## Needed variables

In [29]:
# Re-inspect the column dtypes after the adjustments (ME has been added; RET and RETX are converted later).
data.dtypes

PERMNO       int64
date         int64
SHRCD      float64
SICCD       object
TICKER      object
COMNAM      object
TRDSTAT     object
SECSTAT     object
PERMCO       int64
HSICCD      object
CUSIP       object
HSICMG     float64
HSICIG     float64
DLRETX      object
DLRET       object
PRC        float64
VOL        float64
RET         object
SHROUT     float64
SPREAD     float64
RETX        object
vwretd     float64
ewretd     float64
ME         float64
dtype: object

In [30]:
# Keep only the identifier, date, return, size and market-return columns.
data = data.loc[
    :,
    ['PERMNO', 'date', 'TICKER', 'COMNAM', 'RET', 'RETX', 'ME', 'vwretd', 'ewretd'],
]

In [31]:
# Both return columns are still object-typed at this point; convert them to floats.
for return_col in ('RET', 'RETX'):
    data[return_col] = data[return_col].astype(float)

In [32]:
# `fdata` is a second name for the same DataFrame object, not a copy:
# changes made through either name are visible through the other.
fdata = data