In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import statsmodels.api as sm
from tqdm import tqdm
import os
import datetime

# get the absolute path of the notebook file
notebook_path = os.path.abspath("CRSP_Dividend.ipynb")
# get the directory containing the notebook file
path_data = os.path.dirname(notebook_path)

In [2]:
crsp_dv = pd.read_csv('crsp_div.csv')

In [3]:
len(crsp_dv)

2246550

In [4]:
crsp_dv.head()

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
0,10000,1985-12-31,,,,,7952,,,,
1,10000,1986-01-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.375,C,3680.0,C
2,10000,1986-02-28,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.25,-0.257143,3680.0,-0.257143
3,10000,1986-03-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.4375,0.365385,3680.0,0.365385
4,10000,1986-04-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.0,-0.098592,3793.0,-0.098592


SHRCD : Sharecode

EXCHCD: Exchange Code

PERMCO: Permanant Identifier

PRC: Price

RET: Holding Period Return

SHROUT: Number of Shares outstanding

RETX: RETX contains returns without dividends. Ordinary dividends and certain other regularly taxable dividends are excluded from the returns calculation. The formula is the same as for RET except d(t) is usually 0.

### Cleaning Data

In [5]:
crsp_dv[crsp_dv['RET']=='C']

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
1,10000,1986-01-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.375,C,3680.0,C
20,10001,1986-01-31,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,-6.125,C,985.0,C
315,10002,1986-01-31,10.0,3.0,MBNC,MOBILE NATIONAL CORP,7954,-11.625,C,1175.0,C
610,10003,1986-01-31,11.0,3.0,GCBK,GREAT COUNTRY BK ASONIA CT,7957,-22.000,C,1900.0,C
731,10005,1986-01-31,10.0,3.0,WERC,WESTERN ENERGY RESOURCES INC,7961,-0.375,C,4655.0,C
...,...,...,...,...,...,...,...,...,...,...,...
2246541,93432,2010-06-30,11.0,3.0,JGBO,JIANGBO PHARMACEUTICALS INC,53450,9.430,C,11702.0,C
2246543,93433,2010-06-30,11.0,3.0,MOTR,MOTRICITY INC,53451,8.800,C,39976.0,C
2246545,93434,2010-06-30,11.0,3.0,SANW,S & W SEED CO,53427,3.270,C,5800.0,C
2246547,93435,2010-06-30,11.0,3.0,SCEI,SINO CLEAN ENERGY INC,53452,6.200,C,16557.0,C


#### Comment about C values in RET:
Following our research, a "C" value for RET in CRSP data indicates that the security in question underwent a distribution during the period being considered and that the return calculation includes the amount of cash received as part of the total return.

Droping values with C:

In [6]:
crsp_dv = crsp_dv[crsp_dv['RET']!='C']

In [7]:
crsp_dv[crsp_dv['RET']=='B']

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
822,10007,1987-12-31,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
823,10007,1988-01-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
824,10007,1988-02-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
825,10007,1988-03-31,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
826,10007,1988-04-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
...,...,...,...,...,...,...,...,...,...,...,...
2245717,93263,2010-02-26,11.0,0.0,,CELSIUS HOLDINGS INC,53346,,B,15631.0,B
2245718,93263,2010-03-31,11.0,0.0,,CELSIUS HOLDINGS INC,53346,,B,15631.0,B
2245719,93263,2010-04-30,11.0,0.0,,CELSIUS HOLDINGS INC,53346,,B,15631.0,B
2245720,93263,2010-05-28,11.0,0.0,,CELSIUS HOLDINGS INC,53346,,B,15631.0,B


#### Comment about B Values in RET:
A "B" value for RET in CRSP data indicates that the security in question underwent a buyback or a repurchase of shares during the period being considered and that the return calculation includes the amount of capital gain resulting from the buyback as part of the total return.

In [8]:
crsp_dv = crsp_dv[crsp_dv['RET']!='B'].reset_index(drop=True)

### Figuring out Dividends

We calculate monthly dividends as the begin-of-month market equity times the difference between returns with and without dividends. (Replicating Anomalies- page 2081) 

Market equity is calculated by multiplying the current market price per share by the total number of outstanding shares of a company's stock. We will use this at the annual level.

In [9]:
crsp_dv.loc[:,'RET']=crsp_dv['RET'].astype(float)
crsp_dv.loc[:,'RETX']=crsp_dv['RETX'].astype(float)
crsp_dv['RET_DM']=(crsp_dv['RET']-crsp_dv['RETX']) #Monthly dividends calculated from return difference
crsp_dv['MARKET_CAP']=crsp_dv['SHROUT']*(abs(crsp_dv['PRC'])) #Monthly market cap
crsp_dv['DVY_M']=crsp_dv['RET_DM']*crsp_dv['MARKET_CAP'] #Monthly dividend yield
crsp_dv['DVY_M'] = pd.to_numeric(crsp_dv['DVY_M'], errors='coerce')


#### Function to have data from last year July to this year June ("fiscal year"):

In [10]:
def fiscal_year(date):
    if (pd.to_datetime(date).month>6):
        year=pd.to_datetime(date).year+1
        month=6
        day=30
        return pd.to_datetime(str(year)+'-'+str(month)+'-'+str(day) )
    else:
        year=pd.to_datetime(date).year
        month=6
        day=30 
        return pd.to_datetime(str(year)+'-'+str(month)+'-'+str(day) )        

Having the fiscal year in dataframe:

In [11]:
crsp_dv['date_dv'] = crsp_dv['date'].apply(lambda x: fiscal_year(x))
crsp_dv.head()

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX,RET_DM,MARKET_CAP,DVY_M,date_dv
0,10000,1985-12-31,,,,,7952,,,,,,,,1986-06-30
1,10000,1986-02-28,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.25,-0.257143,3680.0,-0.257143,0.0,11960.0,0.0,1986-06-30
2,10000,1986-03-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.4375,0.365385,3680.0,0.365385,0.0,16330.0,0.0,1986-06-30
3,10000,1986-04-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.0,-0.098592,3793.0,-0.098592,0.0,15172.0,0.0,1986-06-30
4,10000,1986-05-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.10938,-0.222656,3793.0,-0.222656,0.0,11793.87834,0.0,1986-06-30


#### Summing DVY_M for each fiscal year:

In [17]:
dv= crsp_dv.groupby(['PERMCO','date_dv']).sum('DVY_M').reset_index()[['date_dv','PERMCO','DVY_M']]
dv

Unnamed: 0,date_dv,PERMCO,DVY_M
0,1986-06-30,2,1767.284752
1,1987-06-30,2,5065.102904
2,1988-06-30,2,3779.932210
3,1989-06-30,2,5309.191852
4,1990-06-30,2,5981.804823
...,...,...,...
195706,2006-06-30,58620,0.000000
195707,2007-06-30,58620,0.000000
195708,2008-06-30,58620,1254.665008
195709,2009-06-30,58620,1614.889277


#### Choosing the required data and merge them:

In [18]:
dv_02 = crsp_dv[['PERMNO','SHRCD','EXCHCD','TICKER','COMNAM','PERMCO','PRC','SHROUT','date','MARKET_CAP']].reset_index(drop=True)
dv_02['date']= pd.to_datetime(dv_02['date'])
dv_02

Unnamed: 0,PERMNO,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,SHROUT,date,MARKET_CAP
0,10000,,,,,7952,,,1985-12-31,
1,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.25000,3680.0,1986-02-28,11960.00000
2,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.43750,3680.0,1986-03-31,16330.00000
3,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.00000,3793.0,1986-04-30,15172.00000
4,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.10938,3793.0,1986-05-30,11793.87834
...,...,...,...,...,...,...,...,...,...,...
2191404,93432,,,,,53450,,,2010-05-28,
2191405,93433,,,,,53451,,,2010-05-28,
2191406,93434,,,,,53427,,,2010-05-28,
2191407,93435,,,,,53452,,,2010-05-28,


In [19]:
dv_t = pd.merge(dv_02,dv,left_on=['PERMCO','date'],right_on=['PERMCO','date_dv'])
dv_t = dv_t.drop('date_dv',axis=1)
dv_t['DVY_A']=dv_t['DVY_M']/(dv_t['MARKET_CAP']) #Yearly dividend yield
dv_t

Unnamed: 0,PERMNO,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,SHROUT,date,MARKET_CAP,DVY_M,DVY_A
0,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.09375,3793.0,1986-06-30,11734.59375,0.000000,0.000000
1,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,,3893.0,1987-06-30,,0.000000,
2,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,-6.12500,985.0,1986-06-30,6033.12500,194.865751,0.032299
3,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,5.87500,991.0,1987-06-30,5822.12500,419.734086,0.072093
4,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,6.25000,992.0,1988-06-30,6200.00000,404.015932,0.065164
...,...,...,...,...,...,...,...,...,...,...,...,...
140894,93400,11.0,3.0,RLOC,REACHLOCAL INC,53426,12.97000,27824.0,2010-06-30,360877.28000,0.000000,0.000000
140895,93401,11.0,3.0,TNAV,TELENAV INC,53428,8.39000,42140.0,2010-06-30,353554.60000,0.000000,0.000000
140896,93402,11.0,3.0,TPCG,T P C GROUP INC,53429,16.60000,18256.0,2010-06-30,303049.60000,0.000000,0.000000
140897,93422,,,,,53439,,,2010-06-30,,0.000000,


In [20]:
dv_t.to_csv('dividend_yearly.csv')