In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import statsmodels.api as sm
from tqdm import tqdm
import os
import datetime

path_data = '/Users/daros/Documents/MBAN/Data Driven Investments - BAIF 508/Final project/Dividend Yield/'


In [10]:
crsp_dv = pd.read_csv(path_data+'ikd7pnvlgcwtgfuu.csv')

In [11]:
len(crsp_dv)

3373531

In [13]:
crsp_dv.head()

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
0,10000,1985-12-31,,,,,7952,,,,
1,10000,1986-01-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.375,C,3680.0,C
2,10000,1986-02-28,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.25,-0.257143,3680.0,-0.257143
3,10000,1986-03-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.4375,0.365385,3680.0,0.365385
4,10000,1986-04-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.0,-0.098592,3793.0,-0.098592


SHRCD : Sharecode

EXCHCD: Exchange Code

PERMCO: Permanant Identifier

PRC: Price

RET: Holding Period Return

SHROUT: Number of Shares outstanding

RETX: RETX contains returns without dividends. Ordinary dividends and certain other regularly taxable dividends are excluded from the returns calculation. The formula is the same as for RET except d(t) is usually 0.

### Cleaning Data

In [21]:
crsp_dv[crsp_dv['RET']=='C']

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
1,10000,1986-01-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.375,C,3680.0,C
20,10001,1986-01-31,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,-6.125,C,985.0,C
401,10002,1986-01-31,10.0,3.0,MBNC,MOBILE NATIONAL CORP,7954,-11.625,C,1175.0,C
728,10003,1986-01-31,11.0,3.0,GCBK,GREAT COUNTRY BK ASONIA CT,7957,-22.000,C,1900.0,C
849,10005,1986-01-31,10.0,3.0,WERC,WESTERN ENERGY RESOURCES INC,7961,-0.375,C,4655.0,C
...,...,...,...,...,...,...,...,...,...,...,...
3373119,93432,2010-06-30,11.0,3.0,JGBO,JIANGBO PHARMACEUTICALS INC,53450,9.430,C,11702.0,C
3373133,93433,2010-06-30,11.0,3.0,MOTR,MOTRICITY INC,53451,8.800,C,39976.0,C
3373213,93434,2010-06-30,11.0,3.0,SANW,S & W SEED CO,53427,3.270,C,5800.0,C
3373360,93435,2010-06-30,11.0,3.0,SCEI,SINO CLEAN ENERGY INC,53452,6.200,C,16557.0,C


#### ChatGPT about C Values:
In short, a "C" value for RET in CRSP data indicates that the security in question underwent a distribution during the period being considered and that the return calculation includes the amount of cash received as part of the total return.

droping values with C:

In [22]:
crsp_dv = crsp_dv[crsp_dv['RET']!='C']

In [23]:
crsp_dv[crsp_dv['RET']=='B']

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX
940,10007,1987-12-31,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
941,10007,1988-01-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
942,10007,1988-02-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
943,10007,1988-03-31,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
944,10007,1988-04-29,10.0,0.0,,SHAREDATA INC,7963,,B,2064.0,B
...,...,...,...,...,...,...,...,...,...,...,...
3367197,93369,2016-10-31,11.0,0.0,,ACCRETIVE HEALTH INC,53403,,B,97267.0,B
3367198,93369,2016-11-30,11.0,0.0,,ACCRETIVE HEALTH INC,53403,,B,97267.0,B
3367199,93369,2016-12-30,11.0,0.0,,ACCRETIVE HEALTH INC,53403,,B,97267.0,B
3367200,93369,2017-01-31,11.0,0.0,,ACCRETIVE HEALTH INC,53403,,B,97267.0,B


#### ChatGPT about B Values:
In short, a "B" value for RET in CRSP data indicates that the security in question underwent a buyback or a repurchase of shares during the period being considered and that the return calculation includes the amount of capital gain resulting from the buyback as part of the total return.

In [24]:
crsp_dv = crsp_dv[crsp_dv['RET']!='B'].reset_index(drop=True)

#### Drop negative prices:

In [79]:
crsp_dv = crsp_dv[crsp_dv['PRC']>=0].reset_index(drop=True)

### Figuring out Dividends

We calculate monthly dividends as the begin-of-month market equity times the difference between returns with and without dividends. (Replicating Anomalies- page 2081) 

Market equity is calculated by multiplying the current market price per share by the total number of outstanding shares of a company's stock. We will use this at the annual level.

In [27]:
crsp_dv.loc[:,'RET']=crsp_dv['RET'].astype(float)
crsp_dv.loc[:,'RETX']=crsp_dv['RETX'].astype(float)
crsp_dv['RET_DM']=(crsp_dv['RET']-crsp_dv['RETX']) #Monthly dividends calculated from return difference

  crsp_dv.loc[:,'RET']=crsp_dv['RET'].astype(float)
  crsp_dv.loc[:,'RETX']=crsp_dv['RETX'].astype(float)


#### Function to have data from last year July to this year June:

In [31]:
def fiscal_year(date):
    if (pd.to_datetime(date).month>6):
        year=pd.to_datetime(date).year+1
        month=6
        day=30
        return pd.to_datetime(str(year)+'-'+str(month)+'-'+str(day) )
    else:
        year=pd.to_datetime(date).year
        month=6
        day=30 
        return pd.to_datetime(str(year)+'-'+str(month)+'-'+str(day) )        

Convert the fiscal year in dataframe:

In [32]:
crsp_dv['date_dv'] = crsp_dv['date'].apply(lambda x: fiscal_year(x))
crsp_dv.head()

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,RET,SHROUT,RETX,RET_DM,date_dv
0,10000,1985-12-31,,,,,7952,,,,,,1986-06-30
1,10000,1986-02-28,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.25,-0.257143,3680.0,-0.257143,0.0,1986-06-30
2,10000,1986-03-31,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.4375,0.365385,3680.0,0.365385,0.0,1986-06-30
3,10000,1986-04-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-4.0,-0.098592,3793.0,-0.098592,0.0,1986-06-30
4,10000,1986-05-30,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.10938,-0.222656,3793.0,-0.222656,0.0,1986-06-30


#### Summing RET_DM for one year:

In [58]:
dv= crsp_dv.groupby(['PERMCO','date_dv']).sum('RET_DM').reset_index()[['date_dv','PERMCO','RET_DM']]
dv

Unnamed: 0,date_dv,PERMCO,RET_DM
0,1985-06-30,2,0.000000
1,1986-06-30,2,0.077015
2,1987-06-30,2,0.092750
3,1988-06-30,2,0.063659
4,1989-06-30,2,0.074080
...,...,...,...
282846,2023-06-30,59373,0.000000
282847,2023-06-30,59374,0.000000
282848,2023-06-30,59375,0.000000
282849,2023-06-30,59376,0.000000


#### Choosing the required data and merge them:

In [84]:
dv_02 = crsp_dv[['PERMNO','SHRCD','EXCHCD','TICKER','COMNAM','PERMCO','PRC','SHROUT','date']].reset_index(drop=True)
dv_02['date']= pd.to_datetime(dv_02['date'])
dv_02

Unnamed: 0,PERMNO,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,SHROUT,date
0,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,6.37500,991.0,1986-09-30
1,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,6.62500,991.0,1986-10-31
2,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,7.00000,991.0,1986-11-28
3,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,7.00000,991.0,1986-12-31
4,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,6.75000,991.0,1987-01-30
...,...,...,...,...,...,...,...,...,...
2954286,93436,11.0,3.0,TSLA,TESLA INC,53453,1077.59998,1035976.0,2022-03-31
2954287,93436,11.0,3.0,TSLA,TESLA INC,53453,870.76001,1036010.0,2022-04-29
2954288,93436,11.0,3.0,TSLA,TESLA INC,53453,758.26001,1036010.0,2022-05-31
2954289,93436,11.0,3.0,TSLA,TESLA INC,53453,673.41998,1041000.0,2022-06-30


In [78]:
dv_t = pd.merge(dv_02,dv,left_on=['PERMCO','date'],right_on=['PERMCO','date_dv'])
dv_t = dv_t.drop('date_dv',axis=1)
dv_t['dvy']=dv_t['RET_DM']*(abs(dv_t['PRC'])*dv_t['SHROUT'])
dv_t

Unnamed: 0,PERMNO,SHRCD,EXCHCD,TICKER,COMNAM,PERMCO,PRC,SHROUT,date,RET_DM,dvy
0,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,-3.09375,3793.0,1986-06-30,0.000000,0.000000
1,10000,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,7952,,3893.0,1987-06-30,0.000000,
2,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,-6.12500,985.0,1986-06-30,0.031834,192.058501
3,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,5.87500,991.0,1987-06-30,0.066416,386.682254
4,10001,11.0,3.0,GFGC,GREAT FALLS GAS CO,7953,6.25000,992.0,1988-06-30,0.066508,412.349600
...,...,...,...,...,...,...,...,...,...,...,...
202220,93436,11.0,3.0,TSLA,TESLA MOTORS INC,53453,212.28000,148015.0,2016-06-30,0.000000,0.000000
202221,93436,11.0,3.0,TSLA,TESLA INC,53453,361.60999,166863.0,2017-06-30,0.000000,0.000000
202222,93436,11.0,3.0,TSLA,TESLA INC,53453,1079.81006,186000.0,2020-06-30,0.000000,0.000000
202223,93436,11.0,3.0,TSLA,TESLA INC,53453,679.70001,984003.0,2021-06-30,0.000000,0.000000


In [85]:
dv_t.to_csv('dividend_yearly.csv')