In [40]:
import pandas as pd
import numpy as np
import polars as pl

from factorlib.utils.system import get_data_dir

In [41]:
raw_data_dir = get_data_dir() / 'raw'
raw_data = pd.read_csv(raw_data_dir / 'div_distcd_dates.csv')

In [42]:
# Each digit of distribution code means something. Separating each digit into columns for easy access.
separated_digits = raw_data

def get_digit(x, digit):
    if pd.notnull(x):
        return int(str(x)[digit])
    return np.nan

separated_digits['cd1'] = separated_digits['DISTCD'].apply(get_digit, args=[0])
separated_digits['cd2'] = separated_digits['DISTCD'].apply(get_digit, args=[1])
separated_digits['cd3'] = separated_digits['DISTCD'].apply(get_digit, args=[2])
separated_digits['cd4'] = separated_digits['DISTCD'].apply(get_digit, args=[3])
separated_digits = separated_digits.drop(columns=['DISTCD'])
separated_digits

Unnamed: 0,PERMNO,DIVAMT,FACSHR,EXDT,RCRDDT,PAYDT,cd1,cd2,cd3,cd4
0,89477,0.02343,0.0,2004-04-27,2000-11-22,2004-04-27,1.0,2.0,1.0,2.0
1,89477,0.00049,0.0,2007-11-27,2000-11-22,2007-11-27,1.0,2.0,1.0,2.0
2,87433,0.00765,0.0,2002-01-04,2001-12-03,2002-01-04,1.0,2.0,1.0,2.0
3,87835,0.00720,0.0,2002-01-17,2001-12-14,2002-01-17,1.0,2.0,1.0,2.0
4,89477,0.01120,0.0,2002-01-11,2001-12-19,2002-01-11,1.0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...
430491,93432,,,,,,,,,
430492,93433,,,,,,,,,
430493,93434,,,,,,,,,
430494,93435,,,,,,,,,


In [43]:
# If the first digit is a 1 and the second digit is a 2, this means that this distribution event is a cash dividend. These are the only distribution codes we want because DivSeason focuses on DIVIDEND seasonality only.
digits_one_two_filter = separated_digits[(separated_digits['cd1'] == 1) & (separated_digits['cd2'] == 2)]

In [44]:
# We have dates like ex_date, rcrd_date, and pay_date, but we need to normalize these to a date_index. Therefore, we are making a time_available column that will serve as the date_index and represents the time that this dividend was available.
with_date_index = digits_one_two_filter
with_date_index['date_index'] = pd.to_datetime(digits_one_two_filter.loc[:, 'EXDT']).dt.to_period('M')
with_date_index


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,PERMNO,DIVAMT,FACSHR,EXDT,RCRDDT,PAYDT,cd1,cd2,cd3,cd4,date_index
0,89477,0.02343,0.0,2004-04-27,2000-11-22,2004-04-27,1.0,2.0,1.0,2.0,2004-04
1,89477,0.00049,0.0,2007-11-27,2000-11-22,2007-11-27,1.0,2.0,1.0,2.0,2007-11
2,87433,0.00765,0.0,2002-01-04,2001-12-03,2002-01-04,1.0,2.0,1.0,2.0,2002-01
3,87835,0.00720,0.0,2002-01-17,2001-12-14,2002-01-17,1.0,2.0,1.0,2.0,2002-01
4,89477,0.01120,0.0,2002-01-11,2001-12-19,2002-01-11,1.0,2.0,1.0,2.0,2002-01
...,...,...,...,...,...,...,...,...,...,...,...
403744,88197,0.15000,0.0,2022-12-30,2023-01-03,2023-01-13,1.0,2.0,3.0,2.0,2022-12
403745,90396,0.49000,0.0,2022-12-30,2023-01-03,2023-01-17,1.0,2.0,3.0,2.0,2022-12
403746,92118,0.02268,0.0,2022-12-30,2023-01-03,2023-01-03,1.0,2.0,7.0,2.0,2022-12
403747,93096,0.55000,0.0,2022-12-30,2023-01-03,2023-01-17,1.0,2.0,3.0,2.0,2022-12


In [45]:
# Now we need to get the total dividend amounts per date, per ticker, per third digit of the distribution code. The third digit of the distribution represents the frequency that the dividend is paid by the company.
div_amts_summed = with_date_index.groupby(['PERMNO', 'cd3', 'date_index']).sum('DIVAMT').reset_index()
div_amts_summed = div_amts_summed.sort_values(by=['PERMNO', 'date_index', 'cd3'])
div_amts_summed

Unnamed: 0,PERMNO,cd3,date_index,DIVAMT,FACSHR,cd1,cd2,cd4
86,10001,3.0,2002-03,0.130,0.0,1.0,2.0,2.0
87,10001,3.0,2002-06,0.135,0.0,1.0,2.0,2.0
88,10001,3.0,2002-09,0.135,0.0,1.0,2.0,2.0
89,10001,3.0,2002-12,0.135,0.0,1.0,2.0,2.0
90,10001,3.0,2003-03,0.135,0.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...
392106,93429,3.0,2021-11,0.480,0.0,1.0,2.0,2.0
392107,93429,3.0,2022-02,0.480,0.0,1.0,2.0,2.0
392108,93429,3.0,2022-05,0.480,0.0,1.0,2.0,2.0
392109,93429,3.0,2022-08,0.480,0.0,1.0,2.0,2.0


In [46]:
# Remove any duplicate rows that have the same date_index and permno
duplicates_removed = div_amts_summed.groupby(['PERMNO', 'date_index']).first().reset_index()
duplicates_removed

Unnamed: 0,PERMNO,date_index,cd3,DIVAMT,FACSHR,cd1,cd2,cd4
0,10001,2002-03,3.0,0.130,0.0,1.0,2.0,2.0
1,10001,2002-06,3.0,0.135,0.0,1.0,2.0,2.0
2,10001,2002-09,3.0,0.135,0.0,1.0,2.0,2.0
3,10001,2002-12,3.0,0.135,0.0,1.0,2.0,2.0
4,10001,2003-03,3.0,0.135,0.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...
386918,93429,2021-11,3.0,0.480,0.0,1.0,2.0,2.0
386919,93429,2022-02,3.0,0.480,0.0,1.0,2.0,2.0
386920,93429,2022-05,3.0,0.480,0.0,1.0,2.0,2.0
386921,93429,2022-08,3.0,0.480,0.0,1.0,2.0,2.0


In [47]:
# Forward-fill any missing distribution codes with the most recent and clean div_amts by filling nans with 0.
cleaned_data = duplicates_removed
cleaned_data['cd3'].fillna(method='ffill', inplace=True)
cleaned_data['DIVAMT'] = cleaned_data['DIVAMT'].fillna(0)
cleaned_data

Unnamed: 0,PERMNO,date_index,cd3,DIVAMT,FACSHR,cd1,cd2,cd4
0,10001,2002-03,3.0,0.130,0.0,1.0,2.0,2.0
1,10001,2002-06,3.0,0.135,0.0,1.0,2.0,2.0
2,10001,2002-09,3.0,0.135,0.0,1.0,2.0,2.0
3,10001,2002-12,3.0,0.135,0.0,1.0,2.0,2.0
4,10001,2003-03,3.0,0.135,0.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...
386918,93429,2021-11,3.0,0.480,0.0,1.0,2.0,2.0
386919,93429,2022-02,3.0,0.480,0.0,1.0,2.0,2.0
386920,93429,2022-05,3.0,0.480,0.0,1.0,2.0,2.0
386921,93429,2022-08,3.0,0.480,0.0,1.0,2.0,2.0


In [48]:
# Make a column representing whether a dividend was paid on time, and filter based on distribution codes
with_divpaid = cleaned_data
with_divpaid['div_paid'] = np.where(with_divpaid['DIVAMT'] > 0, 1, 0)
filtered = with_divpaid[with_divpaid['cd3'] != 2]
filtered = filtered[filtered['cd3'] < 6]
filtered

Unnamed: 0,PERMNO,date_index,cd3,DIVAMT,FACSHR,cd1,cd2,cd4,div_paid
0,10001,2002-03,3.0,0.130,0.0,1.0,2.0,2.0,1
1,10001,2002-06,3.0,0.135,0.0,1.0,2.0,2.0,1
2,10001,2002-09,3.0,0.135,0.0,1.0,2.0,2.0,1
3,10001,2002-12,3.0,0.135,0.0,1.0,2.0,2.0,1
4,10001,2003-03,3.0,0.135,0.0,1.0,2.0,2.0,1
...,...,...,...,...,...,...,...,...,...
386918,93429,2021-11,3.0,0.480,0.0,1.0,2.0,2.0,1
386919,93429,2022-02,3.0,0.480,0.0,1.0,2.0,2.0,1
386920,93429,2022-05,3.0,0.480,0.0,1.0,2.0,2.0,1
386921,93429,2022-08,3.0,0.480,0.0,1.0,2.0,2.0,1


In [50]:
# Create the div12 variable
with_div12 = filtered.set_index(['date_index'])
with_div12 = with_div12.groupby('PERMNO').resample('M', convention='start').first().reset_index('date_index').drop(columns=['PERMNO']).reset_index()
with_div12['div_paid'] = with_div12['div_paid'].fillna(0)
with_div12['div12'] = with_div12.groupby('PERMNO')['div_paid'].rolling(12).sum().reset_index(0, drop=True)
with_div12

Unnamed: 0,PERMNO,date_index,cd3,DIVAMT,FACSHR,cd1,cd2,cd4,div_paid,div12
0,10001,2002-03,3.0,0.130,0.0,1.0,2.0,2.0,1.0,
1,10001,2002-04,,,,,,,0.0,
2,10001,2002-05,,,,,,,0.0,
3,10001,2002-06,3.0,0.135,0.0,1.0,2.0,2.0,1.0,
4,10001,2002-07,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...
773227,93429,2022-07,,,,,,,0.0,4.0
773228,93429,2022-08,3.0,0.480,0.0,1.0,2.0,2.0,1.0,4.0
773229,93429,2022-09,,,,,,,0.0,4.0
773230,93429,2022-10,,,,,,,0.0,4.0


In [53]:
# Now we create our div_season factor!
with_div_season = with_div12
with_div_season['div_season'] = np.where(with_div_season['div12'] > 0, 0, np.nan)

for i in range(3):
    condition = ((with_div_season['cd3'] == 3) | (with_div_season['cd3'] == 0) | (with_div_season['cd3'] == 1)) \
    & (with_div_season.groupby('PERMNO')['div_paid'].shift(i*3) == 1)
    with_div_season.loc[condition, 'div_season'] = 1

for i in [5, 11]:
    condition = (with_div_season['cd3'] == 4) & (with_div_season.groupby('PERMNO')['div_paid'].shift(i) == 1)
    with_div_season.loc[condition, 'div_season'] = 1

condition = (with_div_season['cd3'] == 5) & (with_div_season.groupby('PERMNO')['div_paid'].shift(11) == 1)
with_div_season.loc[condition, 'div_season'] = 1
with_div_season['div_season'] = with_div_season['div_season'].fillna(0)
with_div_season

Unnamed: 0,PERMNO,date_index,cd3,DIVAMT,FACSHR,cd1,cd2,cd4,div_paid,div12,div_season
0,10001,2002-03,3.0,0.130,0.0,1.0,2.0,2.0,1.0,,1.0
1,10001,2002-04,,,,,,,0.0,,0.0
2,10001,2002-05,,,,,,,0.0,,0.0
3,10001,2002-06,3.0,0.135,0.0,1.0,2.0,2.0,1.0,,1.0
4,10001,2002-07,,,,,,,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
773227,93429,2022-07,,,,,,,0.0,4.0,0.0
773228,93429,2022-08,3.0,0.480,0.0,1.0,2.0,2.0,1.0,4.0,1.0
773229,93429,2022-09,,,,,,,0.0,4.0,0.0
773230,93429,2022-10,,,,,,,0.0,4.0,0.0


In [61]:
columns_to_keep = ['date_index', 'PERMNO', 'div_season']
div_season_factor = with_div_season[columns_to_keep]
div_season_factor

Unnamed: 0,date_index,PERMNO,div_season
0,2002-03,10001,1.0
1,2002-04,10001,0.0
2,2002-05,10001,0.0
3,2002-06,10001,1.0
4,2002-07,10001,0.0
...,...,...,...
773227,2022-07,93429,0.0
773228,2022-08,93429,1.0
773229,2022-09,93429,0.0
773230,2022-10,93429,0.0


In [65]:
# Replace permno with tickers
permnos = pd.read_csv(raw_data_dir / 'permnos.csv')
permnos_dict = dict(zip(permnos['PERMNO'], permnos['TICKER']))
final_div_season = div_season_factor.replace(permnos_dict)
final_div_season.rename(columns={'PERMNO': 'ticker'}, inplace=True)
final_div_season

Unnamed: 0,date_index,PERMNO,div_season
0,2002-03,EGAS,1.0
1,2002-04,EGAS,0.0
2,2002-05,EGAS,0.0
3,2002-06,EGAS,1.0
4,2002-07,EGAS,0.0
...,...,...,...
773227,2022-07,CBOE,0.0
773228,2022-08,CBOE,1.0
773229,2022-09,CBOE,0.0
773230,2022-10,CBOE,0.0


In [72]:
fundamentals_data_dir = get_data_dir() / 'fundamental'
final_div_season.to_csv(fundamentals_data_dir / 'div_season.csv')