In [None]:
import datetime
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import DateOffset

from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, nearest_workday, \
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, USMemorialDay, \
    USLaborDay, USThanksgivingDay
from pandas.tseries.holiday import sunday_to_monday

class USTradingCalendar(AbstractHolidayCalendar):
    """Holiday calendar approximating scheduled full-day US equity market closures.

    Used to build the business-day grid onto which each security's return
    series is re-indexed. Only recurring, rule-based holidays are modelled;
    one-off closures are not represented here.
    """
    rules = [
        # A Sunday New Year's Day is observed on the following Monday, but a
        # Saturday one is NOT moved back to Friday Dec 31, which remains a
        # regular (year-end) trading day. `nearest_workday` would wrongly mark
        # that Friday as a holiday.
        Holiday('NewYearsDay', month=1, day=1, observance=sunday_to_monday),
        USMartinLutherKingJr,
        USPresidentsDay,
        GoodFriday,
        USMemorialDay,
        # Juneteenth became an exchange holiday in 2022; `start_date` keeps
        # earlier years unaffected.
        Holiday('Juneteenth', month=6, day=19,
                start_date=datetime.datetime(2022, 1, 1), observance=nearest_workday),
        Holiday('USIndependenceDay', month=7, day=4, observance=nearest_workday),
        USLaborDay,
        USThanksgivingDay,
        Holiday('Christmas', month=12, day=25, observance=nearest_workday)
    ]

def get_trading_close_holidays(min_date, max_date):
    """Return the ``USTradingCalendar`` holidays between two dates.

    Parameters
    ----------
    min_date, max_date
        Start and end of the range, passed straight through to
        ``AbstractHolidayCalendar.holidays``.

    Returns
    -------
    The holiday dates produced by the calendar's rules for that range.
    """
    return USTradingCalendar().holidays(start=min_date, end=max_date)

In [2]:
# Load the daily CRSP panel from a local pickle. Later cells rely on its
# 'date' and 'PERMNO' columns and treat the remaining columns, in order, as
# PERMNO, TICKER, PRC, RET once 'date' is moved to the index.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
ret_df = pd.read_pickle('data/crsp_dailydata_holding_w_prices.pkl')

In [3]:
# Date span covered by the panel.
min_date, max_date = ret_df['date'].min(), ret_df['date'].max()

# Calendar holidays falling inside that span.
holidays = get_trading_close_holidays(min_date=min_date, max_date=max_date)

# Custom business days (weekdays minus the holidays above) across the span,
# materialised as a plain list of Timestamps.
list_bdays = pd.bdate_range(
    start=min_date,
    end=max_date,
    freq='C',
    holidays=holidays,
).tolist()

In [5]:
# One [permno, sub-frame] pair per security, materialised up front so the
# progress bar in the next cell knows the total count.
ret_df_split_list = [list(group) for group in ret_df.groupby('PERMNO')]

In [9]:
# Forward-looking compounded returns: column -> number of rows compounded,
# starting at (and including) the current row.
FORWARD_RETURN_WINDOWS = {'30d_ret': 30, '60d_ret': 60, '2d_ret': 2, '3d_ret': 3}
# Backward-looking compounded returns: column -> number of rows compounded,
# ending at the row just before the current one.
BACKWARD_RETURN_WINDOWS = {'back_1d': 1, 'back_5d': 5, 'back_30d': 30}


def fill_permno_calendar(permno, permno_df, bday_index):
    """Re-index one security onto the business-day grid and add period returns.

    Parameters
    ----------
    permno
        Identifier written into the ``PERMNO`` column of every row, including
        the rows created by the re-index.
    permno_df : pd.DataFrame
        Rows of a single security. Must contain a ``date`` column; the other
        columns are renamed positionally to PERMNO, TICKER, PRC, RET.
    bday_index : pd.DatetimeIndex
        Sorted business days covering the whole sample.

    Returns
    -------
    pd.DataFrame
        One row per date in the union of the security's own dates and the
        business days between its first and last date. Missing ``RET`` values
        are set to 0 and the compounded-return columns are appended. Rows
        without a full window are NaN.
    """
    obs_dates = permno_df['date']
    first_date, last_date = obs_dates.min(), obs_dates.max()

    # Business days inside this security's own span, plus any observed dates
    # that are not on the grid (so no original row is dropped).
    in_span = bday_index[(bday_index >= first_date) & (bday_index <= last_date)]
    new_dates = in_span.union(pd.DatetimeIndex(obs_dates)).unique().sort_values()

    filled = permno_df.set_index('date').reindex(new_dates).reset_index()
    # Positional rename: the index column comes first after reset_index().
    filled.columns = ['date', 'PERMNO', 'TICKER', 'PRC', 'RET']

    filled['PERMNO'] = permno
    # Dates added by the re-index carry no return; treat them as 0% days.
    filled['RET'] = filled['RET'].fillna(0)

    # Compound through log returns: a rolling sum of logs is a rolling product.
    log_ret = np.log(filled['RET'] + 1)
    for column, window in FORWARD_RETURN_WINDOWS.items():
        # Shift so the window [t, t + window - 1] is stored on row t.
        filled[column] = np.exp(log_ret.rolling(window=window).sum().shift(-(window - 1))) - 1
    for column, window in BACKWARD_RETURN_WINDOWS.items():
        # Shift by one so the window ends on row t - 1 (current row excluded).
        filled[column] = np.exp(log_ret.rolling(window=window).sum().shift(1)) - 1

    return filled


# Build the calendar index once instead of re-sorting the full list per security.
bday_index = pd.DatetimeIndex(list_bdays)

ret_filled_df_list = [
    fill_permno_calendar(permno, df, bday_index)
    for permno, df in tqdm(ret_df_split_list)
]

# NOTE: each piece keeps its own 0..n-1 index, so the combined index repeats.
ret_df_filled = pd.concat(ret_filled_df_list)

100%|████████████████████████████████████████████████████████████████████████████| 17523/17523 [16:55<00:00, 17.25it/s]


In [5]:
# Previous row's price within each security; the first row of every PERMNO
# has no predecessor and is left as NaN.
prc_by_permno = ret_df_filled.groupby('PERMNO')['PRC']
ret_df_filled['lagged_PRC'] = prc_by_permno.shift(periods=1)

In [11]:
# Persist the filled panel as a checkpoint. Create the target directory first
# so a missing folder cannot discard the result of the long fill loop.
checkpoint_path = Path('data/checkpoint_data/crsp_dailydata_holding_w_period_returns.pkl')
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
ret_df_filled.to_pickle(checkpoint_path)