In [330]:
import pandas as pd
import numpy as np

# Load and Format Data

## CRSP/COMPUSTAT Merged

In [211]:
# Load Data
CRSP_COMPUSTAT_MERGED = pd.read_csv('Data/CRSP_COMPUSTAT_MERGED_TEST.csv')
CRSP_COMPUSTAT_MERGED.head()

Unnamed: 0,LPERMNO,datacqtr,atq,ceqq,cheq,ltq,niq
0,54594,2017Q1,1502.1,885.7,10.1,616.4,13.7
1,54594,2017Q2,1504.1,914.2,10.3,589.9,21.2
2,54594,2017Q3,1531.7,924.7,15.1,607.0,10.6
3,54594,2017Q4,1544.3,906.5,27.1,637.8,-22.5
4,54594,2018Q1,1512.2,915.2,34.6,597.0,15.5


In [212]:
# Split Up 'datacqtr' into calendar year and quarter
CRSP_COMPUSTAT_MERGED['CalendarYear'] = CRSP_COMPUSTAT_MERGED['datacqtr'].str.slice(0,4)
CRSP_COMPUSTAT_MERGED['Quarter'] = CRSP_COMPUSTAT_MERGED['datacqtr'].str.slice(4)

# Convert Quarters to Pandas Datetimes
CRSP_COMPUSTAT_MERGED['datacqtr_formatted'] = CRSP_COMPUSTAT_MERGED["CalendarYear"] + "-" + CRSP_COMPUSTAT_MERGED["Quarter"]
CRSP_COMPUSTAT_MERGED['QuarterStart'] = pd.PeriodIndex(CRSP_COMPUSTAT_MERGED['datacqtr_formatted'], freq='Q').to_timestamp()

# Calculate Start of Quarter and End of Quarter Dates
CRSP_COMPUSTAT_MERGED['QuarterEnd'] = CRSP_COMPUSTAT_MERGED['QuarterStart'] + pd.offsets.MonthEnd(3)
CRSP_COMPUSTAT_MERGED['QuarterStart_Month'] = CRSP_COMPUSTAT_MERGED['QuarterStart'].dt.to_period('m')
CRSP_COMPUSTAT_MERGED['QuarterEnd_Month'] = CRSP_COMPUSTAT_MERGED['QuarterEnd'].dt.to_period('m')

# Calculate Lagged Dates (Year-Month)
CRSP_COMPUSTAT_MERGED['Date_Lag2'] = CRSP_COMPUSTAT_MERGED['QuarterEnd_Month'] + 2
CRSP_COMPUSTAT_MERGED['Date_Lag3'] = CRSP_COMPUSTAT_MERGED['QuarterEnd_Month'] + 3
CRSP_COMPUSTAT_MERGED['Date_Lag4'] = CRSP_COMPUSTAT_MERGED['QuarterEnd_Month'] + 4
CRSP_COMPUSTAT_MERGED.head()

Unnamed: 0,LPERMNO,datacqtr,atq,ceqq,cheq,ltq,niq,CalendarYear,Quarter,datacqtr_formatted,QuarterStart,QuarterEnd,QuarterStart_Month,QuarterEnd_Month,Date_Lag2,Date_Lag3,Date_Lag4
0,54594,2017Q1,1502.1,885.7,10.1,616.4,13.7,2017,Q1,2017-Q1,2017-01-01,2017-03-31,2017-01,2017-03,2017-05,2017-06,2017-07
1,54594,2017Q2,1504.1,914.2,10.3,589.9,21.2,2017,Q2,2017-Q2,2017-04-01,2017-06-30,2017-04,2017-06,2017-08,2017-09,2017-10
2,54594,2017Q3,1531.7,924.7,15.1,607.0,10.6,2017,Q3,2017-Q3,2017-07-01,2017-09-30,2017-07,2017-09,2017-11,2017-12,2018-01
3,54594,2017Q4,1544.3,906.5,27.1,637.8,-22.5,2017,Q4,2017-Q4,2017-10-01,2017-12-31,2017-10,2017-12,2018-02,2018-03,2018-04
4,54594,2018Q1,1512.2,915.2,34.6,597.0,15.5,2018,Q1,2018-Q1,2018-01-01,2018-03-31,2018-01,2018-03,2018-05,2018-06,2018-07


## CRSP (Monthly)

In [213]:
# Read in Dataframe
CRSP_MONTHLY = pd.read_csv('Data/CRSP_MONTHLY_TEST.csv')

# Convert to Datetime
CRSP_MONTHLY['date'] = pd.to_datetime(CRSP_MONTHLY['date'])

# Convert Date to Month Period
CRSP_MONTHLY['date_month'] = CRSP_MONTHLY['date'].dt.to_period('m')

CRSP_MONTHLY.head()

Unnamed: 0,PERMNO,date,PRC,SHROUT,CFACPR,date_month
0,21020,2017-01-31,44.25,507294,1,2017-01
1,21020,2017-02-28,46.36,504154,1,2017-02
2,21020,2017-03-31,42.3,495750,1,2017-03
3,21020,2017-04-28,42.62,492589,1,2017-04
4,21020,2017-05-31,48.41,492589,1,2017-05


## CRSP (Daily)

In [289]:
# Load In Data
CRSP_DAILY = pd.read_csv('Data/CRSP_DAILY_TEST.csv')
CRSP_DAILY.head()

Unnamed: 0,PERMNO,date,RET
0,21020,1/3/2017,-0.008353
1,21020,1/4/2017,0.008639
2,21020,1/5/2017,-0.017345
3,21020,1/6/2017,0.006973
4,21020,1/9/2017,0.018827


## Sigma Calculation

In [290]:
window=90
CRSP_DAILY['SIGMA'] = ((((CRSP_DAILY['RET'].copy()**2).rolling(window).sum())/(window-1))*252)**(1/2)
CRSP_DAILY.head()

Unnamed: 0,PERMNO,date,RET,SIGMA
0,21020,1/3/2017,-0.008353,
1,21020,1/4/2017,0.008639,
2,21020,1/5/2017,-0.017345,
3,21020,1/6/2017,0.006973,
4,21020,1/9/2017,0.018827,


## Format Daily Data from Merge

In [307]:
# Convert to Datetime
CRSP_DAILY['date'] = pd.to_datetime(CRSP_DAILY['date'])

# Convert Date to Month Period
CRSP_DAILY['date_month'] = CRSP_DAILY['date'].dt.to_period('m')

# Keep Only Last Day of Month
#CRSP_DAILY['EndOfMonth'] = CRSP_DAILY['date'] + pd.offsets.MonthEnd(0)
#CRSP_DAILY = CRSP_DAILY[CRSP_DAILY['date']==CRSP_DAILY['EndOfMonth']]

lastDayOfMonthInfo = CRSP_DAILY.groupby(['PERMNO', 'date_month'])['date'].max().reset_index()
lastDayOfMonthInfo = lastDayOfMonthInfo.rename(columns={'date': 'lastDayOfMonth'})
temp = pd.merge(CRSP_DAILY,
                lastDayOfMonthInfo,
                left_on=['PERMNO', 'date_month'],
                right_on=['PERMNO', 'date_month']
)
CRSP_DAILY = temp.copy()
CRSP_DAILY = CRSP_DAILY[CRSP_DAILY['date'] == CRSP_DAILY['lastDayOfMonth']]
CRSP_DAILY.head()

Unnamed: 0,PERMNO,date,RET,SIGMA,date_month,lastDayOfMonth_x,lastDayOfMonth_y,lastDayOfMonth
0,21020,2017-01-31,-0.014477,,2017-01,2017-01-31,2017-01-31,2017-01-31
1,21020,2017-02-28,0.000864,,2017-02,2017-02-28,2017-02-28,2017-02-28
2,21020,2017-03-31,-0.005642,,2017-03,2017-03-31,2017-03-31,2017-03-31
3,21020,2017-04-28,-0.030923,,2017-04,2017-04-28,2017-04-28,2017-04-28
4,21020,2017-05-31,0.009383,0.325298,2017-05,2017-05-31,2017-05-31,2017-05-31


## SP500 Data

In [320]:
# Read in Data
SP500_MONTHLY = pd.read_csv('Data/SP500_MONTHLY_TEST.csv')

# Format caldt as datetime
SP500_MONTHLY['caldt'] = pd.to_datetime(SP500_MONTHLY['caldt'])

# Convert Date to Month Period
SP500_MONTHLY['date_month'] = SP500_MONTHLY['caldt'].dt.to_period('m')

# Adjust totval (quoted in $1000s)
SP500_MONTHLY['totval'] = SP500_MONTHLY['totval']*1000

# Drop caldt
SP500_MONTHLY = SP500_MONTHLY.drop(columns=['caldt'])

# Rename Columns (to make later merge simpler)
SP500_MONTHLY = SP500_MONTHLY.rename(columns={'vwretd': 'vwretdSP500', 'totval': 'totvalSP500'})

SP500_MONTHLY.head()

Unnamed: 0,vwretdSP500,totvalSP500,date_month
0,0.019384,20371809800000,2017-01
1,0.039459,21074513100000,2017-02
2,0.001664,21090060200000,2017-03
3,0.010591,21303086200000,2017-04
4,0.014552,21539204300000,2017-05


# Merge Dataframes

## Merge CRSP/COMPUSTAT Merged Data set with CRSP (Monthly)

In [332]:
# Select Accounting Features to Merge into CRSP Monthly Dataframe
CRSP_COMPUSTAT_features = ['atq', 'ceqq', 'cheq', 'ltq', 'niq']

# Select Features to Keep after Merge
featuresToKeep = ['PERMNO', 'date_month', 'PRC', 'SHROUT', 'CFACPR']

# Add Accounting Features to Features to keep after merge
featuresToKeep.extend(CRSP_COMPUSTAT_features)

# Add Lagged Accounting Features
for lag in range(2,5):
    CRSP_COMPUSTAT_merge_features = ['LPERMNO', f'Date_Lag{lag}']
    CRSP_COMPUSTAT_merge_features.extend(CRSP_COMPUSTAT_features)
    if lag==2:
        temp = pd.merge(CRSP_COMPUSTAT_MERGED[CRSP_COMPUSTAT_merge_features],
                                 CRSP_MONTHLY[['PERMNO', 'date_month', 'PRC', 'SHROUT', 'CFACPR']],
                                 how='right',
                                 left_on=['LPERMNO', f'Date_Lag{lag}'],
                                 right_on=['PERMNO', 'date_month']
                                )
        temp = temp[featuresToKeep]
        
    else:
        temp = pd.merge(CRSP_COMPUSTAT_MERGED[CRSP_COMPUSTAT_merge_features],
                                 temp[featuresToKeep],
                                 how='right',
                                 left_on=['LPERMNO', f'Date_Lag{lag}'],
                                 right_on=['PERMNO', 'date_month'],
                        suffixes=('', '_y')
                                )
                      
        # Update Features
        for feature in CRSP_COMPUSTAT_features:
            temp[feature] = temp[feature].fillna(temp[f'{feature}_y'])
            temp = temp.drop([f'{feature}_y'], 1)
        
        temp = temp[featuresToKeep]

explanatoryDataFrame = temp.copy()        
explanatoryDataFrame[:10]

Unnamed: 0,PERMNO,date_month,PRC,SHROUT,CFACPR,atq,ceqq,cheq,ltq,niq
0,21020,2017-01,44.25,507294,1,,,,,
1,21020,2017-02,46.36,504154,1,,,,,
2,21020,2017-03,42.3,495750,1,,,,,
3,21020,2017-04,42.62,492589,1,,,,,
4,21020,2017-05,48.41,492589,1,,,,,
5,21020,2017-06,50.32,487662,1,,,,,
6,21020,2017-07,50.44,487009,1,,,,,
7,21020,2017-08,44.74,487009,1,53336.0,3715.0,7440.0,49621.0,864.0
8,21020,2017-09,47.49,480000,1,53336.0,3715.0,7440.0,49621.0,864.0
9,21020,2017-10,46.82,478499,1,53336.0,3715.0,7440.0,49621.0,864.0


## Merge Existing Explanatory Data set with CRSP (Daily)

In [333]:
explanatoryDataFrame = pd.merge(explanatoryDataFrame,
                CRSP_DAILY[['PERMNO', 'date_month', 'SIGMA']],
                how='left',
                left_on=['PERMNO', 'date_month'],
                right_on=['PERMNO', 'date_month']
               )
explanatoryDataFrame[:10]

Unnamed: 0,PERMNO,date_month,PRC,SHROUT,CFACPR,atq,ceqq,cheq,ltq,niq,SIGMA
0,21020,2017-01,44.25,507294,1,,,,,,
1,21020,2017-02,46.36,504154,1,,,,,,
2,21020,2017-03,42.3,495750,1,,,,,,
3,21020,2017-04,42.62,492589,1,,,,,,
4,21020,2017-05,48.41,492589,1,,,,,,0.325298
5,21020,2017-06,50.32,487662,1,,,,,,0.298192
6,21020,2017-07,50.44,487009,1,,,,,,0.282492
7,21020,2017-08,44.74,487009,1,53336.0,3715.0,7440.0,49621.0,864.0,0.290003
8,21020,2017-09,47.49,480000,1,53336.0,3715.0,7440.0,49621.0,864.0,0.266781
9,21020,2017-10,46.82,478499,1,53336.0,3715.0,7440.0,49621.0,864.0,0.294971


## Merge Existing Explanatory Data Set with SP500 Monthly

In [334]:
explanatoryDataFrame = pd.merge(explanatoryDataFrame,
                SP500_MONTHLY,
                how='left',
                left_on=['date_month'],
                right_on=['date_month']
               )
explanatoryDataFrame[:10]

Unnamed: 0,PERMNO,date_month,PRC,SHROUT,CFACPR,atq,ceqq,cheq,ltq,niq,SIGMA,vwretdSP500,totvalSP500
0,21020,2017-01,44.25,507294,1,,,,,,,0.019384,20371809800000
1,21020,2017-02,46.36,504154,1,,,,,,,0.039459,21074513100000
2,21020,2017-03,42.3,495750,1,,,,,,,0.001664,21090060200000
3,21020,2017-04,42.62,492589,1,,,,,,,0.010591,21303086200000
4,21020,2017-05,48.41,492589,1,,,,,,0.325298,0.014552,21539204300000
5,21020,2017-06,50.32,487662,1,,,,,,0.298192,0.005798,21614640700000
6,21020,2017-07,50.44,487009,1,,,,,,0.282492,0.020696,21944725800000
7,21020,2017-08,44.74,487009,1,53336.0,3715.0,7440.0,49621.0,864.0,0.290003,0.002238,21947284000000
8,21020,2017-09,47.49,480000,1,53336.0,3715.0,7440.0,49621.0,864.0,0.266781,0.02005,22371374100000
9,21020,2017-10,46.82,478499,1,53336.0,3715.0,7440.0,49621.0,864.0,0.294971,0.024158,22841910200000


# Remaining Explanatory Variable Calculations

## NITA

In [335]:
# Precomuputations
explanatoryDataFrame['ME'] = explanatoryDataFrame['PRC'] * explanatoryDataFrame['SHROUT']
explanatoryDataFrame['BE'] = explanatoryDataFrame['ceqq']
explanatoryDataFrame['TA'] = explanatoryDataFrame['atq']

# Total Assets Adjusted Calculation
explanatoryDataFrame['totalAssetsAdj'] = explanatoryDataFrame['TA'] + 0.1*(explanatoryDataFrame['ME'] - explanatoryDataFrame['BE'])

# NITA Calculation
explanatoryDataFrame['NITA'] = explanatoryDataFrame['niq'] / explanatoryDataFrame['totalAssetsAdj']

## NIMTA

In [336]:
# NIMTA Calculation
explanatoryDataFrame['NIMTA'] = explanatoryDataFrame['niq'] / (explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq'])

## TLTA

In [337]:
explanatoryDataFrame['TLTA'] = explanatoryDataFrame['ltq'] / explanatoryDataFrame['totalAssetsAdj']

## TLMTA

In [338]:
explanatoryDataFrame['TLMTA'] = explanatoryDataFrame['ltq'] / (explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq'])

## EXRET

In [339]:
explanatoryDataFrame['adjPRC'] = explanatoryDataFrame['PRC'] * explanatoryDataFrame['CFACPR']
explanatoryDataFrame['RET'] = explanatoryDataFrame['adjPRC'].shift(1) / explanatoryDataFrame['adjPRC'] - 1
explanatoryDataFrame['EXRET'] = np.log(1+explanatoryDataFrame['RET']) - np.log(1+explanatoryDataFrame['vwretdSP500'])

## RSIZE

In [341]:
explanatoryDataFrame['RSIZE'] = np.log(explanatoryDataFrame['ME'] / explanatoryDataFrame['totvalSP500'])

## CASHMTA

In [345]:
explanatoryDataFrame['CASHMTA'] = explanatoryDataFrame['cheq'] / (explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq'])

In [347]:
explanatoryDataFrame.head()

Unnamed: 0,PERMNO,date_month,PRC,SHROUT,CFACPR,atq,ceqq,cheq,ltq,niq,...,totalAssetsAdj,NITA,NIMTA,TLTA,TLMTA,adjPRC,RET,EXRET,RSIZE,CASHMTA
0,21020,2017-01,44.25,507294,1,,,,,,...,,,,,,44.25,,,-13.718472,
1,21020,2017-02,46.36,504154,1,,,,,,...,,,,,,46.36,-0.045513,-0.085282,-13.712011,
2,21020,2017-03,42.3,495750,1,,,,,,...,,,,,,42.3,0.095981,0.089987,-13.821209,
3,21020,2017-04,42.62,492589,1,,,,,,...,,,,,,42.62,-0.007508,-0.018072,-13.830119,
4,21020,2017-05,48.41,492589,1,,,,,,...,,,,,,48.41,-0.119603,-0.14183,-13.713759,
