NOTE: All data loading should be converted to queries.

In [None]:
import pandas as pd
import os
from google.colab import drive 
import numpy as np

In [None]:
# Mount Google Drive, then make the shared project folder the working directory
# so that the relative 'Data/...' paths used below resolve.
_DRIVE_MOUNT_POINT = '/content/drive'
_PROJECT_DIR = '/content/drive/Shared drives/Financial Modeling and Testing/Project 1/'
drive.mount(_DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(_PROJECT_DIR)

Mounted at /content/drive


# Raw Dataset

## CRSP/COMPUSTAT Merged

### Load Data

In [None]:
# Read the raw CRSP/COMPUSTAT Merged extract (path is relative to the working directory)
_CCM_CSV_PATH = 'Data/Original/CRSP_COMPUSTAT_MERGED.csv'
CRSP_COMPUSTAT_MERGED = pd.read_csv(_CCM_CSV_PATH)
CRSP_COMPUSTAT_MERGED.shape

(1162129, 27)

### Filtering

Filter CRSP/COMPUSTAT Merged by the Following Criteria:
1. Exclude SIC Codes within 6000-6999 Range
2. Include Exchange Codes within range 11-20 (US Exchanges)

In [None]:
# Keep rows whose SIC code lies outside 6000-6999 and whose exchange code is 11-20
_sic_codes = CRSP_COMPUSTAT_MERGED['sic']
_outside_6000_range = _sic_codes.lt(6000) | _sic_codes.ge(7000)
_on_listed_exchange = CRSP_COMPUSTAT_MERGED['exchg'].isin(list(range(11, 21)))
CRSP_COMPUSTAT_MERGED = CRSP_COMPUSTAT_MERGED.loc[_outside_6000_range & _on_listed_exchange]
CRSP_COMPUSTAT_MERGED.shape

(857711, 27)

### Manipulation

Apply the following manipulations to the dataset:
1. Calendarize all Fiscal Quarters
2. Lag Accounting variables 2 months to account for 60 days to release quarterly reports.

In [None]:
# Peek at the last five rows of the filtered frame
CRSP_COMPUSTAT_MERGED.tail(n=5)

Unnamed: 0,GVKEY,LPERMNO,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,cusip,conm,curcdq,datacqtr,datafqtr,atq,ceqq,cheq,ltq,niq,exchg,cik,costat,conml,dlrsn,sic,dldte
1162119,332115,80577,09/30/2019,2019,3.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2019Q3,2019Q3,29.304,18.132,8.69,11.172,-6.955,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,
1162120,332115,80577,12/31/2019,2019,4.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2019Q4,2019Q4,25.451,14.593,6.033,10.858,-4.586,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,
1162121,332115,80577,03/31/2020,2020,1.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2020Q1,2020Q1,44.102,33.482,24.209,10.62,-5.078,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,
1162122,332115,80577,06/30/2020,2020,2.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2020Q2,2020Q2,49.461,29.638,19.786,19.823,-4.71,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,
1162123,332115,80577,09/30/2020,2020,3.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2020Q3,2020Q3,45.675,24.619,15.885,21.056,-5.769,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,


In [None]:
# 'datacqtr' looks like '2019Q3': the first four characters are the calendar
# year and the remainder is the quarter label
CRSP_COMPUSTAT_MERGED['CalendarYear'] = CRSP_COMPUSTAT_MERGED['datacqtr'].str[:4]
CRSP_COMPUSTAT_MERGED['Quarter'] = CRSP_COMPUSTAT_MERGED['datacqtr'].str[4:]

# Build a 'YYYY-Qn' label and turn it into the timestamp of the quarter's first day
CRSP_COMPUSTAT_MERGED['datacqtr_formatted'] = (
    CRSP_COMPUSTAT_MERGED["CalendarYear"] + "-" + CRSP_COMPUSTAT_MERGED["Quarter"]
)
_quarter_periods = pd.PeriodIndex(CRSP_COMPUSTAT_MERGED['datacqtr_formatted'], freq='Q')
CRSP_COMPUSTAT_MERGED['QuarterStart'] = _quarter_periods.to_timestamp()

# Quarter end = third month-end on or after the quarter start; then year-month periods for both
CRSP_COMPUSTAT_MERGED['QuarterEnd'] = CRSP_COMPUSTAT_MERGED['QuarterStart'] + pd.offsets.MonthEnd(3)
for _edge in ('QuarterStart', 'QuarterEnd'):
    CRSP_COMPUSTAT_MERGED[f'{_edge}_Month'] = CRSP_COMPUSTAT_MERGED[_edge].dt.to_period('m')

# Lagged year-month keys: quarter-end month plus 2, 3 and 4 months
for _months_after in (2, 3, 4):
    CRSP_COMPUSTAT_MERGED[f'Date_Lag{_months_after}'] = CRSP_COMPUSTAT_MERGED['QuarterEnd_Month'] + _months_after

In [None]:
# Display the frame with the calendar-quarter and Date_Lag columns added above
# (the notebook renders a truncated view: leading rows, '...', trailing rows)
CRSP_COMPUSTAT_MERGED

Unnamed: 0,GVKEY,LPERMNO,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,cusip,conm,curcdq,datacqtr,datafqtr,atq,ceqq,cheq,ltq,niq,exchg,cik,costat,conml,dlrsn,sic,dldte,CalendarYear,Quarter,datacqtr_formatted,QuarterStart,QuarterEnd,QuarterStart_Month,QuarterEnd_Month,Date_Lag2,Date_Lag3,Date_Lag4
0,1000,25881,12/31/1970,1970,4.0,INDL,C,D,STD,AE.2,000032102,A & E PLASTIK PAK INC,USD,1970Q4,1970Q4,,10.544,,,-1.064,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1970,Q4,1970-Q4,1970-10-01,1970-12-31,1970-10,1970-12,1971-02,1971-03,1971-04
1,1000,25881,03/31/1971,1971,1.0,INDL,C,D,STD,AE.2,000032102,A & E PLASTIK PAK INC,USD,1971Q1,1971Q1,,,,,0.346,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q1,1971-Q1,1971-01-01,1971-03-31,1971-01,1971-03,1971-05,1971-06,1971-07
2,1000,25881,06/30/1971,1971,2.0,INDL,C,D,STD,AE.2,000032102,A & E PLASTIK PAK INC,USD,1971Q2,1971Q2,,,,,0.152,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q2,1971-Q2,1971-04-01,1971-06-30,1971-04,1971-06,1971-08,1971-09,1971-10
3,1000,25881,09/30/1971,1971,3.0,INDL,C,D,STD,AE.2,000032102,A & E PLASTIK PAK INC,USD,1971Q3,1971Q3,,,,,-0.672,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q3,1971-Q3,1971-07-01,1971-09-30,1971-07,1971-09,1971-11,1971-12,1972-01
4,1000,25881,12/31/1971,1971,4.0,INDL,C,D,STD,AE.2,000032102,A & E PLASTIK PAK INC,USD,1971Q4,1971Q4,29.330,8.381,2.557,20.949,-2.144,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q4,1971-Q4,1971-10-01,1971-12-31,1971-10,1971-12,1972-02,1972-03,1972-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162119,332115,80577,09/30/2019,2019,3.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2019Q3,2019Q3,29.304,18.132,8.690,11.172,-6.955,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,,2019,Q3,2019-Q3,2019-07-01,2019-09-30,2019-07,2019-09,2019-11,2019-12,2020-01
1162120,332115,80577,12/31/2019,2019,4.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2019Q4,2019Q4,25.451,14.593,6.033,10.858,-4.586,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,,2019,Q4,2019-Q4,2019-10-01,2019-12-31,2019-10,2019-12,2020-02,2020-03,2020-04
1162121,332115,80577,03/31/2020,2020,1.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2020Q1,2020Q1,44.102,33.482,24.209,10.620,-5.078,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,,2020,Q1,2020-Q1,2020-01-01,2020-03-31,2020-01,2020-03,2020-05,2020-06,2020-07
1162122,332115,80577,06/30/2020,2020,2.0,INDL,C,D,STD,ARMP,04216R102,ARMATA PHARMACEUTICALS INC,USD,2020Q2,2020Q2,49.461,29.638,19.786,19.823,-4.710,12,921114.0,A,Armata Pharmaceuticals Inc,,2836.0,,2020,Q2,2020-Q2,2020-04-01,2020-06-30,2020-04,2020-06,2020-08,2020-09,2020-10


## CRSP (Monthly)

### Load Data

In [None]:
# Read the raw CRSP (Monthly) extract (path is relative to the working directory)
_CRSP_MONTHLY_CSV_PATH = 'Data/Original/CRSP_MONTHLY.csv'
CRSP_MONTHLY = pd.read_csv(_CRSP_MONTHLY_CSV_PATH)
CRSP_MONTHLY.shape

  interactivity=interactivity, compiler=compiler, result=result)


(4256427, 9)

### Format

In [None]:
# Parse the 'date' column into pandas datetimes
CRSP_MONTHLY['date'] = CRSP_MONTHLY['date'].pipe(pd.to_datetime)

### Filter

Filter CRSP (Monthly) by the following criteria:
1. Exclude SIC Codes within 6000-6999 Range
2. Include Share Code to be 10 or 11 (Ordinary Common Shares which have not been further defined OR need not be further defined)
3. Include Share Class 'A' or missing (NaN)
4. Exclude Companies with monthly returns less than -50 (missing return code)

In [None]:
# SIC filter: coerce codes to numbers (unparseable values become NaN and are
# dropped by the comparison), then keep codes outside 6000-6999
CRSP_MONTHLY['SICCD'] = pd.to_numeric(CRSP_MONTHLY['SICCD'], errors='coerce')
_sic_outside_6000_range = CRSP_MONTHLY['SICCD'].lt(6000) | CRSP_MONTHLY['SICCD'].ge(7000)
CRSP_MONTHLY = CRSP_MONTHLY.loc[_sic_outside_6000_range]

# Share code filter: keep codes 10 and 11
CRSP_MONTHLY['SHRCD'] = pd.to_numeric(CRSP_MONTHLY['SHRCD'], errors='coerce')
_share_code_ok = CRSP_MONTHLY['SHRCD'].isin([10,11])
CRSP_MONTHLY = CRSP_MONTHLY.loc[_share_code_ok]

# Share class filter: keep class 'A' and rows with no share class
_share_class_ok = CRSP_MONTHLY['SHRCLS'].isna() | CRSP_MONTHLY['SHRCLS'].eq('A')
CRSP_MONTHLY = CRSP_MONTHLY.loc[_share_class_ok]

# Return filter: coerce to numbers and keep returns above -50
# (non-numeric return codes become NaN and fail the comparison)
CRSP_MONTHLY['RET'] = pd.to_numeric(CRSP_MONTHLY['RET'], errors='coerce')
CRSP_MONTHLY = CRSP_MONTHLY.loc[CRSP_MONTHLY['RET'].gt(-50)]

### Manipulation

In [None]:
# Year-month period of each observation; used as a merge key later on
_crsp_monthly_dates = CRSP_MONTHLY['date']
CRSP_MONTHLY['date_month'] = _crsp_monthly_dates.dt.to_period('m')

## CRSP (Daily)

CRSP Daily data was pulled in to help with the calculation of SIGMA, the 90-day moving average of the volatility of the underlying security. However, because the CRSP (Daily) data set was so large, we had to house the data and calculate SIGMA locally, and only read in the reduced monthly dataframe. The code for calculating SIGMA locally, and how the reduction happened, is included below for reference.

In [None]:
"""
# Load In Data
CRSP_DAILY = pd.read_csv('Data/CRSP_DAILY.csv')

# Change to numeric
CRSP_DAILY['RET'] = pd.to_numeric(CRSP_DAILY['RET'], errors='coerce')

# Calcualte SIGMA
window=90
CRSP_DAILY['SIGMA'] = ((((CRSP_DAILY['RET'].copy()**2).rolling(window).sum())/(window-1))*252)**(1/2)

# Convert to Datetime
CRSP_DAILY['date'] = pd.to_datetime(CRSP_DAILY['date'])

# Convert Date to Month Period
CRSP_DAILY['date_month'] = CRSP_DAILY['date'].dt.to_period('m')

# Keep Only Last Day of Month
lastDayOfMonthInfo = CRSP_DAILY.groupby(['PERMNO', 'date_month'])['date'].max().reset_index()
lastDayOfMonthInfo = lastDayOfMonthInfo.rename(columns={'date': 'lastDayOfMonth'})
temp = pd.merge(CRSP_DAILY,
                lastDayOfMonthInfo,
                left_on=['PERMNO', 'date_month'],
                right_on=['PERMNO', 'date_month']
)
CRSP_DAILY = temp.copy()
CRSP_DAILY = CRSP_DAILY[CRSP_DAILY['date'] == CRSP_DAILY['lastDayOfMonth']]
"""
pass

### Load Data

In [None]:
# Read the reduced CRSP (Daily) extract (path is relative to the working directory)
_CRSP_DAILY_REDUCED_CSV_PATH = 'Data/Original/CRSP_DAILY_REDUCED.csv'
CRSP_DAILY = pd.read_csv(_CRSP_DAILY_REDUCED_CSV_PATH)

### Format Data

In [None]:
# Parse the dates once, then store both the datetime and its year-month period
_crsp_daily_dates = pd.to_datetime(CRSP_DAILY['date'])
CRSP_DAILY['date'] = _crsp_daily_dates
CRSP_DAILY['date_month'] = _crsp_daily_dates.dt.to_period('m')

In [None]:
# Sanity checks on the prepared CRSP/COMPUSTAT frame: earliest 2-month lag date,
# latest 4-month lag date, frame shape and per-column missing-value counts
for _summary in (
    CRSP_COMPUSTAT_MERGED['Date_Lag2'].min(),
    CRSP_COMPUSTAT_MERGED['Date_Lag4'].max(),
    CRSP_COMPUSTAT_MERGED.shape,
    CRSP_COMPUSTAT_MERGED.isna().sum(),
):
    print(_summary)

1961-05
2021-04
(857711, 37)
GVKEY                      0
LPERMNO                    0
datadate                   0
fyearq                     0
fqtr                     123
indfmt                     0
consol                     0
popsrc                     0
datafmt                    0
tic                       23
cusip                      0
conm                       0
curcdq                     0
datacqtr                1343
datafqtr                 426
atq                    87171
ceqq                   66911
cheq                   93224
ltq                    90092
niq                    17656
exchg                      0
cik                    58764
costat                     0
conml                      0
dlrsn                 315613
sic                        0
dldte                 315613
CalendarYear            1343
Quarter                 1343
datacqtr_formatted      1343
QuarterStart            1343
QuarterEnd              1343
QuarterStart_Month      1343
QuarterEnd_Mon

## SP500 Data

### Load Data

In [None]:
# Read the raw S&P 500 (Monthly) extract (path is relative to the working directory)
_SP500_MONTHLY_CSV_PATH = 'Data/Original/SP500_MONTHLY.csv'
SP500_MONTHLY = pd.read_csv(_SP500_MONTHLY_CSV_PATH)

### Format

In [None]:
# Parse 'caldt' into pandas datetimes
SP500_MONTHLY['caldt'] = SP500_MONTHLY['caldt'].pipe(pd.to_datetime)

### Manipulate

In [None]:
# One chained transform:
#   1. add the year-month period and rescale totval (quoted in $1000s)
#   2. drop the raw calendar date
#   3. suffix the index columns with 'SP500' to make the later merge simpler
SP500_MONTHLY = (
    SP500_MONTHLY
    .assign(
        date_month=SP500_MONTHLY['caldt'].dt.to_period('m'),
        totval=SP500_MONTHLY['totval']*1000,
    )
    .drop(columns=['caldt'])
    .rename(columns={'vwretd': 'vwretdSP500', 'totval': 'totvalSP500'})
)

# Merge Dataframes

## Merge CRSP/COMPUSTAT Merged Data with CRSP

In [None]:
# Peek at the first five rows before merging
CRSP_COMPUSTAT_MERGED.head(n=5)

Unnamed: 0,GVKEY,LPERMNO,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,cusip,conm,curcdq,datacqtr,datafqtr,atq,ceqq,cheq,ltq,niq,exchg,cik,costat,conml,dlrsn,sic,dldte,CalendarYear,Quarter,datacqtr_formatted,QuarterStart,QuarterEnd,QuarterStart_Month,QuarterEnd_Month,Date_Lag2,Date_Lag3,Date_Lag4
0,1000,25881,12/31/1970,1970,4.0,INDL,C,D,STD,AE.2,32102,A & E PLASTIK PAK INC,USD,1970Q4,1970Q4,,10.544,,,-1.064,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1970,Q4,1970-Q4,1970-10-01,1970-12-31,1970-10,1970-12,1971-02,1971-03,1971-04
1,1000,25881,03/31/1971,1971,1.0,INDL,C,D,STD,AE.2,32102,A & E PLASTIK PAK INC,USD,1971Q1,1971Q1,,,,,0.346,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q1,1971-Q1,1971-01-01,1971-03-31,1971-01,1971-03,1971-05,1971-06,1971-07
2,1000,25881,06/30/1971,1971,2.0,INDL,C,D,STD,AE.2,32102,A & E PLASTIK PAK INC,USD,1971Q2,1971Q2,,,,,0.152,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q2,1971-Q2,1971-04-01,1971-06-30,1971-04,1971-06,1971-08,1971-09,1971-10
3,1000,25881,09/30/1971,1971,3.0,INDL,C,D,STD,AE.2,32102,A & E PLASTIK PAK INC,USD,1971Q3,1971Q3,,,,,-0.672,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q3,1971-Q3,1971-07-01,1971-09-30,1971-07,1971-09,1971-11,1971-12,1972-01
4,1000,25881,12/31/1971,1971,4.0,INDL,C,D,STD,AE.2,32102,A & E PLASTIK PAK INC,USD,1971Q4,1971Q4,29.33,8.381,2.557,20.949,-2.144,12,,I,A & E Plastik Pak Inc,9.0,3089.0,06/30/1978,1971,Q4,1971-Q4,1971-10-01,1971-12-31,1971-10,1971-12,1972-02,1972-03,1972-04


In [None]:
# Select Accounting Features to Merge into CRSP Monthly Dataframe
CRSP_COMPUSTAT_Accounting_features = ['atq', 'ceqq', 'cheq', 'ltq', 'niq']
CRSP_COMPUSTAT_Identifying_features = ['GVKEY', 'conm'] # Don't include LPERMNO
CRSP_COMPUSTAT_features = CRSP_COMPUSTAT_Accounting_features + CRSP_COMPUSTAT_Identifying_features

# CRSP (Monthly) columns carried through the merge. This one list drives both
# the column selection and featuresToKeep so the two cannot drift apart: the
# selection previously left out 'RET', so temp[featuresToKeep] raised KeyError.
CRSP_MONTHLY_features = ['PERMNO', 'date_month', 'PRC', 'SHROUT', 'CFACPR', 'RET']

# Columns kept after every merge step
featuresToKeep = CRSP_MONTHLY_features + CRSP_COMPUSTAT_features

# Add Lagged Accounting Features.
# Each pass right-merges the accounting rows onto the monthly rows, matching
# (LPERMNO, Date_Lag{lag}) to (PERMNO, date_month), so every monthly row is kept.
temp = CRSP_MONTHLY[CRSP_MONTHLY_features]
for lag in range(2,5):
    CRSP_COMPUSTAT_merge_features = ['LPERMNO', f'Date_Lag{lag}'] + CRSP_COMPUSTAT_features
    temp = pd.merge(CRSP_COMPUSTAT_MERGED[CRSP_COMPUSTAT_merge_features],
                    temp,
                    how='right',
                    left_on=['LPERMNO', f'Date_Lag{lag}'],
                    right_on=['PERMNO', 'date_month'],
                    suffixes=('', '_y')
                    )

    if lag > 2:
        # Unsuffixed columns hold this lag's match; '_y' columns hold the values
        # carried over from earlier lags. Fill the gaps from the carried values.
        for feature in CRSP_COMPUSTAT_features:
            temp[feature] = temp[feature].fillna(temp[f'{feature}_y'])

    # Selecting featuresToKeep also discards LPERMNO, Date_Lag{lag} and the
    # '_y' helpers, so no separate drop() call is needed
    temp = temp[featuresToKeep]

explanatoryDataFrame = temp.copy()

KeyError: ignored

In [None]:
# Peek at the first five rows of the merged frame
explanatoryDataFrame.head(n=5)

## Merge Existing Explanatory Dataframe with CRSP (Daily)

In [None]:
# Left-join SIGMA from the reduced CRSP (Daily) frame on (PERMNO, date_month)
_sigma_by_month = CRSP_DAILY[['PERMNO', 'date_month', 'SIGMA']]
explanatoryDataFrame = explanatoryDataFrame.merge(
    _sigma_by_month,
    how='left',
    on=['PERMNO', 'date_month'],
)

## Merge Existing Explanatory Dataframe with SP500 (Monthly)

In [None]:
# Left-join every S&P 500 (Monthly) column onto the explanatory frame by date_month
explanatoryDataFrame = explanatoryDataFrame.merge(
    SP500_MONTHLY,
    how='left',
    on=['date_month'],
)

# Custom Explanatory Variables

Create 8 Custom Explanatory Variables:

## Net Income per Adjusted Total Assets (NITA)

$NITA_{i,t} = \frac{Net Income_{i,t}}{Total Assets Adj_{i,t}}$

> * $NetIncome_{i,t} = NIQ_{i,t}$
> * $TotalAssetsAdj_{i,t} = TA_{i,t} + 0.1 \left(ME_{i,t} - BE_{i,t}\right)$
> > * $TA_{i,t} = ATQ_{i,t}$
> > * $ME_{i,t} = PRC_{i,t} \times SHROUT_{i,t}$
> > * $BE_{i,t} = CEQ_{i,t}$


In [None]:
# Precomputations: market equity, book equity and total assets
explanatoryDataFrame['ME'] = explanatoryDataFrame['PRC'].mul(explanatoryDataFrame['SHROUT'])
explanatoryDataFrame['BE'] = explanatoryDataFrame['ceqq']
explanatoryDataFrame['TA'] = explanatoryDataFrame['atq']

# Adjusted total assets: TA plus 10% of the gap between market and book equity
_market_minus_book = explanatoryDataFrame['ME'] - explanatoryDataFrame['BE']
explanatoryDataFrame['totalAssetsAdj'] = explanatoryDataFrame['TA'] + 0.1*_market_minus_book

# NITA: net income over adjusted total assets
explanatoryDataFrame['NITA'] = explanatoryDataFrame['niq'].div(explanatoryDataFrame['totalAssetsAdj'])

## Net Income per Enterprise Value (NIMTA)

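$NIMTA_{i,t} = \frac{Net Income_{i,t}}{ME_{i,t} + Total Liabilities_{i,t}}$

> * $NetIncome_{i,t} = NIQ_{i,t}$
> * $ME_{i,t} = PRC_{i,t} \times SHROUT_{i,t}$
> * $TotalLiabilities_{i,t} = LTQ_{i,t}$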

In [None]:
# NIMTA: net income over (market equity + total liabilities)
_nimta_denominator = explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq']
explanatoryDataFrame['NIMTA'] = explanatoryDataFrame['niq'] / _nimta_denominator

## Total Liabilities per Adjusted Total Assets (TLTA)

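$TLTA_{i,t} = \frac{Total Liabilities_{i,t}}{Total Assets Adj_{i,t}}$

> * $TotalLiabilities_{i,t} = LTQ_{i,t}$
> * $TotalAssetsAdj_{i,t} = ATQ_{i,t} + 0.1 \left(ME_{i,t} - BE_{i,t}\right)$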

In [None]:
# TLTA: total liabilities over adjusted total assets
_total_liabilities = explanatoryDataFrame['ltq']
explanatoryDataFrame['TLTA'] = _total_liabilities / explanatoryDataFrame['totalAssetsAdj']

## Total Liabilities per Enterprise Value (TLMTA)

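$TLMTA_{i,t} = \frac{Total Liabilities_{i,t}}{ME_{i,t} + Total Liabilities_{i,t}}$

> * $TotalLiabilities_{i,t} = LTQ_{i,t}$
> * $ME_{i,t} = PRC_{i,t} \times SHROUT_{i,t}$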

In [None]:
# TLMTA: total liabilities over (market equity + total liabilities)
_tlmta_denominator = explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq']
explanatoryDataFrame['TLMTA'] = explanatoryDataFrame['ltq'] / _tlmta_denominator

## Excess Return (EXRET)

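$EXRET_{i,t} = \log\left(1 + RET_{i,t}\right) - \log\left(1 + VWRETD^{SP500}_{t}\right)$

> * $RET_{i,t}$ is the return computed from the adjusted price $PRC_{i,t} \times CFACPR_{i,t}$
> * $VWRETD^{SP500}_{t}$ is the S&P 500 return column `vwretdSP500`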

In [None]:
# Precomputations: price multiplied by CFACPR
explanatoryDataFrame['adjPRC'] = explanatoryDataFrame['PRC'].mul(explanatoryDataFrame['CFACPR'])

# Overwrites the merged 'RET' column with (previous row's adjPRC / this row's adjPRC) - 1.
# NOTE(review): shift(1) runs over the whole frame with no groupby on PERMNO, and
# the ratio is previous/current rather than current/previous - confirm both are intended.
_previous_adj_price = explanatoryDataFrame['adjPRC'].shift(1)
explanatoryDataFrame['RET'] = _previous_adj_price / explanatoryDataFrame['adjPRC'] - 1

# EXRET: log return of the security minus log return of the S&P 500
_log_security_return = np.log(1+explanatoryDataFrame['RET'])
_log_index_return = np.log(1+explanatoryDataFrame['vwretdSP500'])
explanatoryDataFrame['EXRET'] = _log_security_return - _log_index_return

## Relative Size (RSIZE)

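$RSIZE_{i,t} = \log\left(\frac{ME_{i,t}}{TOTVAL^{SP500}_{t}}\right)$

> * $ME_{i,t} = PRC_{i,t} \times SHROUT_{i,t}$
> * $TOTVAL^{SP500}_{t}$ is the S&P 500 total market value column `totvalSP500`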

In [None]:
# RSIZE: log of market equity relative to the S&P 500 total market value
_size_ratio = explanatoryDataFrame['ME'] / explanatoryDataFrame['totvalSP500']
explanatoryDataFrame['RSIZE'] = np.log(_size_ratio)

## Standard Deviation of Returns (SIGMA)

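$SIGMA_{i,t} = \sqrt{252 \times \frac{1}{N-1} \sum_{k=1}^{N} r_{i,k}^{2}}$

> * $r_{i,k}$ are the daily returns in the rolling window ending at $t$
> * $N = 90$ (window length)
> * SIGMA is computed offline from CRSP (Daily) and merged in; it is not recalculated in this notebook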

## Cash per Enterprise Value (CASHMTA)

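$CASHMTA_{i,t} = \frac{Cash_{i,t}}{ME_{i,t} + Total Liabilities_{i,t}}$

> * $Cash_{i,t} = CHEQ_{i,t}$
> * $ME_{i,t} = PRC_{i,t} \times SHROUT_{i,t}$
> * $TotalLiabilities_{i,t} = LTQ_{i,t}$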

In [None]:
# CASHMTA: cash over (market equity + total liabilities)
_cashmta_denominator = explanatoryDataFrame['ME'] + explanatoryDataFrame['ltq']
explanatoryDataFrame['CASHMTA'] = explanatoryDataFrame['cheq'] / _cashmta_denominator

# Wrappers

## Code

In [None]:
def prepareCrspCompustatMergedData(CRSP_COMPUSTAT_MERGED, monthsToLagAccountingVariables=2):
  """
  Filter the CRSP/COMPUSTAT Merged data and attach calendar-quarter and lag dates.

  Rows are kept when 'sic' lies outside 6000-6999 and 'exchg' is one of 11-20.
  'datacqtr' (e.g. '2019Q3') is expanded into CalendarYear, Quarter,
  datacqtr_formatted, QuarterStart, QuarterEnd and their year-month periods.
  Three 'Date_Lag{k}' columns are then added for
  k = monthsToLagAccountingVariables, +1 and +2, each holding the quarter-end
  month plus k months.

  The input frame is left untouched; a new frame is returned.
  """
  # Row filter: non-6000s SIC codes on exchanges 11-20
  sicCodes = CRSP_COMPUSTAT_MERGED['sic']
  outside6000Range = sicCodes.lt(6000) | sicCodes.ge(7000)
  onListedExchange = CRSP_COMPUSTAT_MERGED['exchg'].isin(list(range(11, 21)))
  prepared = CRSP_COMPUSTAT_MERGED.loc[outside6000Range & onListedExchange].copy()

  # Calendar year is the first four characters, the quarter label is the rest
  prepared['CalendarYear'] = prepared['datacqtr'].str[:4]
  prepared['Quarter'] = prepared['datacqtr'].str[4:]

  # 'YYYY-Qn' label -> timestamp of the quarter's first day
  prepared['datacqtr_formatted'] = prepared["CalendarYear"] + "-" + prepared["Quarter"]
  quarterPeriods = pd.PeriodIndex(prepared['datacqtr_formatted'], freq='Q')
  prepared['QuarterStart'] = quarterPeriods.to_timestamp()

  # Quarter end, then year-month periods for both quarter edges
  prepared['QuarterEnd'] = prepared['QuarterStart'] + pd.offsets.MonthEnd(3)
  for edge in ('QuarterStart', 'QuarterEnd'):
    prepared[f'{edge}_Month'] = prepared[edge].dt.to_period('m')

  # Lagged year-month keys for three consecutive lags
  firstLag = monthsToLagAccountingVariables
  for monthsAfter in range(firstLag, firstLag + 3):
    prepared[f'Date_Lag{monthsAfter}'] = prepared['QuarterEnd_Month'] + monthsAfter

  return prepared

def prepareCrspMonthlyData(CRSP_MONTHLY):
  """
  Format and filter CRSP (Monthly) data.

  Parses 'date', adds the 'date_month' year-month period, and keeps rows that
  pass every filter below (applied in this order):
    * 'SICCD' is numeric and outside 6000-6999
    * 'SHRCD' is 10 or 11
    * 'SHRCLS' is 'A' or missing
    * 'RET' is numeric and greater than -50

  The input frame is left untouched; a new frame is returned.
  """
  monthly = CRSP_MONTHLY.copy()

  # Dates and their year-month period
  parsedDates = pd.to_datetime(monthly['date'])
  monthly['date'] = parsedDates
  monthly['date_month'] = parsedDates.dt.to_period('m')

  # SIC filter (unparseable codes become NaN and fail both comparisons)
  monthly['SICCD'] = pd.to_numeric(monthly['SICCD'], errors='coerce')
  monthly = monthly.loc[monthly['SICCD'].lt(6000) | monthly['SICCD'].ge(7000)]

  # Share code filter
  monthly = monthly.assign(SHRCD=pd.to_numeric(monthly['SHRCD'], errors='coerce'))
  monthly = monthly.loc[monthly['SHRCD'].isin([10,11])]

  # Share class filter
  monthly = monthly.loc[monthly['SHRCLS'].isna() | monthly['SHRCLS'].eq('A')]

  # Return filter (non-numeric return codes become NaN and are dropped)
  monthly = monthly.assign(RET=pd.to_numeric(monthly['RET'], errors='coerce'))
  monthly = monthly.loc[monthly['RET'].gt(-50)]

  return monthly

def prepareCrspDailyData(CRSP_DAILY):
  """
  Format CRSP (Daily) data.

  Returns a new frame in which 'date' is parsed to datetimes and a
  'date_month' year-month period column is appended. The input frame is
  left untouched.
  """
  parsedDates = pd.to_datetime(CRSP_DAILY['date'])
  return CRSP_DAILY.assign(
      date=parsedDates,
      date_month=parsedDates.dt.to_period('m'),
  )


def prepareSP500Data(SP500_MONTHLY):
  """
  Format S&P 500 (Monthly) data.

  Returns a new frame in which:
    * 'date_month' holds the year-month period of 'caldt'
    * 'totval' is multiplied by 1000 (it is quoted in $1000s)
    * 'caldt' is dropped
    * 'vwretd' and 'totval' are renamed to 'vwretdSP500' and 'totvalSP500'
      (to make the later merge simpler)

  The input frame is left untouched.
  """
  calendarDates = pd.to_datetime(SP500_MONTHLY['caldt'])
  return (
      SP500_MONTHLY
      .assign(
          date_month=calendarDates.dt.to_period('m'),
          totval=SP500_MONTHLY['totval']*1000,
      )
      .drop(columns=['caldt'])
      .rename(columns={'vwretd': 'vwretdSP500', 'totval': 'totvalSP500'})
  )

def mergeCrspCompustatMergedWithCrspMonthly(CRSP_COMPUSTAT_MERGED, 
                                            CRSP_MONTHLY,
                                            CRSP_COMPUSTAT_Accounting_features = ['atq', 'ceqq', 'cheq', 'ltq', 'niq'],
                                            CRSP_COMPUSTAT_Identifying_features = ['GVKEY', 'conm'],
                                            CRSP_MONTHLY_features = ['PERMNO', 'date_month', 'PRC', 'SHROUT', 'CFACPR', 'RET']
                                            ):
  """
  Merge CRSP/COMPUSTAT Merged Dataframe with CRSP (Monthly)

  Every CRSP (Monthly) row is kept. For lags 2, 3 and 4 the accounting and
  identifying columns are right-merged onto the monthly rows by matching
  (LPERMNO, Date_Lag{lag}) to (PERMNO, date_month); where a later lag has no
  match, the value found by an earlier lag is carried forward.

  Parameters:
    CRSP_COMPUSTAT_MERGED: frame holding 'LPERMNO', 'Date_Lag2', 'Date_Lag3',
      'Date_Lag4' and the requested accounting / identifying columns.
    CRSP_MONTHLY: frame holding the columns named in CRSP_MONTHLY_features,
      which must include 'PERMNO' and 'date_month'.
    CRSP_COMPUSTAT_Accounting_features: accounting columns to bring over.
    CRSP_COMPUSTAT_Identifying_features: identifying columns to bring over.
    CRSP_MONTHLY_features: CRSP (Monthly) columns to keep.

  Returns:
    New frame with columns CRSP_MONTHLY_features, then the accounting
    features, then the identifying features. Neither input is modified.

  NOTE(review): the lags 2-4 are hard-coded here, so the Date_Lag columns must
  have been built with a first lag of 2 - confirm against the caller.
  """
  # Fresh lists: the (mutable) default arguments are never modified
  compustatFeatures = list(CRSP_COMPUSTAT_Accounting_features) + list(CRSP_COMPUSTAT_Identifying_features)
  featuresToKeep = list(CRSP_MONTHLY_features) + compustatFeatures

  # Start from the monthly rows and enrich them one lag at a time
  explanatoryDataFrame = CRSP_MONTHLY[list(CRSP_MONTHLY_features)]

  for lag in range(2,5):
    lagColumn = f'Date_Lag{lag}'
    laggedCompustat = CRSP_COMPUSTAT_MERGED[['LPERMNO', lagColumn] + compustatFeatures]

    # Unsuffixed columns come from this lag; '_y' columns (present from the
    # second pass on) hold the values carried over from earlier lags
    merged = pd.merge(laggedCompustat,
                      explanatoryDataFrame,
                      how='right',
                      left_on=['LPERMNO', lagColumn],
                      right_on=['PERMNO', 'date_month'],
                      suffixes=('', '_y')
                      )

    # Fill this lag's gaps with the carried-over values
    for feature in compustatFeatures:
      carriedColumn = f'{feature}_y'
      if carriedColumn in merged.columns:
        merged[feature] = merged[feature].fillna(merged[carriedColumn])

    # Selecting featuresToKeep also discards LPERMNO, the lag key and the '_y'
    # helpers. This replaces the old positional-axis drop(..., 1) call, which
    # pandas 2.0 and later reject with a TypeError.
    explanatoryDataFrame = merged[featuresToKeep]

  return explanatoryDataFrame

def mergeExplanatoryDataframeWithCrspDaily(explanatoryDataFrame, CRSP_DAILY):
  """
  Left-join SIGMA from the CRSP (Daily) frame onto the explanatory frame.

  Rows are matched on ('PERMNO', 'date_month'); only the 'SIGMA' column is
  taken from CRSP_DAILY. Every row of explanatoryDataFrame is kept and a new
  frame is returned.
  """
  joinKeys = ['PERMNO', 'date_month']
  sigmaByMonth = CRSP_DAILY[joinKeys + ['SIGMA']]
  return explanatoryDataFrame.merge(sigmaByMonth, how='left', on=joinKeys)

def mergeExplanatoryDataframeWithSP500Monthly(explanatoryDataFrame, SP500_MONTHLY):
  """
  Left-join the S&P 500 (Monthly) frame onto the explanatory frame.

  Rows are matched on 'date_month'; all SP500_MONTHLY columns are brought
  over. Every row of explanatoryDataFrame is kept and a new frame is returned.
  """
  return explanatoryDataFrame.merge(SP500_MONTHLY, how='left', on=['date_month'])

def calculateNITA(PRC, SHROUT, CEQQ, ATQ, NIQ):
  """
  Calculate NITA: net income (NIQ) divided by adjusted total assets.

  Adjusted total assets = ATQ + 0.1 * (ME - CEQQ), where ME = PRC * SHROUT.
  Works element-wise on whatever the arguments support (numbers or Series).
  """
  marketEquity = PRC * SHROUT
  # Book equity is CEQQ and total assets is ATQ
  adjustedTotalAssets = ATQ + 0.1*(marketEquity - CEQQ)
  return NIQ / adjustedTotalAssets


def calculateNIMTA(PRC, SHROUT, NIQ, LTQ):
  """
  Calculate NIMTA: net income (NIQ) divided by market equity plus total
  liabilities, i.e. NIQ / (PRC * SHROUT + LTQ).
  """
  marketEquity = PRC * SHROUT
  return NIQ / (marketEquity + LTQ)

def calculateTLTA(PRC, SHROUT, CEQQ, ATQ, LTQ):
  """
  Calculate TLTA: total liabilities (LTQ) divided by adjusted total assets.

  Adjusted total assets = ATQ + 0.1 * (ME - CEQQ), where ME = PRC * SHROUT.
  """
  marketEquity = PRC * SHROUT
  # Book equity is CEQQ and total assets is ATQ
  adjustedTotalAssets = ATQ + 0.1*(marketEquity - CEQQ)
  return LTQ / adjustedTotalAssets

def calculateTLMTA(PRC, SHROUT, LTQ):
  """
  Calculate TLMTA: total liabilities (LTQ) divided by market equity plus
  total liabilities, i.e. LTQ / (PRC * SHROUT + LTQ).
  """
  marketEquity = PRC * SHROUT
  return LTQ / (marketEquity + LTQ)

def calculateEXRET(PRC, CFACPR, VWRETDSP500):
  """
  Calculate EXRET: log return of the security minus log return of the S&P 500.

  Parameters:
    PRC, CFACPR: pandas Series (``.shift`` is called on their product).
    VWRETDSP500: S&P 500 return, aligned with PRC.

  Returns:
    Series of log(1 + RET) - log(1 + VWRETDSP500), where
    RET = (previous row's PRC*CFACPR) / (this row's PRC*CFACPR) - 1.
    The first row is NaN because it has no previous row.

  NOTE(review): shift(1) is not grouped by security and the ratio is
  previous/current rather than current/previous - confirm both are intended.
  """
  # Precomputations
  ADJPRC = PRC * CFACPR
  # The "- 1" turns the price ratio into a simple return, matching the
  # notebook's EXRET cell; without it log(1+RET) is the log of (1 + ratio).
  RET = ADJPRC.shift(1) / ADJPRC - 1

  # EXRET Calculation
  EXRET = np.log(1+RET) - np.log(1+VWRETDSP500)

  return EXRET

def calculateRSIZE(PRC, SHROUT, TOTVALSP500):
  """
  Calculate RSIZE: log of market equity relative to the S&P 500 total value,
  i.e. log((PRC * SHROUT) / TOTVALSP500).

  Works element-wise on numbers or Series.
  """
  # Precomputations
  ME = PRC * SHROUT

  # RSIZE Calculation. The log matches the notebook's RSIZE cell; the raw
  # ratio was previously returned without it.
  RSIZE = np.log(ME / TOTVALSP500)

  return RSIZE

def calculateCASHMTA(PRC, SHROUT, CHEQ, LTQ):
  """
  Calculate CASHMTA: cash (CHEQ) divided by market equity plus total
  liabilities, i.e. CHEQ / (PRC * SHROUT + LTQ).
  """
  marketEquity = PRC * SHROUT
  return CHEQ / (marketEquity + LTQ)
  
def createCustomExplanatoryVariables(explanatoryDataFrame, 
                                     explanatoryVariablesToCalculate=['NITA', 
                                                                      'NIMTA',
                                                                      'TLTA',
                                                                      'TLMTA',
                                                                      'EXRET',
                                                                      'RSIZE',
                                                                      'CASHMTA',
                                                                      'SIGMA'
                                                                      ],
                                     identifyingColumns = ['PERMNO',
                                                           'GVKEY',
                                                           'conm'
                                                           ],
                                     keepAllFeatures=False
                                     ):
  """
  Create Custom Explanatory Variables and add them to existing explanatory 
  Dataframe

  Parameters:
    explanatoryDataFrame: frame holding the input columns of every requested
      variable ('PRC', 'SHROUT', 'ceqq', 'atq', 'niq', 'ltq', 'cheq',
      'CFACPR', 'vwretdSP500', 'totvalSP500' as applicable) plus 'date_month'.
      The calculated columns are added to this frame in place.
    explanatoryVariablesToCalculate: names of the variables to produce.
      'SIGMA' is not calculated here; when listed it must already be a
      column of explanatoryDataFrame.
    identifyingColumns: identifier columns to keep in the reduced output.
    keepAllFeatures: when True, return the frame with every column.

  Returns:
    explanatoryDataFrame itself when keepAllFeatures is True; otherwise a
    frame with identifyingColumns, 'date_month' and the requested variables,
    in that order.
  """
  if 'NITA' in explanatoryVariablesToCalculate:
    # Calculate NITA
    explanatoryDataFrame['NITA'] = calculateNITA(PRC=explanatoryDataFrame['PRC'],
                                                 SHROUT=explanatoryDataFrame['SHROUT'],
                                                 CEQQ=explanatoryDataFrame['ceqq'],
                                                 ATQ=explanatoryDataFrame['atq'],
                                                 NIQ=explanatoryDataFrame['niq']
                                                 )

  if 'NIMTA' in explanatoryVariablesToCalculate:
    # Calculate NIMTA
    explanatoryDataFrame['NIMTA'] = calculateNIMTA(PRC=explanatoryDataFrame['PRC'],
                                                   SHROUT=explanatoryDataFrame['SHROUT'],
                                                   NIQ=explanatoryDataFrame['niq'],
                                                   LTQ=explanatoryDataFrame['ltq']
                                                   )

  if 'TLTA' in explanatoryVariablesToCalculate:
    # Calculate TLTA
    explanatoryDataFrame['TLTA'] = calculateTLTA(PRC=explanatoryDataFrame['PRC'],
                                                 SHROUT=explanatoryDataFrame['SHROUT'],
                                                 CEQQ=explanatoryDataFrame['ceqq'],
                                                 ATQ=explanatoryDataFrame['atq'],
                                                 LTQ=explanatoryDataFrame['ltq']
                                                 )

  if 'TLMTA' in explanatoryVariablesToCalculate:
    # Calculate TLMTA
    explanatoryDataFrame['TLMTA'] = calculateTLMTA(PRC=explanatoryDataFrame['PRC'],
                                                   SHROUT=explanatoryDataFrame['SHROUT'],
                                                   LTQ=explanatoryDataFrame['ltq']
                                                   )

  if 'EXRET' in explanatoryVariablesToCalculate:
    # Calculate EXRET
    explanatoryDataFrame['EXRET'] = calculateEXRET(PRC=explanatoryDataFrame['PRC'],
                                                   CFACPR=explanatoryDataFrame['CFACPR'],
                                                   VWRETDSP500=explanatoryDataFrame['vwretdSP500']
                                                   )

  if 'RSIZE' in explanatoryVariablesToCalculate:
    # Calculate RSIZE
    explanatoryDataFrame['RSIZE'] = calculateRSIZE(PRC=explanatoryDataFrame['PRC'],
                                                   SHROUT=explanatoryDataFrame['SHROUT'],
                                                   TOTVALSP500=explanatoryDataFrame['totvalSP500']
                                                   )

  if 'CASHMTA' in explanatoryVariablesToCalculate:
    # Calculate CASHMTA
    explanatoryDataFrame['CASHMTA'] = calculateCASHMTA(PRC=explanatoryDataFrame['PRC'],
                                                       SHROUT=explanatoryDataFrame['SHROUT'],
                                                       CHEQ=explanatoryDataFrame['cheq'],
                                                       LTQ=explanatoryDataFrame['ltq']
                                                       )

  if keepAllFeatures:
    return explanatoryDataFrame

  # Build a fresh list. Extending identifyingColumns itself would modify the
  # caller's list and the shared default argument, so a second call would
  # select duplicated columns.
  selectedColumns = [*identifyingColumns, 'date_month', *explanatoryVariablesToCalculate]
  return explanatoryDataFrame[selectedColumns]

def createXDataFrame(rawDataframes,
                     explanatoryVariablesToCalculate=['NITA', 
                                                      'NIMTA',
                                                      'TLTA',
                                                      'TLMTA',
                                                      'EXRET',
                                                      'RSIZE',
                                                      'CASHMTA',
                                                      'SIGMA'
                                                      ],
                     identifyingColumns = ['PERMNO', 'GVKEY','conm', 'cik'],
                     keepAllFeatures=False,
                     CRSP_COMPUSTAT_Accounting_features = ['atq', 'ceqq', 'cheq', 'ltq', 'niq'],
                     CRSP_COMPUSTAT_Identifying_features = ['GVKEY', 'conm', 'cik'],
                     CRSP_MONTHLY_features = ['PERMNO', 'date_month', 'PRC', 'SHROUT', 'CFACPR', 'RET'],
                     monthsToLagAccountingVariables=2
                     ):
  """
  Create X-Dataframe: prepare the four raw datasets, merge them, and compute
  the requested explanatory variables.

  Parameters
  ----------
  rawDataframes : sequence of exactly four items, unpacked in this order:
      CRSP_COMPUSTAT_MERGED, CRSP_MONTHLY, CRSP_DAILY, SP500_MONTHLY.
  explanatoryVariablesToCalculate : list of str
      Names of the explanatory variables to compute; forwarded to
      createCustomExplanatoryVariables.
  identifyingColumns : list of str
      Identifier columns forwarded to createCustomExplanatoryVariables.
  keepAllFeatures : bool
      Forwarded to createCustomExplanatoryVariables.
  CRSP_COMPUSTAT_Accounting_features, CRSP_COMPUSTAT_Identifying_features,
  CRSP_MONTHLY_features : list of str
      Column lists forwarded to mergeCrspCompustatMergedWithCrspMonthly.
  monthsToLagAccountingVariables : int
      Forwarded to prepareCrspCompustatMergedData (the notebook lags the
      accounting variables by 2 months by default).

  Returns
  -------
  The object returned by createCustomExplanatoryVariables.

  Notes
  -----
  Every list argument is copied before it is handed to a helper. The list
  defaults in the signature are single shared objects, and the helper defined
  just above this function in the file (it appears to be
  createCustomExplanatoryVariables) extends the identifying-column list in
  place. Without the copies, each call would permanently grow the default (or
  the caller's own list) and later calls would select duplicated columns.
  """
  # Load Raw Dataframes
  CRSP_COMPUSTAT_MERGED, CRSP_MONTHLY, CRSP_DAILY, SP500_MONTHLY = rawDataframes

  # Defensive copies: never let helpers mutate the shared defaults or caller lists
  explanatoryVariablesToCalculate = list(explanatoryVariablesToCalculate)
  identifyingColumns = list(identifyingColumns)
  CRSP_COMPUSTAT_Accounting_features = list(CRSP_COMPUSTAT_Accounting_features)
  CRSP_COMPUSTAT_Identifying_features = list(CRSP_COMPUSTAT_Identifying_features)
  CRSP_MONTHLY_features = list(CRSP_MONTHLY_features)

  # Prepare Data
  CRSP_COMPUSTAT_MERGED = prepareCrspCompustatMergedData(CRSP_COMPUSTAT_MERGED, monthsToLagAccountingVariables)
  CRSP_MONTHLY = prepareCrspMonthlyData(CRSP_MONTHLY)
  CRSP_DAILY = prepareCrspDailyData(CRSP_DAILY)
  SP500_MONTHLY = prepareSP500Data(SP500_MONTHLY)

  # Merge Dataframes
  explanatoryDataFrame = mergeCrspCompustatMergedWithCrspMonthly(CRSP_COMPUSTAT_MERGED, 
                                                                 CRSP_MONTHLY,
                                                                 CRSP_COMPUSTAT_Accounting_features,
                                                                 CRSP_COMPUSTAT_Identifying_features,
                                                                 CRSP_MONTHLY_features
                                                                 )
  explanatoryDataFrame = mergeExplanatoryDataframeWithCrspDaily(explanatoryDataFrame, CRSP_DAILY)
  explanatoryDataFrame = mergeExplanatoryDataframeWithSP500Monthly(explanatoryDataFrame, SP500_MONTHLY)

  # Create Explanatory Variables
  explanatoryDataFrame = createCustomExplanatoryVariables(explanatoryDataFrame, 
                                                          explanatoryVariablesToCalculate,
                                                          identifyingColumns,
                                                          keepAllFeatures
                                                          )

  return explanatoryDataFrame

## Testing

### Internal

In [None]:
# Read the four raw CSV extracts in the order createXDataFrame unpacks them
rawDataframes = [
    pd.read_csv('Data/Original/CRSP_COMPUSTAT_MERGED.csv'),
    pd.read_csv('Data/Original/CRSP_MONTHLY.csv'),
    pd.read_csv('Data/Original/CRSP_DAILY_REDUCED.csv'),
    pd.read_csv('Data/Original/SP500_MONTHLY.csv'),
]
# Keep the individual frames available under their usual names
CRSP_COMPUSTAT_MERGED, CRSP_MONTHLY, CRSP_DAILY, SP500_MONTHLY = rawDataframes

# Build the explanatory-variable frame with the notebook-defined function,
# carrying the delisting columns along and keeping every feature column
xDataFrame = createXDataFrame(
    rawDataframes,
    explanatoryVariablesToCalculate=[
        'NITA', 'NIMTA', 'TLTA', 'TLMTA',
        'EXRET', 'RSIZE', 'CASHMTA', 'SIGMA',
    ],
    CRSP_COMPUSTAT_Accounting_features=[
        'atq', 'ceqq', 'cheq', 'ltq', 'niq', 'dlrsn', 'dldte',
    ],
    keepAllFeatures=True,
)

  interactivity=interactivity, compiler=compiler, result=result)


In


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Preview the first rows of the frame built by the internal createXDataFrame run
xDataFrame.head()

In [None]:
# Sanity summary, one item per line: earliest month, latest month,
# frame shape, and the per-column count of missing values
print(
    xDataFrame['date_month'].min(),
    xDataFrame['date_month'].max(),
    xDataFrame.shape,
    xDataFrame.isna().sum(),
    sep='\n',
)

In [None]:
# Show the first rows again (same call as the preview cell above the summary)
xDataFrame.head()

### External

In [None]:
# "External" check: run createXDataFrame as imported from the saved wrapper
# module rather than from the definition inside this notebook.
import os
import pandas as pd
# Switch into the wrapper's directory before importing it; presumably the import
# relies on the working directory being on sys.path — TODO confirm.
os.chdir('/content/drive/Shared drives/Financial Modeling and Testing/Project 1/codeWrappers/dataFrameCreation/')
# NOTE: this rebinds createXDataFrame, shadowing the function defined in this notebook.
from createXDataframeWrapper import createXDataFrame
# Return to the project root so the relative 'Data/Original/...' paths below resolve.
os.chdir('/content/drive/Shared drives/Financial Modeling and Testing/Project 1/')

# Load the four raw CSV extracts, in the order createXDataFrame unpacks them
CRSP_COMPUSTAT_MERGED = pd.read_csv('Data/Original/CRSP_COMPUSTAT_MERGED.csv')
CRSP_MONTHLY = pd.read_csv('Data/Original/CRSP_MONTHLY.csv')
CRSP_DAILY = pd.read_csv('Data/Original/CRSP_DAILY_REDUCED.csv')
SP500_MONTHLY = pd.read_csv('Data/Original/SP500_MONTHLY.csv')

rawDataframes = [CRSP_COMPUSTAT_MERGED, CRSP_MONTHLY, CRSP_DAILY, SP500_MONTHLY]
# Same arguments as the internal test cell, so the two runs can be compared
xDataFrame = createXDataFrame(rawDataframes, 
                              explanatoryVariablesToCalculate=['NITA', 
                                                      'NIMTA',
                                                      'TLTA',
                                                      'TLMTA',
                                                      'EXRET',
                                                      'RSIZE',
                                                      'CASHMTA',
                                                      'SIGMA'
                                                      ],
                              CRSP_COMPUSTAT_Accounting_features = ['atq', 'ceqq', 'cheq', 'ltq', 'niq', 'dlrsn', 'dldte'],
                              keepAllFeatures=True
                              )

  interactivity=interactivity, compiler=compiler, result=result)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In


In [None]:
# Preview the first rows of the frame produced by the imported wrapper
xDataFrame.head()

# Plotting

In [None]:
# Read the paper's reference figures and index them by their 'year' column
paperData = pd.read_csv('Data/Original/paperData.csv').set_index(['year'])

## Number of Active Firms

In [None]:
# Monthly count of PERMNO entries, held as a one-column frame indexed by date_month
activeFirmsAnalysis = (
    xDataFrame.groupby(['date_month'])['PERMNO']
    .count()
    .to_frame()
)

# Tag every month with the year it belongs to
activeFirmsAnalysis['year'] = activeFirmsAnalysis.index.to_timestamp().to_period('y')

# For each year keep only its last available month, collapse to one value per
# year (the mean of that single row), and label the column for the comparison
activeFirmsAnalysis = (
    activeFirmsAnalysis.reset_index()
    .drop_duplicates(subset=['year'], keep='last')
    .groupby(['year'])['PERMNO']
    .mean()
    .to_frame()
    .rename(columns={'PERMNO': 'Active Firms (Calculated)'})
)

# Turn the yearly period labels into plain integers so they can be joined
# against paperData's index
activeFirmsAnalysis.index = activeFirmsAnalysis.index.astype(str).astype(int)
activeFirmsAnalysis = (
    activeFirmsAnalysis.join(paperData['activeFirms'])
    .rename(columns={'activeFirms': 'Active Firms (Paper)'})
)

# Plot the calculated series against the paper's series
activeFirmsAnalysis_Plot = activeFirmsAnalysis.plot(
    ylabel='Number of Active Firms',
    title='Created Universe vs Paper Universe',
)
fig = activeFirmsAnalysis_Plot.get_figure()
#fig.savefig('images/activeFirmsComparison.png')