In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
## Working directory check
# The loaders in the following cells use relative "data/..." paths, which are
# resolved against the directory printed here.
print(os.getcwd())

/Users/kunalchakraborty/Downloads/industry_project_HQ


## Fama-French Factors

### Start with 3 factors

In [3]:
### Load the 3-factor dataset and index it by month
# Skip the first three lines of the file and read a fixed window of 693 rows,
# using the unnamed first column as the index.
# NOTE(review): the 5-factor cell uses the same nrows=693 -- confirm that both
# files really contain that many monthly rows before the window ends.
ff3 = pd.read_csv(
    "data/ff3.csv",
    index_col='Unnamed: 0',
    skiprows=3,
    nrows=693,
)
# The index values are parsed as YYYYMM stamps and replaced by timestamps.
ff3.index = pd.to_datetime(ff3.index, format='%Y%m')

In [4]:
## Load the 5-factor dataset and index it by month
# Same recipe as the 3-factor cell: skip three lines, read 693 rows, and use the
# unnamed first column as the index.
# NOTE(review): nrows=693 is shared with the 3-factor cell -- confirm it matches
# the number of monthly rows in this file. Also note the upper-case ".CSV"
# extension, which matters on case-sensitive filesystems.
ff5 = pd.read_csv(
    "data/ff5.CSV",
    index_col='Unnamed: 0',
    skiprows=3,
    nrows=693,
)
# The index values are parsed as YYYYMM stamps and replaced by timestamps.
ff5.index = pd.to_datetime(ff5.index, format='%Y%m')

## Amit Goyal's factors

In [5]:
## Load the index data; the features are constructed from it in the cells that follow
data = pd.read_csv("data/index_data.csv")

In [7]:
# Inspect the last rows of the freshly loaded index data.
data.tail()

Unnamed: 0,date,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
739,2020-08-01,3500.31,59.128628,98.556667,,0.001,0.0225,0.0327,0.0065,-0.008504,0.0001,0.003153,-0.0349,-0.0488,0.000743,,0.072068,0.070308
740,2020-09-01,3363.0,58.851237,98.22,,0.0011,0.0231,0.0336,0.0068,-0.005698,0.0001,0.001393,0.008,0.0041,0.004907,,-0.038151,-0.039366
741,2020-10-01,3269.96,58.66044,96.856667,,0.001,0.0235,0.0344,0.0079,-0.001895,0.0001,0.000415,-0.0238,-0.019,0.003661,,-0.026409,-0.027507
742,2020-11-01,3621.63,58.469643,95.493333,,0.0009,0.023,0.033,0.0087,-0.005262,0.0001,-0.000611,0.0093,0.0509,0.002492,,0.109403,0.107623
743,2020-12-01,3756.07,58.278846,94.13,,0.0009,0.0226,0.0316,0.0093,-9.4e-05,0.0001,0.000941,-0.0115,0.0,0.000678,,0.041573,0.040138


In [8]:
## Starter experiment: build features from Amit Goyal's data
## Net return on the S&P index
# Payoff for the row: the index level plus one twelfth of D12.
# NOTE(review): D12 is presumably a trailing 12-month dividend figure, so D12/12
# approximates one month of dividends -- confirm against the data source.
payoff = data["Index"] + data["D12"] / 12
previous_level = data["Index"].shift()
# Net return = payoff over the previous row's index level, minus 1.
# The first row has no previous level, so its return is NaN.
data["returns"] = payoff / previous_level - 1

In [9]:
# Logs of the three level series that the valuation ratios below are built from.
log_dividends = np.log(data['D12'])
log_earnings = np.log(data['E12'])
log_index = np.log(data['Index'])

# Equity premium: the total rate of return on the stock market minus the
# prevailing short-term interest rate (Rfree).
data['premium'] = data['returns'] - data['Rfree']

# Dividend Price Ratio (d/p): log of dividends minus log of prices.
data['d/p'] = log_dividends - log_index

# Dividend Yield (d/y): log of dividends minus log of lagged prices
# (previous row's index level, so the first row is NaN).
data['d/y'] = log_dividends - log_index.shift(1)

# Earnings Price Ratio (e/p): log of earnings minus log of prices.
data['e/p'] = log_earnings - log_index

# Dividend Payout Ratio (d/e): log of dividends minus log of earnings.
data['d/e'] = log_dividends - log_earnings

# Term Spread (tms): long-term government bond yield minus the T-bill rate.
data['tms'] = data['lty'] - data['tbl']

# Default Yield Spread (dfy): BAA-rated minus AAA-rated corporate bond yields.
data['dfy'] = data['BAA'] - data['AAA']

# Default Return Spread (dfr): return on long-term corporate bonds minus the
# return on long-term government bonds.
data['dfr'] = data['corpr'] - data['ltr']

In [11]:
# Check the derived columns on the last rows of the frame.
data.tail()

Unnamed: 0,date,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,...,CRSP_SPvwx,returns,premium,d/p,d/y,e/p,d/e,tms,dfy,dfr
739,2020-08-01,3500.31,59.128628,98.556667,,0.001,0.0225,0.0327,0.0065,-0.008504,...,0.070308,0.071571,0.071471,-4.080892,-4.013173,-3.569975,-0.510916,0.0055,0.0102,-0.0139
740,2020-09-01,3363.0,58.851237,98.22,,0.0011,0.0231,0.0336,0.0068,-0.005698,...,-0.039366,-0.037827,-0.037927,-4.045576,-4.085594,-3.533379,-0.512197,0.0057,0.0105,-0.0039
741,2020-10-01,3269.96,58.66044,96.856667,,0.001,0.0235,0.0344,0.0079,-0.001895,...,-0.027507,-0.026212,-0.026312,-4.020767,-4.048823,-3.519301,-0.501467,0.0069,0.0109,0.0048
742,2020-11-01,3621.63,58.469643,95.493333,,0.0009,0.023,0.033,0.0087,-0.005262,...,0.107623,0.109036,0.108936,-4.126172,-4.024025,-3.635623,-0.490549,0.0078,0.01,0.0416
743,2020-12-01,3756.07,58.278846,94.13,,0.0009,0.0226,0.0316,0.0093,-9.4e-05,...,0.040138,0.038462,0.038362,-4.165889,-4.12944,-3.686452,-0.479438,0.0084,0.009,0.0115


In [12]:
# Write the frame, including the derived columns, to CSV; index=False leaves the row index out of the file.
data.to_csv("data/ag_transformed_features.csv",index=False)