In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm.notebook import tqdm

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
def createRollingWindow(dataset, look_back = 1):
    """
    Build flattened rolling windows over the rows of a 2 dimensional dataset.

    Window ``i`` concatenates all variables at row ``i`` with all variables at
    row ``i + 1`` and so on up to row ``i + look_back - 1``. The last
    ``look_back`` rows never start a window, so window ``i`` can be paired
    with a target observed at row ``i + look_back``.

    The windows are built with one vectorized NumPy indexing operation rather
    than a row-by-row loop, so no progress bar is displayed.

    Parameters
    ----------
    dataset : pandas.DataFrame or array-like of shape (No_Obs, No_Vars)
        Numeric observations, one row per time step. A 1 dimensional input is
        treated as a single variable.
    look_back : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    pandas.DataFrame
        float64 frame of shape (No_Obs - look_back, look_back * No_Vars) with
        default integer row and column labels.

    Raises
    ------
    ValueError
        If ``dataset`` has more than 2 dimensions, or if ``look_back`` is
        negative or larger than the number of rows.
    """
    values = np.asarray(dataset, dtype='float64')
    if values.ndim == 1:
        # Promote a single series to a one-column matrix.
        values = values[:, np.newaxis]
    if values.ndim != 2:
        raise ValueError(
            "dataset must be 1 or 2 dimensional, got %d dimensions" % values.ndim)

    n_obs, n_vars = values.shape
    if not 0 <= look_back <= n_obs:
        raise ValueError(
            "look_back must be between 0 and the number of rows (%d), got %r"
            % (n_obs, look_back))

    n_windows = n_obs - look_back
    # Row i of `row_positions` is [i, i + 1, ..., i + look_back - 1]; indexing
    # with it gathers every window at once as (n_windows, look_back, n_vars).
    row_positions = np.arange(n_windows)[:, np.newaxis] + np.arange(look_back)
    # The explicit target shape (instead of -1) also works when n_windows == 0.
    windows = values[row_positions].reshape(n_windows, look_back * n_vars)
    return pd.DataFrame(windows)

In [4]:
def createRollingWindow1D(dataset, look_back = 1):
    """
    Build rolling windows of size look_back over a 1 dimensional series.

    Row ``i`` of the result holds the observations at positions
    ``i, i + 1, ..., i + look_back - 1``. The last ``look_back`` observations
    never start a window, so window ``i`` can be paired with a target observed
    at position ``i + look_back``.

    The windows are built with one vectorized NumPy indexing operation rather
    than a row-by-row loop, so no progress bar is displayed.

    Parameters
    ----------
    dataset : pandas.Series or array-like of shape (No_Obs,)
        Numeric observations in time order. A single-column 2 dimensional
        input is accepted and treated as that one column.
    look_back : int, default 1
        Number of consecutive observations in each window.

    Returns
    -------
    pandas.DataFrame
        float64 frame of shape (No_Obs - look_back, look_back) with default
        integer row and column labels.

    Raises
    ------
    ValueError
        If ``dataset`` is not a single series, or if ``look_back`` is negative
        or larger than the number of observations.
    """
    values = np.asarray(dataset, dtype='float64')
    if values.ndim == 2 and values.shape[1] == 1:
        # A one-column frame is still a single series.
        values = values[:, 0]
    if values.ndim != 1:
        raise ValueError(
            "dataset must be a single series, got shape %r" % (values.shape,))

    n_obs = values.shape[0]
    if not 0 <= look_back <= n_obs:
        raise ValueError(
            "look_back must be between 0 and the number of observations (%d), got %r"
            % (n_obs, look_back))

    n_windows = n_obs - look_back
    # Row i of `positions` is [i, i + 1, ..., i + look_back - 1]; indexing with
    # it gathers every window at once as an (n_windows, look_back) matrix.
    positions = np.arange(n_windows)[:, np.newaxis] + np.arange(look_back)
    return pd.DataFrame(values[positions])

## Reading Data
First, we load the relevant data from the Excel workbook for use in our analysis.

In [5]:
# Load the 'Equity premium' sheet into a frame indexed by month.
ep = (
    pd.read_excel(
        'data/Augemented_Formatted_results.xls',
        sheet_name='Equity premium',
        skiprows=range(1118, 1127),  # 0-indexed sheet rows left out of the read
    )
    .iloc[:-1]  # drop the last row that was read
    # Dates are stored as YYYYMM; parse them into pandas timestamps.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m'))
    .set_index('Date')
    # Keep observations from December 1950 onwards.
    .loc[lambda frame: frame.index >= '1950-12-01']
)

In [6]:
# Read the macroeconomic variables into a frame indexed by month.
mev = (
    pd.read_excel(
        'data/Augemented_Formatted_results.xls',
        sheet_name='Macroeconomic variables',
        skiprows=range(1118, 1126),  # 0-indexed sheet rows left out of the read
    )
    # Backward-fill missing values. `.bfill()` replaces the deprecated
    # `fillna(method='bfill')` spelling and fills exactly the same way.
    # NOTE(review): backward fill copies later observations into earlier rows;
    # confirm this is acceptable for an out-of-sample forecasting exercise.
    .bfill()
    .iloc[:-1]  # drop the last row that was read
    # Remove empty columns, which pandas labels 'Unnamed: ...'.
    .loc[:, lambda frame: ~frame.columns.str.match('Unnamed')]
    # Dates are stored as YYYYMM; parse them into pandas timestamps.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m'))
    .set_index('Date')
    # Keep observations from December 1950 onwards.
    .loc[lambda frame: frame.index >= '1950-12-01']
)

In [7]:
# Load the 'Technical indicators' sheet into a frame indexed by month.
ta = (
    pd.read_excel(
        'data/Augemented_Formatted_results.xls',
        sheet_name='Technical indicators',
        skiprows=range(1118, 1119),  # 0-indexed sheet row left out of the read
    )
    .iloc[:-1]  # drop the last row that was read
    # Dates are stored as YYYYMM; parse them into pandas timestamps.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m'))
    .set_index('Date')
    # Keep observations from December 1950 onwards.
    .loc[lambda frame: frame.index >= '1950-12-01']
)

# Random Forest
In the code below we train a random forest for each macroeconomic variable and technical indicator separately, using a rolling window of the past 12 months of that variable/indicator. The recorded R2 score is based on in-sample analysis, whereas the MAE, MSE and RMSE are calculated out of sample. The random forest is therefore trained using rolling windows from 1950:12 to 1965:12, yielding 180-12=168 rolling windows. The model is then assessed in terms of prediction accuracy (MAE, MSE and RMSE) using data from 1966:01 to 2019:12, yielding 648 rolling windows.
### Macro Economic Variables

In [8]:
# Align the targets with the rolling windows: the window starting at row i
# covers rows i .. i+11, so its one-month-ahead target is the premium at row
# i+12. Dropping the first 12 premiums and renumbering from 0 gives exactly
# that pairing. Only the log equity premium is kept, as a float64 series.
y = (
    ep['Log equity premium']
    .iloc[12:]
    .reset_index(drop=True)
    .astype('float64')
)

In [9]:
# Map each macroeconomic variable to its 2D array of 12-month rolling windows.
rollingWindowsMEV = {
    variable: createRollingWindow1D(mev[variable], 12)
    for variable in mev.columns
}

HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




In [18]:
# Fit one random forest per macroeconomic variable and collect its metrics.
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []

for variable in mev.columns:
    # shuffle=False keeps the windows in chronological order: the first 168
    # train the model and the remainder are held out. No random_state is
    # needed because nothing is shuffled.
    X_train, X_test, y_train, y_test = train_test_split(
        rollingWindowsMEV[variable], y, train_size=168, shuffle=False)
    reg = RandomForestRegressor(random_state = 42).fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
    records.append({'Variable' : variable,
                    'R2':  reg.score(X_train, y_train),  # in-sample fit on the training windows
                    'MAE': metrics.mean_absolute_error(y_test, y_pred),  # out of sample
                    'MSE': mse,
                    'RMSE': np.sqrt(mse)})

# Passing `columns` fixes the column order and still yields the expected
# (empty) table when there are no variables.
df = pd.DataFrame(records, columns=['Variable', 'R2', 'MAE', 'MSE', 'RMSE'])

In [19]:
# Display the per-variable metrics table (one row per macroeconomic variable).
df

Unnamed: 0,Variable,R2,MAE,MSE,RMSE
0,DP,0.852684,0.053325,0.004161,0.064506
1,DY,0.847577,0.051326,0.003833,0.061911
2,EP,0.854185,0.038757,0.002454,0.049538
3,DE,0.844104,0.035133,0.002153,0.046395
4,RVOL,0.831876,0.033817,0.002062,0.045411
5,BM,0.833905,0.040394,0.002495,0.049947
6,NTIS,0.851369,0.032983,0.001983,0.044532
7,TBL (ann %),0.848862,0.045687,0.003251,0.057021
8,LTY (ann %),0.849903,0.045913,0.003166,0.056272
9,LTR (%),0.851968,0.034124,0.002075,0.045548


### Technical Indicators


In [12]:
# Align the targets with the rolling windows: the window starting at row i
# covers rows i .. i+11, so its one-month-ahead target is the premium at row
# i+12. Dropping the first 12 premiums and renumbering from 0 gives exactly
# that pairing. Only the log equity premium is kept, as a float64 series.
y = (
    ep['Log equity premium']
    .iloc[12:]
    .reset_index(drop=True)
    .astype('float64')
)

In [13]:
# Map each technical indicator to its 2D array of 12-month rolling windows.
rollingWindowsTA = {
    variable: createRollingWindow1D(ta[variable], 12)
    for variable in ta.columns
}

HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




In [14]:
# Fit one random forest per technical indicator and collect its metrics.
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []

for variable in ta.columns:
    # shuffle=False keeps the windows in chronological order: the first 168
    # train the model and the remainder are held out. No random_state is
    # needed because nothing is shuffled.
    X_train, X_test, y_train, y_test = train_test_split(
        rollingWindowsTA[variable], y, train_size=168, shuffle=False)
    reg = RandomForestRegressor(random_state = 42).fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
    records.append({'Variable' : variable,
                    'R2':  reg.score(X_train, y_train),  # in-sample fit on the training windows
                    'MAE': metrics.mean_absolute_error(y_test, y_pred),  # out of sample
                    'MSE': mse,
                    'RMSE': np.sqrt(mse)})

# Passing `columns` fixes the column order and still yields the expected
# (empty) table when there are no indicators.
df = pd.DataFrame(records, columns=['Variable', 'R2', 'MAE', 'MSE', 'RMSE'])

In [15]:
# Display the per-variable metrics table (one row per technical indicator).
df

Unnamed: 0,Variable,R2,MAE,MSE,RMSE
0,"MA(1,9)",0.392204,0.035244,0.002182,0.046716
1,"MA(1,12)",0.291008,0.034091,0.002076,0.045561
2,"MA(2,9)",0.318575,0.035857,0.002245,0.047378
3,"MA(2,12)",0.247087,0.035082,0.002215,0.047065
4,"MA(3,9)",0.265368,0.034526,0.002157,0.046446
5,"MA(3,12)",0.23982,0.034605,0.002143,0.046292
6,MOM(9),0.35589,0.035246,0.002255,0.047485
7,MOM(12),0.322391,0.034443,0.002161,0.046482
8,"VOL(1,9)",0.510372,0.034452,0.002123,0.046074
9,"VOL(1,12)",0.410273,0.034906,0.002182,0.046713
