# OLS Baseline 
In this notebook an OLS estimation will be performed which serves as a baseline to compare the other methods with. It also serves as the proof-of-concept to figure out how to structure the data correctly and save results before moving on to other architectures.

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm.notebook import tqdm

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [3]:
def createRollingWindow(dataset, look_back = 1):
    """
    Build flattened rolling windows over the rows of a 2 dimensional dataset.

    Window i holds rows i, i+1, ..., i+look_back-1 of `dataset`, flattened row by row:
    all variables at time t, then all variables at time t+1, and so on up to t+look_back-1.
    Only No_Obs - look_back windows are produced, i.e. the last complete window (the one that
    ends on the final row) is not emitted.

    Parameters
    ----------
    dataset : pandas.DataFrame or 2 dimensional array-like
        Numeric observations, one row per time step and one column per variable.
    look_back : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    pandas.DataFrame
        float64 frame of size [No_Obs - look_back, look_back * No_Vars] with default
        integer row and column labels.

    Raises
    ------
    ValueError
        If `dataset` is not 2 dimensional, or `look_back` is negative or larger than the
        number of rows.
    """
    values = np.asarray(dataset, dtype='float64')
    if values.ndim != 2:
        raise ValueError("dataset must be 2 dimensional, got %d dimension(s)" % values.ndim)

    n_obs, n_vars = values.shape
    if not 0 <= look_back <= n_obs:
        raise ValueError("look_back must be between 0 and the number of rows (%d), got %r"
                         % (n_obs, look_back))

    n_windows = n_obs - look_back
    # row_idx[i, j] = i + j: the j-th row of window i. Indexing with it gathers every window
    # in one vectorized step instead of assigning DataFrame rows one at a time.
    row_idx = np.arange(n_windows)[:, None] + np.arange(look_back)[None, :]
    windows = values[row_idx]  # shape (n_windows, look_back, n_vars)
    return pd.DataFrame(windows.reshape(n_windows, look_back * n_vars))

In [4]:
def createRollingWindow1D(dataset, look_back = 1):
    """
    Build rolling windows of size look_back over a 1 dimensional series.

    Row i of the result holds observations i, i+1, ..., i+look_back-1 of `dataset`.
    Only No_Obs - look_back windows are produced, i.e. the last complete window (the one that
    ends on the final observation) is not emitted.

    Parameters
    ----------
    dataset : pandas.Series or 1 dimensional array-like
        Numeric observations ordered in time. A single-column 2 dimensional input is
        accepted and treated as that column.
    look_back : int, default 1
        Number of consecutive observations in each window.

    Returns
    -------
    pandas.DataFrame
        float64 frame of size [No_Obs - look_back, look_back] with default integer row and
        column labels.

    Raises
    ------
    ValueError
        If `dataset` is not 1 dimensional (or a single column), or `look_back` is negative
        or larger than the number of observations.
    """
    values = np.asarray(dataset, dtype='float64')
    if values.ndim == 2 and values.shape[1] == 1:
        values = values[:, 0]
    if values.ndim != 1:
        raise ValueError("dataset must be 1 dimensional or a single column")

    n_obs = values.shape[0]
    if not 0 <= look_back <= n_obs:
        raise ValueError("look_back must be between 0 and the number of observations (%d), got %r"
                         % (n_obs, look_back))

    n_windows = n_obs - look_back
    # row_idx[i, j] = i + j: the j-th observation of window i. Indexing with it gathers every
    # window in one vectorized step instead of assigning DataFrame rows one at a time.
    row_idx = np.arange(n_windows)[:, None] + np.arange(look_back)[None, :]
    return pd.DataFrame(values[row_idx])

## Reading Data
First we load the relevant data from the Excel workbook to be used in our analysis.

In [5]:
# Read the equity premium sheet into a dataframe indexed by month.
# skiprows leaves out the listed (0-indexed) spreadsheet lines and .iloc[:-1] drops the last
# remaining row -- presumably non-data rows at the bottom of the sheet; TODO confirm.
ep = (
    pd.read_excel('data/Augemented_Formatted_results.xls',
                  sheet_name='Equity premium',
                  skiprows=range(1118, 1127, 1))
    .iloc[:-1]
    .assign(Date=lambda sheet: pd.to_datetime(sheet['Date'], format='%Y%m'))  # YYYYMM -> timestamp
    .set_index('Date')
    .loc[lambda sheet: sheet.index >= '1950-12-01']  # keep 1950-12 onwards
)

In [6]:
# Read the macroeconomic variables sheet into a dataframe indexed by month.
# skiprows leaves out the listed (0-indexed) spreadsheet lines and .iloc[:-1] drops the last
# remaining row -- presumably non-data rows at the bottom of the sheet; TODO confirm.
# NOTE(review): backward fill copies the next later value into an earlier gap. In a forecasting
# exercise this can leak future information -- confirm where the missing values actually are.
mev = (
    pd.read_excel('data/Augemented_Formatted_results.xls',
                  sheet_name='Macroeconomic variables',
                  skiprows=range(1118, 1126, 1))
    .bfill()  # backward fill missing values (replaces the deprecated fillna(method='bfill'))
    .iloc[:-1]
    .loc[:, lambda sheet: ~sheet.columns.str.match('Unnamed')]  # remove empty columns
    .assign(Date=lambda sheet: pd.to_datetime(sheet['Date'], format='%Y%m'))  # YYYYMM -> timestamp
    .set_index('Date')  # set date as index
    .loc[lambda sheet: sheet.index >= '1950-12-01']  # keep 1950-12 onwards
)

In [7]:
# Read the technical indicators sheet into a dataframe indexed by month.
# skiprows leaves out the listed (0-indexed) spreadsheet line and .iloc[:-1] drops the last
# remaining row -- presumably non-data rows at the bottom of the sheet; TODO confirm.
ta = (
    pd.read_excel('data/Augemented_Formatted_results.xls',
                  sheet_name='Technical indicators',
                  skiprows=range(1118, 1119, 1))
    .iloc[:-1]
    .assign(Date=lambda sheet: pd.to_datetime(sheet['Date'], format='%Y%m'))  # YYYYMM -> timestamp
    .set_index('Date')
    .loc[lambda sheet: sheet.index >= '1950-12-01']  # keep 1950-12 onwards
)

# Comparison to Rapach
I start by recreating the analysis as done in the Neely, Rapach, Tu and Zhou (2014) paper as a starting reference point. Seeing as the results do line up with those presented in the paper, we can confirm that the dataset is the same and all is in order. 

It is the exact in-sample predictive regression as run in the Rapach paper, the results of which can be found in Table 2. It is the following bivariate regression, which is run on the data from 1951:01 - 2011:12:  
$$ r_{t+1} = \alpha_i +\beta_i q_{i,t} + \epsilon_{i,t+1}$$

First I recreate the MEV regression followed by the TA regressions to make sure the results line up. 

### MEV

In [8]:
# Target: the log equity premium of the NEXT month, so that row t of y is r_{t+1}.
# Only the rows up to 2011-12 are kept, minus one because the shift leaves no target for the
# final in-sample month. The index is reset to 0..n-1.
y = (
    ep['Log equity premium']
    .shift(periods=-1)
    .iloc[:(ep.index <= '2011-12-01').sum() - 1]
    .reset_index(drop=True)
    .astype('float64')
)

# Predictors: the same number of leading rows of the macroeconomic variables, so that the data
# matrix lines up row by row with the shifted equity premium.
X = mev.iloc[:(mev.index <= '2011-12-01').sum() - 1]

In [9]:
# In-sample bivariate regression of next month's log equity premium on each macroeconomic
# variable separately. One result row is collected per variable and the table is built once at
# the end: growing a DataFrame with DataFrame.append inside the loop is quadratic and the method
# no longer exists in pandas >= 2.0.
rows = []
for variable in mev.columns:
    features = X[variable].values.reshape(-1, 1)  # sklearn expects a 2D (n_samples, 1) matrix
    reg = LinearRegression().fit(features, y)
    rows.append({'Variable' : variable,
                 'Coef' : reg.coef_[0],
                 'Intercept' : reg.intercept_,
                 'R2':  reg.score(features, y)})  # in-sample R2

df = pd.DataFrame(rows, columns=['Variable', 'Coef', 'Intercept', 'R2'])

    

In [10]:
# Display the per-variable in-sample regression results for the macroeconomic variables.
df

Unnamed: 0,Variable,Coef,Intercept,R2
0,DP,0.007794,0.031787,0.005802
1,DY,0.008356,0.0337,0.006702
2,EP,0.004349,0.016671,0.001982
3,DE,0.005918,0.008803,0.001708
4,RVOL,0.074078,-0.006035,0.007336
5,BM,0.005359,0.001713,0.000988
6,NTIS,-0.006571,0.004691,8e-06
7,TBL (ann %),-0.001084,0.009667,0.005648
8,LTY (ann %),-0.000756,0.009371,0.002256
9,LTR (%),0.001345,0.00385,0.00757


### TA

In [11]:
# Keep the technical-indicator rows up to 2011-12 and drop the last of them, so that the data
# matrix has the same number of rows as the shifted equity premium y.
X = ta.iloc[:(ta.index <= '2011-12-01').sum() - 1]

In [12]:
# In-sample bivariate regression of next month's log equity premium on each technical
# indicator separately. One result row is collected per variable and the table is built once at
# the end: growing a DataFrame with DataFrame.append inside the loop is quadratic and the method
# no longer exists in pandas >= 2.0.
rows = []
for variable in ta.columns:
    features = X[variable].values.reshape(-1, 1)  # sklearn expects a 2D (n_samples, 1) matrix
    reg = LinearRegression().fit(features, y)
    rows.append({'Variable' : variable,
                 'Coef' : reg.coef_[0],
                 'Intercept' : reg.intercept_,
                 'R2':  reg.score(features, y)})  # in-sample R2

df = pd.DataFrame(rows, columns=['Variable', 'Coef', 'Intercept', 'R2'])

In [13]:
# Display the per-variable in-sample regression results for the technical indicators.
df

Unnamed: 0,Variable,Coef,Intercept,R2
0,"MA(1,9)",0.006672,7.9e-05,0.00538
1,"MA(1,12)",0.008684,-0.001518,0.008683
2,"MA(2,9)",0.007026,-0.000198,0.005913
3,"MA(2,12)",0.009441,-0.002038,0.01029
4,"MA(3,9)",0.007652,-0.000667,0.006948
5,"MA(3,12)",0.005433,0.000747,0.003371
6,MOM(9),0.005455,0.000761,0.003435
7,MOM(12),0.005752,0.000442,0.003672
8,"VOL(1,9)",0.006776,8.3e-05,0.005644
9,"VOL(1,12)",0.008881,-0.001572,0.009247


## Rolling Window Regression

In the analysis below I have implemented multiple versions of a rolling window variation of the regression.
1. First we create a rolling window of all the MEVs of the past 12 months and concatenate them into 1 vector which serves as input for the 1 month out of sample log equity risk premium. 
2. Secondly we create a rolling window of each MEV separately of the past 12 months and concatenate them into 1 vector which serves as input for the 1 month out of sample log equity risk premium. The difference with the first regression is thus that in this regression we run the analysis for each variable separately, to be more in line with Neely, Rapach, Tu and Zhou (2014).

### Data restructuring
We must create rolling windows of the Macro Economic Variables (MEV) and match them with the 1 month out of sample equity premium in order to train a model. 

In [14]:
#Create rolling window version of the MEV dataset.  
# Each row of X_mev is one 12-month window: all MEV columns of the first month, then all MEV
# columns of the second month, and so on (see createRollingWindow).
X_mev = createRollingWindow(mev, look_back = 12)

HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




In [15]:
# Target for window i (rows i..i+11 of the predictors) is the log equity premium of row i+12,
# i.e. the month directly after the window. Shifting by -12 and dropping the last 12 rows
# (which have no target) lines y up with the rolling windows; the index is reset to 0..n-1.
y = (
    ep['Log equity premium']
    .shift(periods=-12)
    .iloc[:ep.shape[0] - 12]
    .reset_index(drop=True)
    .astype('float64')
)

### Train OLS Model Windowed
Create rolling windows where we try to predict the 1 month out of sample equity premium based on the previous 12 months of Macro economic variables.

In [16]:
#Create Train and test set
# shuffle=False keeps the rows in their original (chronological) order: the first 168 windows
# form the training set and all remaining windows form the test set.
# NOTE(review): random_state should have no effect for an unshuffled split -- confirm against
# the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(X_mev, y, train_size=168, random_state=0, shuffle=False)

In [17]:
#Train a linear regression model on MEV rolling window data and the corresponding 1 month out of sample equity premium. 
# The design matrix has look_back * (number of MEV columns) regressors but only 168 training rows.
# NOTE(review): the coefficients printed by the next cell are of order 1e8 with offsetting signs,
# which suggests (near-)collinear regressors -- treat the individual coefficients with caution.
reg = LinearRegression().fit(X_train, y_train)
coefficients = reg.coef_  # one coefficient per column of X_train
intercept = reg.intercept_

In [18]:
# Predict the held-out (test) equity premia with the fitted model.
y_pred = reg.predict(X_test)

# Fitted parameters of the model.
print('Coefficients: ', coefficients)
print('Intercept: ', intercept)

# Out-of-sample error metrics; the MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
for label, score in [
    ('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', np.sqrt(mse)),
    ('R2:', metrics.r2_score(y_test, y_pred)),
    ('Explained Variance:', metrics.explained_variance_score(y_test, y_pred)),
]:
    print(label, score)

Coefficients:  [-2.65252665e+07  1.76773912e-01  2.65252536e+07  2.65252645e+07
 -2.48042903e-01  1.24187343e+00 -3.79275290e+00 -4.35398443e+08
  4.35352041e+08  1.59326858e-02 -4.35352041e+08  9.09159188e-01
 -1.57406129e-02  3.41614032e-02  5.56815926e+07  5.05635120e+00
  5.40864430e+07  1.20185387e+01 -5.40864550e+07 -5.40864488e+07
 -6.27266749e-01 -2.95883028e+00  2.33537704e+00 -4.93021529e+08
  4.93024170e+08  3.98614294e-02 -4.93024171e+08  2.63886321e-01
  1.32358202e-03  1.18363307e-02 -3.16963639e+06 -4.35331152e-01
  1.65885713e+08  1.36194616e+01 -1.65885725e+08 -1.65885732e+08
  9.03046506e-01  3.69012147e-01 -3.83640925e+00 -5.72609186e+08
  5.72605039e+08  8.27989876e-02 -5.72605039e+08  2.16672525e-01
  3.26282680e-02  7.93452840e-03  4.97653085e+06 -5.70604092e-01
  2.41813499e+08  1.17532047e+01 -2.41813519e+08 -2.41813511e+08
  2.25737442e-01  1.70433645e+00  5.07369753e+00 -5.79175805e+08
  5.79162898e+08  8.74013156e-02 -5.79162898e+08 -4.50054422e-01
  4.949659

## Rolling window OLS estimation for each variable separately
Up till now I have combined all variables into 1 window; however, this is not in line with Neely, Rapach, Tu and Zhou (2014). They run separate regressions for each macroeconomic variable and TA variable, and hence that is what I should also do. 

rollingWindowsMEV is a dictionary which will be filled with the 2D dataframes containing the rolling windows for a single variable. Thus rollingWindowsMEV['DP'] would yield the 2D matrix containing the rolling windows for the variable DP and will be of size [No_obs - window_size, window_size], or in our case [829 obs - 12, 12] = [817, 12]. The dictionary is thus accessible with the variable name as index key, and these names can be obtained from the original data through mev.columns, which yields an array of the column names of the original dataframe.

In [19]:
# Target for window i (observations i..i+11 of a predictor) is the log equity premium of row
# i+12, i.e. the month directly after the window. Shifting by -12 and dropping the last 12 rows
# (which have no target) lines y up with the rolling windows; the index is reset to 0..n-1.
y = (
    ep['Log equity premium']
    .shift(periods=-12)
    .iloc[:ep.shape[0] - 12]
    .reset_index(drop=True)
    .astype('float64')
)

In [20]:
# Map every macroeconomic variable name to its own 2D frame of 12-month rolling windows.
rollingWindowsMEV = {
    column: createRollingWindow1D(mev[column], 12)
    for column in mev.columns
}

HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




In [21]:
# One rolling-window OLS per macroeconomic variable: the first 168 windows train the model and
# the remaining windows are used for the out-of-sample error metrics (shuffle=False keeps the
# rows in their original order).
# One result row is collected per variable and the table is built once at the end: growing a
# DataFrame with DataFrame.append inside the loop is quadratic and the method no longer exists
# in pandas >= 2.0.
rows = []

for variable in mev.columns:
    X_train, X_test, y_train, y_test = train_test_split(rollingWindowsMEV[variable], y, train_size=168, random_state=0, shuffle=False)
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
    rows.append({'Variable' : variable,
                 # NOTE: only the first of the 12 window coefficients is reported; column 0 of a
                 # window is its earliest observation.
                 'Coef' : reg.coef_[0],
                 'Intercept' : reg.intercept_,
                 # NOTE: R2 is measured on the training rows, whereas MAE/MSE/RMSE below are
                 # measured on the test rows.
                 'R2':  reg.score(X_train, y_train),
                 'MAE': metrics.mean_absolute_error(y_test, y_pred),
                 'MSE': mse,
                 'RMSE': np.sqrt(mse)})

df = pd.DataFrame(rows, columns=['Variable', 'Coef', 'Intercept', 'R2', 'MAE', 'MSE', 'RMSE'])

In [22]:
# Display the per-variable rolling-window regression results for the macroeconomic variables.
df

Unnamed: 0,Variable,Coef,Intercept,R2,MAE,MSE,RMSE
0,DP,-0.008935,0.06491,0.07277,0.033165,0.001938,0.044019
1,DY,0.001962,0.068221,0.075063,0.033355,0.001962,0.044296
2,EP,0.014268,0.049222,0.067791,0.033403,0.002025,0.045
3,DE,-0.030262,0.005276,0.053072,0.037623,0.003593,0.059945
4,RVOL,-0.107951,0.008718,0.079077,0.033354,0.002043,0.045201
5,BM,0.051665,-0.027054,0.0829,0.036092,0.002142,0.04628
6,NTIS,-0.536463,0.023586,0.073837,0.036967,0.002435,0.049346
7,TBL (ann %),-0.007852,0.027838,0.140212,0.040772,0.002928,0.054112
8,LTY (ann %),-0.007834,0.03937,0.095641,0.043895,0.003384,0.058174
9,LTR (%),0.004386,0.005874,0.100159,0.038467,0.0025,0.049996


In [23]:
# Map every technical indicator name to its own 2D frame of 12-month rolling windows.
rollingWindowsTA = {
    column: createRollingWindow1D(ta[column], 12)
    for column in ta.columns
}

HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=817.0), HTML(value='')))




In [24]:
# One rolling-window OLS per technical indicator: the first 168 windows train the model and
# the remaining windows are used for the out-of-sample error metrics (shuffle=False keeps the
# rows in their original order).
# One result row is collected per variable and the table is built once at the end: growing a
# DataFrame with DataFrame.append inside the loop is quadratic and the method no longer exists
# in pandas >= 2.0.
rows = []

for variable in ta.columns:
    X_train, X_test, y_train, y_test = train_test_split(rollingWindowsTA[variable], y, train_size=168, random_state=0, shuffle=False)
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
    rows.append({'Variable' : variable,
                 # NOTE: only the first of the 12 window coefficients is reported; column 0 of a
                 # window is its earliest observation.
                 'Coef' : reg.coef_[0],
                 'Intercept' : reg.intercept_,
                 # NOTE: R2 is measured on the training rows, whereas MAE/MSE/RMSE below are
                 # measured on the test rows.
                 'R2':  reg.score(X_train, y_train),
                 'MAE': metrics.mean_absolute_error(y_test, y_pred),
                 'MSE': mse,
                 'RMSE': np.sqrt(mse)})

df = pd.DataFrame(rows, columns=['Variable', 'Coef', 'Intercept', 'R2', 'MAE', 'MSE', 'RMSE'])

In [25]:
# Display the per-variable rolling-window regression results for the technical indicators.
df

Unnamed: 0,Variable,Coef,Intercept,R2,MAE,MSE,RMSE
0,"MA(1,9)",-0.006507,0.014977,0.079938,0.03309,0.001988,0.044586
1,"MA(1,12)",-0.016159,0.020468,0.08666,0.032972,0.001964,0.044315
2,"MA(2,9)",-0.008058,0.01548,0.112626,0.034302,0.002086,0.045669
3,"MA(2,12)",-0.020813,0.020773,0.113394,0.033403,0.001981,0.044504
4,"MA(3,9)",-0.022828,0.019723,0.109648,0.034211,0.002033,0.045085
5,"MA(3,12)",-0.023328,0.020944,0.154533,0.03418,0.00208,0.045603
6,MOM(9),-0.016882,0.024376,0.077311,0.032872,0.001945,0.044098
7,MOM(12),-0.014455,0.025411,0.071408,0.03273,0.00195,0.044161
8,"VOL(1,9)",-0.001469,0.01371,0.029859,0.032759,0.00192,0.043821
9,"VOL(1,12)",0.001415,0.019103,0.041229,0.033203,0.001975,0.044446
