# **Return Predictability with Agnostic Fundamental Analysis**
### *Michele Orlandi*
### *MGT6078 Fall 2022*

# 1. **Setup**

In [14]:
import os
import sys
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import gc
import warnings

In [2]:
# Dropbox location of the CSV data set that is loaded into `msf` further down
url = 'https://www.dropbox.com/s/iasfmrfdzafjkyq/student_data.csv?dl=1'

In [3]:
# make sure automatic garbage collection is switched on
gc.enable()
# NOTE(review): no category or module is given, so this silences every warning
# for the rest of the notebook, including any raised by the modelling libraries
warnings.filterwarnings('ignore')

In [6]:
# Load the CSV from Dropbox, discard the identifier columns that are not used
# below, and skip the first of the remaining columns.
msf = (
    pd.read_csv(url)
    .drop(columns=['PERMNO', 'gvkey', 'COMNAM', 'TICKER', 'SICCD'])
    .iloc[:, 1:]
    .copy()
)
gc.collect()

# Both date columns are stored as YYYYMMDD values; turn them into datetimes.
for date_column in ('date', 'nextmonth'):
    msf[date_column] = pd.to_datetime(msf[date_column], format='%Y%m%d')

# Market value of each firm-month: price times shares outstanding.
msf['mktval'] = msf['PRC'].mul(msf['SHROUT'])

In [13]:
# store prediction name: the market value computed when the data was loaded.
# It is referenced by name rather than as "the last column" so that re-running
# this cell after result columns have been appended to msf still picks the
# same target.
prediction_name = 'mktval'
# store predictor variable names: every column from position 7 up to (but not
# including) the target. On a freshly loaded frame 'mktval' is the last
# column, so this is the same selection as columns[7:-1]; columns appended
# after 'mktval' later in the notebook are never picked up as predictors.
predictor_names = msf.columns[7:msf.columns.get_loc(prediction_name)].tolist()

In [44]:
# preview the first rows of the working frame
# NOTE(review): the saved output of this cell lists `linear_fvp` and
# `linear_mispricing`, which no cell above this one creates - the output comes
# from an out-of-order run and should be regenerated from a fresh kernel.
msf.head()

Unnamed: 0,date,CUSIP,PRC,SHROUT,RET,nextmonth,next_Ret,atq,dvpq,seqq,...,ibq_MA4,txtq_MA4,niq_MA4,cheq_MA4,saleq_MA4,dvy_MA4,piq_MA4,mktval,linear_fvp,linear_mispricing
0,1987-03-31,36110,33.5,9099.0,0.107438,1987-04-30,-0.11194,228.106,0.0,130.427,...,3.5615,2.8575,3.5615,3.78525,70.677,2.58625,6.419,304816.5,174955700.0,572.970606
1,1987-03-31,10304310,38.25,9158.0,0.145522,1987-04-30,-0.04902,163.403,0.0,112.86,...,5.48775,5.37,5.48775,26.734,48.0415,2.54975,11.11825,350293.5,129133700.0,367.644326
2,1987-03-31,89051610,61.25,4418.0,-0.020349,1987-04-30,-0.089796,79.886,0.0,64.583,...,3.2005,3.22475,3.2005,25.7955,27.85075,1.066,6.42525,270602.5,76984970.0,283.494683
3,1987-03-31,89109210,31.0,6633.0,-0.03861,1987-04-30,-0.032258,261.411,0.296,74.113,...,4.20075,2.679,4.20075,7.613,112.99875,2.643,6.87975,205623.0,-119921200.0,-584.209282
4,1987-03-31,89190610,26.0,15916.0,-0.223881,1987-04-30,-0.028846,27.39,0.0,24.649,...,1.3095,1.29375,1.3095,4.8605,9.08825,0.0,2.60325,413816.0,17737170.0,41.862468


# 2. **Model 1 - OLS**

## 2.1 **Helper Functions**

In [45]:
def scale_data(df: pd.DataFrame):
    """Standardise the columns of ``df`` with a freshly fitted ``StandardScaler``.

    A new scaler is created and fitted on every call, so the scaling
    statistics always come from the data passed in.

    Returns the array produced by ``StandardScaler.fit_transform``.
    """
    return StandardScaler().fit_transform(df)

In [46]:
def get_regression(df: pd.DataFrame, predictors: list, prediction: str):
    """Return the in-sample OLS fitted values of ``prediction`` on ``predictors``.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding both the predictor columns and the target column.
    predictors : list
        Names of the explanatory columns; they are standardised with
        ``scale_data`` before fitting.
    prediction : str
        Name of the target column.

    Returns
    -------
    Array of fitted values, one per row of ``df`` and in the same row order.
    """
    features_scaled = scale_data(df.loc[:, predictors])
    target = df.loc[:, prediction]

    model = LinearRegression()
    model.fit(features_scaled, target)
    # Predict from the same standardised matrix the model was fitted on.
    # Passing the raw columns to coefficients estimated on standardised inputs
    # produces fitted values on the wrong scale.
    return model.predict(features_scaled)

## 2.2 **Mispricing Signal**

The mispricing signal is calculated as: $$ M_{j,t} = {{FairValuePrediction_{j,t} - MarketValue_{j,t}} \over MarketValue_{j,t}} $$
Where:
- $FairValuePrediction_{j,t}$ is firm $j$'s market value in month $t$ as predicted by a simple OLS regression of market value on the predictor variables, fitted separately for each month
- $MarketValue_{j,t}$ is firm $j$'s market value in month $t$, calculated as $PRC \times SHROUT$
- $M_{j,t}$ is the mispricing signal for firm $j$ in month $t$


In [32]:
# Run the simple OLS once per month (one group per `date`) and gather the
# fitted fair values of all months into a single long data frame.
monthly_groups = msf.groupby('date')
linear_fvp_by_month = monthly_groups.apply(
    lambda month: pd.Series(
        get_regression(month, predictor_names, prediction_name),
        name='linear_fvp',
    )
)
linear_fair_value = pd.DataFrame(linear_fvp_by_month).reset_index()

In [None]:
# add results to main data frame
# groupby('date') returns the groups in ascending date order and keeps the
# original row order inside each group, so the predictions are laid out like a
# stable sort of msf by date. Map them back through that ordering instead of
# aligning on index labels, which is only correct when msf is already sorted
# by date and carries a 0..n-1 index. A length mismatch raises instead of
# silently shifting values onto the wrong firms.
rows_by_date = np.argsort(msf['date'].to_numpy(), kind='stable')
linear_fvp = np.empty(len(msf), dtype=float)
linear_fvp[rows_by_date] = linear_fair_value['linear_fvp'].to_numpy()
msf['linear_fvp'] = linear_fvp
# calculate mispricing signal: (fair value - market value) / market value
msf['linear_sig'] = (msf['linear_fvp'] - msf['mktval']) / msf['mktval']

# 3. **Model 2 - OLS Post-LASSO**

The mispricing signal is calculated as before, except that the relevant explanatory features are first selected with a Least Absolute Shrinkage and Selection Operator (LASSO).<br>

- First, fit a cross-validated LASSO to each month's data to find the optimal $\ell_1$ penalty $\alpha$
- Then fit a LASSO regression to the same data using the optimal $\alpha$
- Select the relevant features, i.e. those whose LASSO coefficient is non-zero: $\beta_i \neq 0$
- Run OLS with only selected explanatory variables to predict fair values

## 3.1 **Helper Functions**

In [77]:
# run LASSO Regression to extract relevant features
def feature_selection(df: pd.DataFrame, predictors: list, prediction):
    """Select the predictors that a cross-validated LASSO keeps in the model.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding both the predictor columns and the target column.
    predictors : list
        Names of the candidate explanatory columns.
    prediction
        Name of the target column.

    Returns
    -------
    list
        The entries of ``predictors`` whose LASSO coefficient is non-zero, in
        their original order. The list can be empty.
    """
    X = df.loc[:, predictors]
    y = df.loc[:, prediction]
    x_scaled = scale_data(X)

    # The inputs are already standardised by scale_data, so the estimator-level
    # `normalize` option is not needed; it was removed in scikit-learn 1.2 and
    # passing it there raises a TypeError.
    lasso_cv = LassoCV(cv=10, max_iter=10000)
    # Cross-validation picks the penalty (alpha_) and then refits on the full
    # sample with it, so coef_ already belongs to the optimally penalised model
    # and a second, separate Lasso fit is unnecessary.
    lasso_cv.fit(x_scaled, y)
    coefficients = lasso_cv.coef_

    # LASSO discards a feature by shrinking its coefficient to exactly zero.
    # Features with a negative coefficient are still selected, so test for
    # "non-zero" rather than "positive".
    selected_features = list(np.array(predictors)[coefficients != 0])

    return selected_features

In [78]:
# run OLS post-LASSO feature selection
def ols_post_lasso(df: pd.DataFrame, predictors: list, prediction: str):
    """Return OLS fitted values using only the LASSO-selected predictors.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding both the predictor columns and the target column.
    predictors : list
        Names of the candidate explanatory columns passed to the LASSO.
    prediction : str
        Name of the target column.

    Returns
    -------
    Array of fitted values, one per row of ``df`` and in the same row order.
    """
    # use LASSO regression to select relevant features
    relevant_feats = feature_selection(df, predictors, prediction)

    if len(relevant_feats) == 0:
        # The LASSO kept no predictor. An OLS on zero columns cannot be fitted,
        # so fall back to the intercept-only fit, whose fitted value is the
        # sample mean of the target for every row.
        return np.full(len(df), df[prediction].mean(), dtype=float)

    # get predicted values from OLS with selected features
    return get_regression(df, relevant_feats, prediction)

## 3.2 **Mispricing Signal**

In [79]:
# Run the OLS post-LASSO once per month (one group per `date`) and gather the
# fitted fair values of all months into a single long data frame.
months = msf.groupby('date')
post_lasso_fvp_by_month = months.apply(
    lambda month: pd.Series(
        ols_post_lasso(month, predictor_names, prediction_name),
        name='post_lasso_fvp',
    )
)
lasso_fair_value = pd.DataFrame(post_lasso_fvp_by_month).reset_index()

In [81]:
# calculate OLS post-LASSO mispricing signal
# groupby('date') returns the groups in ascending date order and keeps the
# original row order inside each group, so the predictions are laid out like a
# stable sort of msf by date. Map them back through that ordering instead of
# aligning on index labels, which is only correct when msf is already sorted
# by date and carries a 0..n-1 index. A length mismatch raises instead of
# silently shifting values onto the wrong firms.
lasso_rows_by_date = np.argsort(msf['date'].to_numpy(), kind='stable')
lasso_fvp = np.empty(len(msf), dtype=float)
lasso_fvp[lasso_rows_by_date] = lasso_fair_value['post_lasso_fvp'].to_numpy()
msf['lasso_fvp'] = lasso_fvp
# (fair value - market value) / market value
msf['lasso_sig'] = (msf['lasso_fvp'] - msf['mktval']) / msf['mktval']