# **Return Predictability with Agnostic Fundamental Analysis**
### *Michele Orlandi*
### *MGT6078 Fall 2022*

# 1. **Setup**

In [1]:
import os
import sys
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import gc
import warnings

In [2]:
# Dropbox share link to the input CSV (student_data.csv); `dl=1` asks Dropbox
# to serve the raw file instead of the preview page so it can be read directly
url = 'https://www.dropbox.com/s/iasfmrfdzafjkyq/student_data.csv?dl=1'

In [3]:
# silence every warning category for the rest of the session
warnings.simplefilter('ignore')
# make sure automatic garbage collection is switched on
gc.enable()

In [4]:
# read data from Dropbox and remove extra identifiers
msf = pd.read_csv(url).drop(labels=['PERMNO', 'gvkey', 'COMNAM', 'TICKER', 'SICCD'], axis=1)
# discard the first remaining column by position
# (presumably the row index saved with the CSV -- TODO confirm)
msf = msf.loc[:, msf.columns[1:]].copy()
gc.collect()
# parse the YYYYMMDD stamps into datetime objects
msf['date'] = pd.to_datetime(msf['date'], format='%Y%m%d')
msf['nextmonth'] = pd.to_datetime(msf['nextmonth'], format='%Y%m%d')
# Market Value of each firm = price * shares outstanding.
# NOTE(review): assumes PRC follows the CRSP convention, where a negative price
# flags a bid/ask midpoint rather than a trade. abs() keeps the market value
# (and the denominator of the mispricing signals) non-negative and is a no-op
# when every price is already positive.
msf['mktval'] = msf['PRC'].abs() * msf['SHROUT']

In [5]:
# store prediction name
# The target is referenced by label rather than as "the last column": later
# cells append *_fvp / *_sig columns to msf, so a positional lookup would pick
# up a different target (and extra predictors) if this cell were re-run.
prediction_name = 'mktval'
# store predictor variable names: every column from position 7 (the columns
# before it are date, CUSIP, PRC, SHROUT, RET, nextmonth, next_Ret) up to,
# but excluding, the target column
predictor_names = msf.columns[7:msf.columns.get_loc(prediction_name)].tolist()

In [6]:
# preview the first rows of the prepared data frame
msf.head()

Unnamed: 0,date,CUSIP,PRC,SHROUT,RET,nextmonth,next_Ret,atq,dvpq,seqq,...,doq_MA4,nopiq_MA4,ibq_MA4,txtq_MA4,niq_MA4,cheq_MA4,saleq_MA4,dvy_MA4,piq_MA4,mktval
0,1987-03-31,36110,33.5,9099.0,0.107438,1987-04-30,-0.11194,228.106,0.0,130.427,...,0.0,0.44225,3.5615,2.8575,3.5615,3.78525,70.677,2.58625,6.419,304816.5
1,1987-03-31,10304310,38.25,9158.0,0.145522,1987-04-30,-0.04902,163.403,0.0,112.86,...,0.0,0.665,5.48775,5.37,5.48775,26.734,48.0415,2.54975,11.11825,350293.5
2,1987-03-31,89051610,61.25,4418.0,-0.020349,1987-04-30,-0.089796,79.886,0.0,64.583,...,0.0,0.379,3.2005,3.22475,3.2005,25.7955,27.85075,1.066,6.42525,270602.5
3,1987-03-31,89109210,31.0,6633.0,-0.03861,1987-04-30,-0.032258,261.411,0.296,74.113,...,0.0,-0.10925,4.20075,2.679,4.20075,7.613,112.99875,2.643,6.87975,205623.0
4,1987-03-31,89190610,26.0,15916.0,-0.223881,1987-04-30,-0.028846,27.39,0.0,24.649,...,0.0,0.0665,1.3095,1.29375,1.3095,4.8605,9.08825,0.0,2.60325,413816.0


# 2. **Model 1 - OLS**

## 2.1 **Helper Functions**

In [7]:
def scale_data(df: pd.DataFrame):
    """Standardise the columns of ``df`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``df`` itself and then discarded; only the
    transformed values returned by ``fit_transform`` are handed back.
    """
    return StandardScaler().fit_transform(df)

In [8]:
def get_regression(df: pd.DataFrame, predictors: list, prediction: str):
    """Return the in-sample fitted values of an OLS regression.

    Parameters
    ----------
    df : frame holding both the explanatory columns and the target column.
    predictors : column labels used as explanatory variables (standardised
        with ``scale_data`` before fitting).
    prediction : label of the target column.

    Returns
    -------
    The model's predictions for the same rows it was fitted on.
    """
    features = scale_data(df.loc[:, predictors])
    target = df.loc[:, prediction]
    model = LinearRegression().fit(features, target)
    return model.predict(features)

## 2.2 **Mispricing Signal**
The mispricing signal is calculated as: $$ M_{j,t} = {{FairValuePrediction_{j,t} - MarketValue_{j,t}} \over MarketValue_{j,t}} $$
Where:
- $FairValuePrediction_{j,t}$ is firm $j$'s market value in month $t$ as predicted by a simple OLS regression fitted separately on each month's data
- $MarketValue_{j,t}$ is firm $j$'s market value in month $t$, calculated as $PRC \times SHROUT$
- $M_{j,t}$ is the mispricing signal for firm $j$ on month $t$


In [9]:
# get monthly fair value for each firm through simple OLS
linear_fair_value = pd.DataFrame(msf.groupby('date').apply(
    lambda x: pd.Series(
        get_regression(x, predictor_names, prediction_name),
        name='linear_fvp'
    ))).reset_index()

In [10]:
# add results to main data frame
msf['linear_fvp'] = linear_fair_value['linear_fvp']
# calculate mispricing signal
msf['linear_sig'] = (msf['linear_fvp'] - msf['mktval']) / msf['mktval']

# 3. **Model 2 - OLS Post-LASSO**

## 3.1 **Helper Functions**

In [11]:
# run LASSO Regression to extract relevant features
def feature_selection(df: pd.DataFrame, predictors: list, prediction: str, positive_only: bool = True):
    """Select explanatory variables with a cross-validated LASSO.

    Parameters
    ----------
    df : frame holding both the explanatory columns and the target column.
    predictors : candidate explanatory column labels.
    prediction : label of the target column.
    positive_only : when True (default, the rule described in this notebook)
        keep only features whose LASSO coefficient is strictly positive; when
        False keep every feature with a non-zero coefficient.
        NOTE(review): the usual LASSO selection rule is "non-zero"; confirm
        that discarding negatively signed features is intended.

    Returns
    -------
    list with the labels of the selected features.

    Raises
    ------
    ValueError
        If the LASSO keeps no feature at all, since a follow-up regression
        would have nothing to fit on.
    """
    X = df.loc[:, predictors]
    y = df.loc[:, prediction]
    x_scaled = scale_data(X)

    # The `normalize` argument was removed from scikit-learn's linear models in
    # version 1.2 and raises a TypeError there. The predictors are already
    # standardised above, so no further normalisation is requested.
    lasso_cv = LassoCV(cv=10, max_iter=10000)
    # fit Cross Validation to get optimal alpha
    lasso_cv.fit(x_scaled, y)
    # fit Lasso with optimal penalty
    lasso = Lasso(alpha=lasso_cv.alpha_, max_iter=10000)
    lasso.fit(x_scaled, y)

    coefficients = lasso.coef_
    keep = coefficients > 0 if positive_only else coefficients != 0
    selected_features = list(np.array(predictors)[keep])
    if not selected_features:
        raise ValueError('LASSO feature selection kept no predictors')

    return selected_features

In [12]:
# run OLS post-LASSO feature selection
def ols_post_lasso(df: pd.DataFrame, predictors: list, prediction: str):
    """Run OLS on the subset of ``predictors`` chosen by ``feature_selection``.

    Returns the fitted values produced by ``get_regression`` when it is given
    only the selected features.
    """
    chosen = feature_selection(df, predictors, prediction)
    return get_regression(df, chosen, prediction)

## 3.2 **Mispricing Signal**
The mispricing signal is calculated as before, except that the relevant explanatory features are first selected with the Least Absolute Shrinkage and Selection Operator (LASSO).<br>

- First, fit a cross-validated LASSO to each month's data to find the optimal $\ell_1$ penalty $\alpha$
- Then fit the data to the LASSO Regressor with the optimal $\alpha$
- Select the most relevant coefficients: $\beta_i > 0$
- Run OLS with only selected explanatory variables to predict fair values

In [13]:
# calculate predicted fair value with post-LASSO regression
lasso_fair_value = pd.DataFrame(msf.groupby('date').apply(
    lambda x: pd.Series(
        ols_post_lasso(x, predictor_names, prediction_name),
        name='post_lasso_fvp'
    ))).reset_index()

In [14]:
# calculate OLS post-LASSO mispricing signal
msf['lasso_fvp'] = lasso_fair_value['post_lasso_fvp']
msf['lasso_sig'] = (msf['lasso_fvp'] - msf['mktval']) / msf['mktval']

# 4. **Model 3 - Random Forest Regressor**

## 4.1 **Helper Functions**

In [15]:
def get_random_forest(df: pd.DataFrame, predictors: list, prediction: str):
    """Return the in-sample predictions of a random-forest regression.

    The ``predictors`` columns are standardised with ``scale_data`` and a
    ``RandomForestRegressor`` with fixed hyper-parameters and a fixed seed is
    fitted on them against the ``prediction`` column; the predictions for
    those same rows are returned.
    """
    features = scale_data(df.loc[:, predictors])
    target = df.loc[:, prediction]

    forest_settings = {
        'n_estimators': 1000,
        'random_state': 42,
        'min_samples_leaf': 20,
        'max_depth': 100,
        'n_jobs': -1,
    }
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(features, target)
    return forest.predict(features)

## 4.2 **Mispricing Signal**

In [16]:
# calculate predicted fair value with Random Forest regression
rf_fair_value = pd.DataFrame(msf.groupby('date').apply(
    lambda x: pd.Series(
        get_random_forest(x, predictor_names, prediction_name),
        name='rf_fvp'
    ))).reset_index()

In [17]:
# calculate mispricing signal
msf['rf_fvp'] = rf_fair_value['rf_fvp']
msf['rf_sig'] = (msf['rf_fvp'] - msf['mktval']) / msf['mktval']

# 5. **Portfolio Construction**

In [18]:
# select only relevant variables
portfolio_labs = ['date', 'CUSIP', 'mktval'] + [col for col in msf.columns.tolist() if ('_fvp' in col) or ('_sig' in col)]
signal_data = msf.loc[:, portfolio_labs].copy()

In [19]:
# preview the fair-value predictions and mispricing signals of the three models
signal_data.head()

Unnamed: 0,date,CUSIP,mktval,linear_fvp,linear_sig,lasso_fvp,lasso_sig,rf_fvp,rf_sig
0,1987-03-31,36110,304816.5,456880.662951,0.498871,447478.401169,0.468026,291902.902475,-0.042365
1,1987-03-31,10304310,350293.5,408503.01526,0.166174,472120.883363,0.347787,448585.047065,0.280598
2,1987-03-31,89051610,270602.5,275588.243491,0.018425,357723.212734,0.321951,276894.497616,0.023252
3,1987-03-31,89109210,205623.0,330098.489303,0.605358,374531.419583,0.821447,284665.347578,0.384404
4,1987-03-31,89190610,413816.0,146818.190213,-0.645209,256074.151786,-0.381188,161779.517648,-0.609054
