# **Return Predictability with Agnostic Fundamental Analysis**
### *Michele Orlandi*
### *MGT6078 Fall 2022*

# 1. **Setup**

In [None]:
import os
import sys
sys.path.append(os.getcwd())
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import gc
import warnings

In [None]:
# location of the CSV data set (Dropbox share link) that is passed to pd.read_csv
url = 'https://www.dropbox.com/s/iasfmrfdzafjkyq/student_data.csv?dl=1'

In [None]:
# make sure automatic garbage collection is switched on
gc.enable()
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices that may point at real problems
warnings.filterwarnings('ignore')

In [None]:
# pull the CSV from Dropbox, then discard the identifier columns and the leading column that remains
msf = pd.read_csv(url)
msf = msf.drop(columns=['PERMNO', 'gvkey', 'COMNAM', 'TICKER', 'SICCD'])
msf = msf.iloc[:, 1:].copy()
gc.collect()
# parse the YYYYMMDD date columns and append market value as the new last column
# NOTE(review): if PRC can be negative (e.g. a bid/ask-midpoint convention), mktval
# will be negative for those rows - confirm against the data provider's documentation
msf = msf.assign(
    date=pd.to_datetime(msf['date'], format='%Y%m%d'),
    nextmonth=pd.to_datetime(msf['nextmonth'], format='%Y%m%d'),
    mktval=msf['PRC'].mul(msf['SHROUT']),
)

In [None]:
# columns are addressed by position: everything from the 8th column up to
# (but excluding) the last one is treated as a predictor
predictor_names = list(msf.columns[7:-1])
# the last column of the frame is the regression target
prediction_name = msf.columns[-1]

In [None]:
# preview the first rows of the prepared data frame
msf.head()

# 2. **Model 1 - OLS**

## 2.1 **Helper Functions**

In [None]:
def scale_data(df: pd.DataFrame):
    """Standardize ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: Data to standardize; the scaler is fitted on this same data.

    Returns:
        The result of ``StandardScaler().fit_transform(df)``.
    """
    return StandardScaler().fit_transform(df)

In [None]:
def get_regression(df: pd.DataFrame, predictors: list, prediction: str):
    """Fit an OLS regression on standardized predictors and return its in-sample fitted values.

    Args:
        df: Observations to fit the regression on.
        predictors: Names of the columns of ``df`` used as explanatory variables.
        prediction: Name of the column of ``df`` used as the dependent variable.

    Returns:
        The output of ``LinearRegression.predict`` for the rows of ``df``,
        in the same row order as ``df``.
    """
    X = df.loc[:, predictors]
    y = df.loc[:, prediction]
    x_scaled = scale_data(X)
    reg = LinearRegression()
    reg.fit(x_scaled, y)
    # Predict from the same standardized matrix the model was fitted on:
    # the coefficients are expressed in standardized units, so feeding the
    # raw X here would produce fitted values on the wrong scale.
    y_pred = reg.predict(x_scaled)
    return y_pred

## 2.2 **Mispricing Signal**
The mispricing signal is calculated as: $$ M_{j,t} = {{FairValuePrediction_{j,t} - MarketValue_{j,t}} \over MarketValue_{j,t}} $$
Where:
- $FairValuePrediction_{j,t}$ is defined as firm $j$'s Market Value on month $t$ predicted by a simple OLS
- $MarketValue_{j,t}$ is defined as firm $j$'s Market Value on month $t$ calculated as $PRC \times SHROUT$
- $M_{j,t}$ is the mispricing signal for firm $j$ on month $t$


In [None]:
# fit one OLS per `date` group and collect the fitted values of every group in a single frame
linear_fair_value = (
    msf.groupby('date')
    .apply(lambda month: pd.Series(
        data=get_regression(month, predictor_names, prediction_name),
        name='linear_fvp',
    ))
    .pipe(pd.DataFrame)
    .reset_index()
)

In [None]:
# add results to main data frame
# `linear_fair_value` lists its rows group by group in ascending date order (rows keep
# their original relative order inside each group) under a fresh 0..N-1 index, whereas
# msf keeps file order. Assigning on the index would therefore attach fitted values to
# the wrong rows unless msf happens to be sorted by date already. A stable sort of the
# dates reproduces the grouped row order, so it is used to map each value back to its row.
msf['linear_fvp'] = pd.Series(
    linear_fair_value['linear_fvp'].to_numpy(),
    index=msf['date'].dropna().sort_values(kind='stable').index,
)
# calculate mispricing signal: relative gap between predicted fair value and market value
msf['linear_sig'] = (msf['linear_fvp'] - msf['mktval']) / msf['mktval']

# 3. **Model 2 - OLS Post-LASSO**

## 3.1 **Helper Functions**

In [None]:
# run LASSO Regression to extract relevant features
def feature_selection(df: pd.DataFrame, predictors: list, prediction):
    """Select predictors with a cross-validated LASSO.

    The predictors are standardized, a 10-fold ``LassoCV`` picks the penalty
    ``alpha``, and a ``Lasso`` with that penalty is fitted on the same data.

    Args:
        df: Observations to fit on.
        predictors: Names of the candidate explanatory columns of ``df``.
        prediction: Name of the dependent-variable column of ``df``.

    Returns:
        The names from ``predictors`` whose LASSO coefficient is strictly
        positive, in their original order. The list may be empty.
    """
    X = df.loc[:, predictors]
    y = df.loc[:, prediction]
    x_scaled = scale_data(X)

    # `normalize=True` is deliberately not passed: the argument was removed from
    # scikit-learn's linear models in 1.2 (it raises TypeError there) and the
    # inputs are already standardized by scale_data, so it was redundant.
    # The default automatic alpha grid is used for the cross-validation.
    lasso_cv = LassoCV(cv=10, max_iter=10000)
    # fit Cross Validation to get optimal alpha
    lasso_cv.fit(x_scaled, y)
    # fit Lasso with optimal penalty
    lasso = Lasso(alpha=lasso_cv.alpha_, max_iter=10000)
    lasso.fit(x_scaled, y)
    # select optimal features -> coefficient > 0
    # NOTE(review): predictors with a negative coefficient are dropped as well;
    # LASSO selection conventionally keeps every non-zero coefficient - confirm
    # that the sign restriction is intended.
    selected_features = [
        name for name, coefficient in zip(predictors, lasso.coef_) if coefficient > 0
    ]

    return selected_features

In [None]:
# run OLS post-LASSO feature selection
def ols_post_lasso(df: pd.DataFrame, predictors: list, prediction: str):
    """Run OLS on the predictors retained by ``feature_selection``.

    Args:
        df: Observations to fit on.
        predictors: Names of the candidate explanatory columns of ``df``.
        prediction: Name of the dependent-variable column of ``df``.

    Returns:
        In-sample fitted values, one per row of ``df`` in row order. When the
        selection step keeps no predictor, every fitted value is the mean of
        the dependent variable.
    """
    # use LASSO regression to select relevant features
    relevant_feats = feature_selection(df, predictors, prediction)
    if not relevant_feats:
        # With no regressors left, OLS reduces to the intercept-only model,
        # whose fitted value is the sample mean of the target. Returning it
        # directly avoids fitting a scaler/regression on zero columns, which fails.
        return np.full(len(df), df[prediction].mean(), dtype=float)
    # get predicted values from OLS with selected features
    return get_regression(df, relevant_feats, prediction)

## 3.2 **Mispricing Signal**
The mispricing signal is calculated as before, except that the relevant explanatory features are first selected with a LASSO (Least Absolute Shrinkage and Selection Operator) regression.<br>

- First, fit a cross-validated LASSO to each month's data to find the optimal $\ell_1$ penalty $\alpha$
- Then fit the data to the LASSO Regressor with the optimal $\alpha$
- Select the most relevant coefficients: $\beta_i > 0$
- Run OLS with only selected explanatory variables to predict fair values

In [None]:
# run the post-LASSO OLS once per `date` group and collect the fitted values in a single frame
lasso_fair_value = (
    msf.groupby('date')
    .apply(lambda month: pd.Series(
        data=ols_post_lasso(month, predictor_names, prediction_name),
        name='post_lasso_fvp',
    ))
    .pipe(pd.DataFrame)
    .reset_index()
)

In [None]:
# calculate OLS post-LASSO mispricing signal
# `lasso_fair_value` lists its rows group by group in ascending date order (rows keep
# their original relative order inside each group) under a fresh 0..N-1 index, whereas
# msf keeps file order. Assigning on the index would attach fitted values to the wrong
# rows unless msf is already sorted by date, so the values are mapped back through a
# stable sort of the dates, which reproduces the grouped row order.
msf['lasso_fvp'] = pd.Series(
    lasso_fair_value['post_lasso_fvp'].to_numpy(),
    index=msf['date'].dropna().sort_values(kind='stable').index,
)
msf['lasso_sig'] = (msf['lasso_fvp'] - msf['mktval']) / msf['mktval']

# 4. **Model 3 - Random Forest Regressor**

## 4.1 **Helper Functions**

In [None]:
def get_random_forest(df: pd.DataFrame, predictors: list, prediction: str):
    """Fit a random forest on standardized predictors and return its in-sample predictions.

    Args:
        df: Observations to fit on.
        predictors: Names of the columns of ``df`` used as explanatory variables.
        prediction: Name of the column of ``df`` used as the dependent variable.

    Returns:
        The output of ``RandomForestRegressor.predict`` for the rows of ``df``,
        in the same row order as ``df``.
    """
    X = df.loc[:, predictors]
    y = df.loc[:, prediction]
    x_scaled = scale_data(X)

    # fixed random_state keeps the fitted forest reproducible between runs
    rf_reg = RandomForestRegressor(n_estimators=1000, random_state=42, min_samples_leaf=20)
    rf_reg.fit(x_scaled, y)
    # Predict from the standardized matrix used for fitting: the split thresholds
    # were learned in standardized units, so raw X would be routed down the wrong branches.
    y_pred = rf_reg.predict(x_scaled)
    return y_pred

## 4.2 **Mispricing Signal**

In [None]:
# fit one random forest per `date` group and collect the predictions of every group in a single frame
rf_fair_value = (
    msf.groupby('date')
    .apply(lambda month: pd.Series(
        data=get_random_forest(month, predictor_names, prediction_name),
        name='rf_fvp',
    ))
    .pipe(pd.DataFrame)
    .reset_index()
)

In [None]:
# calculate mispricing signal
# `rf_fair_value` lists its rows group by group in ascending date order (rows keep
# their original relative order inside each group) under a fresh 0..N-1 index, whereas
# msf keeps file order. Assigning on the index would attach predictions to the wrong
# rows unless msf is already sorted by date, so the values are mapped back through a
# stable sort of the dates, which reproduces the grouped row order.
msf['rf_fvp'] = pd.Series(
    rf_fair_value['rf_fvp'].to_numpy(),
    index=msf['date'].dropna().sort_values(kind='stable').index,
)
msf['rf_sig'] = (msf['rf_fvp'] - msf['mktval']) / msf['mktval']