In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymssql
from joblib import dump
from pandas_datareader import DataReader as pdr
from sklearn.ensemble import RandomForestRegressor
from sqlalchemy import create_engine

### Get data

In [2]:

# Connection settings for the "ghz" SQL Server database.
server = "mssql-82792-0.cloudclusters.net:16272"
username = "user"
database = "ghz"

# Do not store the password in the notebook: read it from the
# GHZ_DB_PASSWORD environment variable and fall back to an interactive
# prompt when the variable is unset or empty.
password = os.environ.get("GHZ_DB_PASSWORD") or getpass("Password for the ghz database: ")

# URL-escape the credentials so that characters such as "@", ":" or "/"
# in the password cannot corrupt the connection URL.
string = (
    "mssql+pymssql://"
    + quote_plus(username)
    + ":"
    + quote_plus(password)
    + "@"
    + server
    + "/"
    + database
)

# Open a connection that the following cells use for their queries.
conn = create_engine(string).connect()

In [3]:

# Query the return column together with the thirteen firm characteristics
# and keep only the rows that have no missing values.
data = (
    pd.read_sql(
        sql="""
    select ticker, date, ret, acc, agr, beta, bm, ep, gma, 
    idiovol, lev, mom12m, mom1m, mve, operprof, roeq
    from data
    """,
        con=conn,
    )
    .dropna()
    .copy()
)

### Get market returns

Get market returns from Kenneth French's data library. Redefine `ret` as the return in excess of the market return, where the market return is the sum of the `Mkt-RF` and `RF` columns.

In [4]:
# Download the Fama-French research factors starting in 2000. Element 0 of
# the result is used and divided by 100.
# NOTE(review): element 0 is presumably the monthly table and the division
# presumably converts percentages to decimals -- confirm against the
# pandas-datareader / Kenneth French documentation.
factors = pdr("F-F_Research_Data_Factors", "famafrench", start=2000)[0] / 100
factors = factors.reset_index()
# Convert the date column to strings so it can be matched against data["date"].
factors["Date"] = factors["Date"].astype(str)

# Inner join: rows of `data` whose date has no matching factor row are dropped.
data = data.merge(factors, left_on="date", right_on="Date", how="inner")
if data.empty:
    # A format mismatch between data["date"] and factors["Date"] would
    # otherwise silently produce an empty training set.
    raise ValueError(
        "No rows left after merging with the Fama-French factors; "
        "check that data['date'] uses the same format as factors['Date']."
    )

# Market return = (Mkt-RF) + RF. Subtract it from the stock return so that
# `ret` becomes the return in excess of the market. The previous expression
# (ret + RF - Mkt-RF) added RF instead of subtracting it.
# NOTE(review): assumes `ret` in the database is a raw (not excess) return --
# TODO confirm.
data["ret"] = data["ret"] - (data["Mkt-RF"] + data["RF"])

### Define X and y

In [5]:
# Comma-separated names of the characteristics used as predictors.
string = "acc, agr, beta, bm, ep, gma, idiovol, lev, mom12m, mom1m, mve, operprof, roeq"
features = [name.strip() for name in string.split(",")]

# Predictor matrix (one column per characteristic) and the target column.
X = data.loc[:, features]
y = data.loc[:, "ret"]

## Fit and save models

Fit and save three random forest models with maximum tree depths of 4, 5, and 6.

In [6]:

# Random forest with max_depth=4 and a fixed random_state, fitted on X and y
# and written to disk with joblib.
model = RandomForestRegressor(max_depth=4, random_state=0).fit(X, y)
dump(model, "model4.joblib")

['model4.joblib']

In [7]:
# Random forest with max_depth=5 and a fixed random_state, fitted on X and y
# and written to disk with joblib.
model = RandomForestRegressor(max_depth=5, random_state=0).fit(X, y)
dump(model, "model5.joblib")

['model5.joblib']

In [8]:
# Random forest with max_depth=6 and a fixed random_state, fitted on X and y
# and written to disk with joblib.
model = RandomForestRegressor(max_depth=6, random_state=0).fit(X, y)
dump(model, "model6.joblib")

['model6.joblib']