In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import pymssql
from sqlalchemy import create_engine

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer

# Get data

In [2]:
# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to be edited into (or committed with)
# the notebook; the literals are only fallbacks that keep Restart & Run All
# working when nothing is set.
# NOTE(review): the fallback password is still stored here in plain text —
# rotate it and drop the default if this database is not meant to be public.
server = os.environ.get("GHZ_DB_SERVER", "mssql-82792-0.cloudclusters.net:16272")
username = os.environ.get("GHZ_DB_USER", "user")
password = os.environ.get("GHZ_DB_PASSWORD", "RiceOwls1912")
database = os.environ.get("GHZ_DB_NAME", "ghz")

# Percent-encode the credentials so characters such as "@", ":" or "/" in an
# overridden value cannot corrupt the connection URL.
string = (
    "mssql+pymssql://"
    + quote_plus(username) + ":" + quote_plus(password)
    + "@" + server + "/" + database
)

# Open a single connection that the following cells reuse for their queries.
conn = create_engine(string).connect()

In [3]:
# Pull the three characteristics, the SIC code and the return for every
# (date, ticker) row, discard rows with a missing value in any column, and
# key the frame by (date, ticker).
df = (
    pd.read_sql(
        sql="""
    select date, ticker, bm, mom12m, roeq, siccd, ret
    from data
    order by date, ticker
    """,
        con=conn,
    )
    .dropna()
    .set_index(["date", "ticker"])
)

# Names of the predictor columns.
features = ["bm", "mom12m", "roeq"]

# Quantile transform

In [4]:
# Keep the untransformed return in `actual` before `ret` is overwritten below.
# Guarded so that re-running this cell cannot replace the raw returns with
# values that have already been transformed.
if "actual" not in df.columns:
    df["actual"] = df["ret"]

# Maps each column to a standard-normal shape through its empirical quantiles.
# The seed is pinned because the transformer may estimate quantiles from a
# random subsample of large inputs (see its `subsample` / `random_state`
# parameters), which would otherwise make results vary between runs.
qt = QuantileTransformer(output_distribution="normal", random_state=0)


def _normal_scores(d):
    """Quantile-transform every column of one date's cross-section.

    The shared transformer `qt` is refit on `d` each time, so the mapping is
    estimated separately for every group. The result keeps the row index and
    column names of `d` so it aligns when assigned back into `df`.
    """
    return pd.DataFrame(qt.fit_transform(d), columns=d.columns, index=d.index)


# Transform the features and the return date by date; group_keys=False keeps
# the original (date, ticker) index on the result.
grouped = df.groupby("date", group_keys=False)
df[features + ["ret"]] = grouped[features + ["ret"]].apply(_normal_scores)

## Add industry variable to dataframe


In [5]:
# Industry range table read from a file in the working directory, indexed by
# its `industry` column.
inds = pd.read_csv("siccodes12.csv", index_col="industry")

# Distinct industry names, in order of first appearance in the table.
ind_names = inds.index.unique().tolist()

def industry(sic, table=None):
    """Return the industry name whose SIC range contains ``sic``.

    Parameters
    ----------
    sic : number
        SIC code to classify.
    table : pandas.DataFrame, optional
        Range table indexed by industry name with inclusive ``start`` and
        ``end`` columns. Defaults to the notebook-level ``inds`` table.

    Returns
    -------
    The index label of the first row with ``start <= sic <= end``, or
    ``"Other"`` when no row matches.
    """
    if table is None:
        table = inds
    in_range = ((table["start"] <= sic) & (sic <= table["end"])).to_numpy()
    matches = table.index[in_range]
    # An explicit emptiness check replaces the former bare ``except``, which
    # also hid unrelated failures (e.g. a non-numeric code raising TypeError)
    # by silently labelling them "Other".
    return matches[0] if len(matches) else "Other"
    
# Classify each distinct SIC code once instead of once per row.
_sic_values = df["siccd"].unique()
codes = pd.DataFrame(
    {
        "siccd": _sic_values,
        "industry": [industry(code) for code in _sic_values],
    }
)

# Attach the label to every row.
# - Any `industry` column left by an earlier run of this cell is dropped first,
#   so re-running cannot create `industry_x` / `industry_y` duplicates.
# - how="left" keeps every row of `df` in its existing order; `codes` holds one
#   row per distinct SIC code, which validate="many_to_one" enforces.
df = (
    df.reset_index()
    .drop(columns="industry", errors="ignore")
    .merge(codes, on="siccd", how="left", validate="many_to_one")
    .set_index(["date", "ticker"])
)

# Register the new predictor once, even if this cell is executed again.
if "industry" not in features:
    features.append("industry")

# Make pipeline

In [6]:
# Step 1: one-hot encode the `industry` column; all remaining columns are
# passed through to the next step unchanged.
transform1 = make_column_transformer(
    (OneHotEncoder(), ["industry"]), remainder="passthrough"
)

# Step 2: degree-2 polynomial expansion of the step-1 output (all other
# PolynomialFeatures settings are left at their defaults).
transform2 = PolynomialFeatures(2)

# Step 3: linear regression fitted without a separate intercept term.
model = LinearRegression(fit_intercept=False)

# Chain the three steps in order into a single estimator.
pipe = make_pipeline(transform1, transform2, model)