# Repeat from 4b

In [8]:
import os
from urllib.parse import quote_plus

import pandas as pd
import pymssql
from sqlalchemy import create_engine

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer

In [9]:
# Connection settings for the SQL Server database. Every value can be
# overridden through an environment variable so credentials do not have to be
# edited into (or committed with) the notebook. The fallbacks are the values
# this notebook originally hard-coded, so it still runs unchanged when the
# variables are not set.
# NOTE(review): the fallback password is still stored in the notebook; remove
# the default once every user sets GHZ_DB_PASSWORD.
server = os.environ.get("GHZ_DB_SERVER", "mssql-82792-0.cloudclusters.net:16272")
username = os.environ.get("GHZ_DB_USER", "user")
password = os.environ.get("GHZ_DB_PASSWORD", "RiceOwls1912")
database = os.environ.get("GHZ_DB_NAME", "ghz")

# Percent-encode the credentials: characters such as "@", ":" or "/" in a user
# name or password would otherwise be read as URL delimiters.
string = (
    "mssql+pymssql://"
    + quote_plus(username) + ":" + quote_plus(password)
    + "@" + server + "/" + database
)

# Open connection, used by the query cell that follows.
conn = create_engine(string).connect()

In [10]:
# Model inputs. The query also selects the row keys (date, ticker), the raw
# SIC code (siccd) and the target (ret).
features = ["bm", "mom12m", "roeq"]

# Read the panel, discard every row that has a missing value in any selected
# column, and key the frame by (date, ticker).
df = (
    pd.read_sql(
        """
    select date, ticker, bm, mom12m, roeq, siccd, ret
    from data
    order by date, ticker
    """,
        conn,
    )
    .dropna()
    .set_index(["date", "ticker"])
)

In [11]:
# Keep the untransformed return: "ret" itself is overwritten below.
df["actual"] = df.ret

# Map values to a normal distribution. random_state is pinned because
# QuantileTransformer draws a random subsample when it is fitted on more rows
# than its `subsample` limit; without a seed, a large cross-section would give
# slightly different results on every run. Groups below that limit are
# unaffected by the seed.
qt = QuantileTransformer(output_distribution="normal", random_state=0)

# Columns rewritten in place: the current features plus the target.
scaled_cols = features + ["ret"]


def _normal_scores(d):
  """Refit `qt` on one date's rows and return them transformed.

  The result keeps the column labels and index of `d` so it can be assigned
  straight back into the full frame.
  """
  return pd.DataFrame(qt.fit_transform(d), columns=d.columns, index=d.index)


# One independent fit per date, so each value is ranked only against the other
# rows that share its date.
grouped = df.groupby("date", group_keys=False)
df[scaled_cols] = grouped[scaled_cols].apply(_normal_scores)

In [12]:
# Industry definitions, indexed by industry name. The "start" and "end"
# columns of this table are what industry() compares SIC codes against.
inds = pd.read_csv("siccodes12.csv", index_col="industry")

# Distinct industry names, in order of first appearance in the file.
ind_names = inds.index.drop_duplicates().tolist()

def industry(sic):
  """Return the name of the first industry whose SIC range contains `sic`.

  A row of the module-level `inds` table matches when
  `start <= sic <= end` (both bounds inclusive). If no row matches, the
  catch-all label "Other" is returned.

  Only the "no matching range" case falls back to "Other". Any other failure,
  such as a SIC code whose type cannot be compared with the range bounds, is
  raised rather than silently turned into "Other".
  """
  in_range = (inds.start <= sic) & (sic <= inds.end)
  matches = inds.index[in_range.to_numpy()]
  return matches[0] if len(matches) else "Other"
    
# Classify each distinct SIC code once instead of once per row.
unique_codes = df.siccd.unique()
codes = pd.DataFrame(
  {
    "siccd": unique_codes,
    "industry": [industry(code) for code in unique_codes],
  }
)

# Attach the industry label to every row. Any "industry" column left over from
# an earlier run of this cell is dropped first; otherwise the merge would
# produce "industry_x" / "industry_y" instead of a single "industry" column.
df = (
  df.drop(columns="industry", errors="ignore")
    .reset_index()
    .merge(codes, on="siccd")
    .set_index(["date", "ticker"])
)

# Guarded so that re-running the cell does not list the feature twice.
if "industry" not in features:
  features.append("industry")

In [13]:
# Final estimator. No intercept is fitted here; PolynomialFeatures with its
# default settings already emits a constant column.
model = LinearRegression(fit_intercept=False)

# Step 1: one-hot encode the industry label and pass every other column
# through unchanged.
transform1 = make_column_transformer(
    (OneHotEncoder(), ["industry"]), remainder="passthrough"
)

# Step 2: expand the encoded matrix with all degree-2 terms.
transform2 = PolynomialFeatures(degree=2)

# Encode -> polynomial expansion -> linear regression.
pipe = make_pipeline(transform1, transform2, model)

# Looping: refit on an expanding training window and predict the next period

In [14]:
# Window boundaries. Each consecutive pair (train_date, end_date) defines one
# step: train on every row dated before train_date, then predict the rows
# dated in [train_date, end_date). "3000-01" is presumably a far-future
# sentinel so the last window runs to the end of the data.
# NOTE(review): the comparisons below assume the "date" index level orders
# correctly against these strings - confirm its dtype.
dates = ["2005-01", "2010-01", "2015-01", "2020-01", "3000-01"]

# The date level of the index does not change between iterations, so it is
# looked up once.
row_dates = df.index.get_level_values("date")

# One Series of predictions per window, concatenated once after the loop
# rather than re-copied on every iteration.
window_predictions = []

for train_date, end_date in zip(dates[:-1], dates[1:]):

  fltr1 = row_dates < train_date
  fltr2 = row_dates < end_date
  train = df[fltr1]
  test = df[~fltr1 & fltr2]

  Xtrain = train[features]
  ytrain = train["ret"]
  Xtest = test[features]
  ytest = test["ret"]

  # Refit the whole pipeline from scratch on the current training window.
  pipe.fit(Xtrain, ytrain)
  print(f"Train set score: {pipe.score(Xtrain, ytrain)}")

  pred = pd.Series(pipe.predict(Xtest), index=test.index)
  window_predictions.append(pred)
  print(f"Test set score: {pipe.score(Xtest, ytest)}")

# Rows that never fell inside a test window (everything before the first
# boundary) get NaN through index alignment.
predictions = pd.concat(window_predictions) if window_predictions else None
df["predict"] = predictions

Train set score: 0.009574115159901586
Test set score: 0.0013746247784026666
Train set score: 0.0071837171081828854
Test set score: 0.004316121411689466
Train set score: 0.006708201060152463
Test set score: 0.006813239389918402
Train set score: 0.006925977434913633
Test set score: 0.0012359345409036493
