In [89]:
from functools import reduce
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from ta import add_all_ta_features
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.mode.chained_assignment = None
np.seterr(divide="ignore", invalid="ignore")
import warnings
warnings.filterwarnings('ignore');

In [7]:
df = pd.read_parquet("../data/compustat_ibes_raw_alpha.1.parquet")

## Mooomentums

In [163]:
def extreme(df):
    """Retain the 20% values that are the smallest and the 20% that are the largest."""
    top = df.y.quantile(0.8)
    low = df.y.quantile(0.2)
    return df[(df.y < low) | (df.y > top)]

In [241]:
def mooom(df):
    """
    Mooom is a strategy that heavily relies on momentum.
    It enriches dataframe with simple momentum and industrial momentum factors. 
    Returns will be substracted by a cross-sectional median.
    """
    df_mom = df[["classification",
                 "mcap",
                 "act",
                 "ceq",
                 "cogs",
                 "lct",
                 "nicon",
                 "revt",
                 "xrd",
                 "prccd",
                 "close"]]
    df_mom["profitability_roe"]=df_mom.nicon / df_mom.ceq
    df_mom["intangible_rdm"] = df_mom.xrd / df_mom.mcap
    df_mom["liquid_cur"] = df_mom.act / df_mom.lct
    df_mom["market_pe"] = df_mom.mcap / (df_mom.nicon * 10 ** 6)
    df_mom["market_mb"] = df_mom.mcap / (df_mom.ceq * 10 ** 6)
    df_mom["52wk_high"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x) - np.log(x.rolling(12, min_periods=1).max()))
    df_mom["lag_mom"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x.shift(12))-np.log(x.shift(2)))
    df_mom["mom_11m"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x) - np.log(x.shift(11)))
    df_mom["mom_6m"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x) - np.log(x.shift(6)))
    df_mom["mom_1m"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x) - np.log(x.shift(1)))
    df_mom["ind_mom_11m"] = df_mom.groupby(["date", "classification"]).mom_11m.transform("mean")
    df_mom["ind_mom_6m"] = df_mom.groupby(["date", "classification"]).mom_6m.transform("mean")
    df_mom["ind_mom_1m"] = df_mom.groupby(["date", "classification"]).mom_1m.transform("mean")
    df_mom["log_return"] = df_mom.groupby("gvkey").close.transform(lambda x: np.log(x.shift(-3)) - np.log(x))
    df_mom["y"] = df_mom.groupby("date").log_return.transform(lambda x: x - x.median())
    return df_mom.loc[df.prccd > 5, ["mcap",
                                     "profitability_roe",
                                     "intangible_rdm",
                                     "liquid_cur",
                                     "market_mb",
                                     "market_pe",
                                     "52wk_high",
                                     "lag_mom",
                                     "mom_11m",
                                     "mom_6m",
                                     "mom_1m",
                                     "ind_mom_11m",
                                     "ind_mom_6m",
                                     "ind_mom_1m",
                                     "y"]].dropna()

In [243]:
df_mom = mooom(df)
df_train = (df_mom.xs(slice("2002-01-01", "2012-01-01"), level="date", drop_level=False)
            # .groupby("date")
            # .apply(extreme)
            )
df_test = df_mom.xs(slice("2012-01-01", "2016-01-01"), level="date", drop_level=False)
X_train = df_train.drop(["y"], axis=1).to_numpy()
y_train = df_train["y"].to_numpy()
X_test = df_test.drop(["y"], axis=1).to_numpy()
y_test = df_test["y"].to_numpy()

xgb_clf = XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1)
xgb_fit = xgb_clf.fit(X_train, np.sign(y_train))
print(xgb_clf.score(X_train, np.sign(y_train)))
print(xgb_clf.score(X_test, np.sign(y_test)))

0.5921033255082683
0.5594081323546546


In [154]:
# backtest
# every stock invest 100
# every day invest top 10 predicted best performing stocks
# collect return 7 days after
# y_position = y_pred.grouby("date").head(10)
# y_return = y_position * y_test
