# Simple Model

- We don't have returns after 2022 Jan. 
- We have data until 2022 Sept.

- For model training:
    - Testing period --> 2021 June - 2022 Jan
    - Train period: 2017 June - 2021 June (3 years)
    One stock per line. Aggregate all data into one row

- For Inference:
    - Infer for periods between 2022 Sept. - 2017 Sept.
    - This will give us the returns for 2022 Sept. until 2023 March. 
    - Choose the top 15 stocks



In [1]:
import pandas as pd
import numpy as np
import Preprocessing as pr
import datetime as dt

In [2]:
df = pr.get_data()

  df = pd.read_csv(path, delimiter=";", parse_dates=date_cols).iloc[:, 1:]


In [3]:
df[df.year==2022].groupby("quarter").apply(lambda x: len(x.index.get_level_values(1).unique()))

quarter
1    1407
3    1407
6    1407
9    1407
dtype: int64

In [24]:
df[df.year==2022].close.dropna()

date        symbol  
2022-01-31  ALMIL.PA     22.44000
            2CRSI.PA      4.44300
            ASP.PA        5.68800
            AALB.AS      54.67200
            ABI.BR       55.24200
                          ...    
            3E2.F       123.73600
            NNND.F       55.11400
            SAB.MC        0.75408
            ENG.MC       19.28200
            SGRE.MC      18.37400
Name: close, Length: 1387, dtype: float64

# Model: XGBoost 

In [81]:
# Only keep stocks that are trading in Euros
currency_by_stock = df["reportedCurrency"].groupby("symbol").apply(lambda x:x.dropna()[0] if len(x.dropna()) > 0 else np.nan)
euro_stocks = currency_by_stock[currency_by_stock=="EUR"].index
idx = pd.IndexSlice[:, euro_stocks]
df = df.loc[idx, :]

In [82]:
# Set train and validation dates
train_dates = {"start": dt.datetime(2017, 6, 30), "end": dt.datetime(2021, 6, 30)}
val_dates = {"start": dt.datetime(2021, 6, 30), "end": dt.datetime(2022, 1, 31)}

X_train = df.loc[train_dates["start"]: train_dates["end"]]
X_train = X_train.unstack(level=0).drop("year", axis=1)
X_train



Unnamed: 0_level_0,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,...,return,return,return,return,return,return,return,return,return,return
date,2017-06-30,2017-09-30,2018-01-31,2018-03-31,2018-06-30,2018-09-30,2019-01-31,2019-03-31,2019-06-30,2019-09-30,...,2019-03-31,2019-06-30,2019-09-30,2020-01-31,2020-03-31,2020-06-30,2020-09-30,2021-01-31,2021-03-31,2021-06-30
symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
22UA.F,6,9,1,3,6,9,1,3,6,9,...,,,,,0.792782,0.159442,-0.006257,0.518609,-0.002897,1.111287
5CV.F,6,9,1,3,6,9,1,3,6,9,...,,,,,,,,0.968930,-0.030399,-0.264362
8GW.IR,6,9,1,3,6,9,1,3,6,9,...,-0.141791,-0.034783,-0.405405,-0.545455,-0.166667,0.500000,1.000000,-0.033333,-0.051724,0.000000
A3M.MC,6,9,1,3,6,9,1,3,6,9,...,-0.030025,0.012898,-0.179777,-0.097631,-0.238690,-0.020757,-0.049231,0.457030,0.093904,0.053130
A5G.IR,6,9,1,3,6,9,1,3,6,9,...,-0.023377,-0.091740,-0.248162,0.008274,-0.642942,0.179064,-0.207602,0.645086,0.532423,-0.028609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WHA.AS,6,9,1,3,6,9,1,3,6,9,...,-0.147223,-0.071674,-0.123832,-0.133890,-0.620325,0.254971,-0.057892,0.511514,0.248502,0.001131
WKL.AS,6,9,1,3,6,9,1,3,6,9,...,0.098700,0.078003,0.011471,0.053025,-0.099604,0.125364,0.049581,-0.050140,0.067251,0.146544
XIL.PA,6,9,1,3,6,9,1,3,6,9,...,-0.133885,0.227693,-0.071463,0.247120,-0.297229,0.185185,0.216734,-0.051367,0.037391,-0.112023
XIOR.BR,6,9,1,3,6,9,1,3,6,9,...,0.082149,0.064371,0.063473,0.135666,-0.270473,0.230433,0.156309,-0.074919,-0.106025,0.114661


In [83]:
# y_train = 
test_per_rets = df.loc[val_dates["start"]: val_dates["end"], "return"]
test_per_rets

date        symbol  
2021-06-30  22UA.F      1.111287
            5CV.F      -0.264362
            8GW.IR      0.000000
            A3M.MC      0.053130
            A5G.IR     -0.028609
                          ...   
2022-01-31  WHA.AS      0.112201
            WKL.AS     -0.016242
            XIL.PA      0.019968
            XIOR.BR    -0.023805
            YATRA.AS    0.000000
Name: return, Length: 2409, dtype: float64

In [85]:
# drop stocks with missing returns for this period
to_drop = test_per_rets[test_per_rets.isna()].index.get_level_values("symbol").unique()

# drop stocks that traded under 1$ in the period
under_1 = X_train['close'].groupby(level="symbol").apply(lambda x: (x<1).any(axis=1)).droplevel(1)
to_drop = to_drop.union(under_1[under_1].index).unique()

X_train = X_train.drop(to_drop)
y_train = test_per_rets.drop(to_drop, level=1)

In [86]:
print(np.setdiff1d(X_train.index, y_train.index.get_level_values(1)))
print(np.setdiff1d(y_train.index.get_level_values(1), X_train.index))

[]
[]


In [87]:
X_train

Unnamed: 0_level_0,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,quarter,...,return,return,return,return,return,return,return,return,return,return
date,2017-06-30,2017-09-30,2018-01-31,2018-03-31,2018-06-30,2018-09-30,2019-01-31,2019-03-31,2019-06-30,2019-09-30,...,2019-03-31,2019-06-30,2019-09-30,2020-01-31,2020-03-31,2020-06-30,2020-09-30,2021-01-31,2021-03-31,2021-06-30
symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
22UA.F,6,9,1,3,6,9,1,3,6,9,...,,,,,0.792782,0.159442,-0.006257,0.518609,-0.002897,1.111287
5CV.F,6,9,1,3,6,9,1,3,6,9,...,,,,,,,,0.968930,-0.030399,-0.264362
A3M.MC,6,9,1,3,6,9,1,3,6,9,...,-0.030025,0.012898,-0.179777,-0.097631,-0.238690,-0.020757,-0.049231,0.457030,0.093904,0.053130
AALB.AS,6,9,1,3,6,9,1,3,6,9,...,0.003782,0.116010,0.044305,0.129885,-0.466442,0.379622,0.051896,0.243825,0.119175,0.060519
ABN.AS,6,9,1,3,6,9,1,3,6,9,...,-0.102781,-0.055264,-0.144838,-0.002146,-0.541036,0.075507,-0.086143,0.173162,0.234854,-0.008066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WHA.AS,6,9,1,3,6,9,1,3,6,9,...,-0.147223,-0.071674,-0.123832,-0.133890,-0.620325,0.254971,-0.057892,0.511514,0.248502,0.001131
WKL.AS,6,9,1,3,6,9,1,3,6,9,...,0.098700,0.078003,0.011471,0.053025,-0.099604,0.125364,0.049581,-0.050140,0.067251,0.146544
XIL.PA,6,9,1,3,6,9,1,3,6,9,...,-0.133885,0.227693,-0.071463,0.247120,-0.297229,0.185185,0.216734,-0.051367,0.037391,-0.112023
XIOR.BR,6,9,1,3,6,9,1,3,6,9,...,0.082149,0.064371,0.063473,0.135666,-0.270473,0.230433,0.156309,-0.074919,-0.106025,0.114661


In [89]:
y_train = y_train.groupby(level=1, group_keys=False).apply(lambda x: x.cumprod()[-1])

assert all(X_train.index == y_train.index)

In [110]:
# Last checks and drops on columns
X_train = X_train.select_dtypes(exclude=["object", "datetime"])

In [99]:
X_train.select_dtypes(include="category").columns.get_level_values(0).unique()

Index(['rating', 'ratingDetailsDCFRecommendation',
       'ratingDetailsDERecommendation', 'ratingDetailsPBRecommendation',
       'ratingDetailsPERecommendation', 'ratingDetailsROARecommendation',
       'ratingDetailsROERecommendation', 'ratingRecommendation'],
      dtype='object')

In [112]:
X_train.dtypes.value_counts()

float64     4624
category      85
int64         34
category      17
category      17
category      17
dtype: int64

In [42]:
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [113]:
model = XGBRegressor(enable_categorical=True)
model.fit(X_train, y_train)

  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transformed[data.columns[i]] = data[data.columns[i]]
  transfor

ValueError: Experimental support for categorical data is not implemented for current tree method yet.