In [1]:
import sys, os
sys.path.append(os.path.abspath('../src/'))

In [177]:
import utils
import re
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, KMeans
import hdbscan

In [65]:
train, test = utils.read_current('../data/round_255/')
train['era'] = train.era.str.extract('(\d+|X)$', expand=False).str.zfill(4)

In [66]:
test.data_type.value_counts()

test          1528119
validation     137779
live             5411
Name: data_type, dtype: Int64

In [81]:
full = train.append(test)

In [83]:
del train, test

# Experimentas NR003

Kaip sugrupuoti eras. Idėja klasterizuoti pagal koreliaciją tarp kintamųjų. Problema kad jei paėmus visus kintamuosius gausis $300^2/2$ variantų. Tiek daug dimensijų sunku suklasterizuoti. Reikia kažkaip sumažinti dimensijų skaičių. PCA nelabai veikė. Pabandom imti vidurkį tarp skirtingų grupių.

In [180]:
class CLusterModel:
    def __init__(self, data, features, era_aggregator, clusterer, model_factory):
        self.agg = era_aggregator
        self.clusterer = clusterer
        self.model_factory = model_factory
        self.data = data
        
    def cluster(self):
        self.correlations = self.data.groupby('era').apply(self.agg)
        self.era_clusters = self.clusterer.fit_predict(correlations)
        self.labels = pd.Series(self.era_clusters).unique()
        
    def train(self):
        self.models = {key:self.model_factory() for key in self.labels}
        data = self.data[self.data.data_type == 'train']
        for key, model in self.models.items():
            data_group = data[data.era.isin(self.correlations.reset_index().era[self.era_clusters == key])].copy()
            model.fit(data_group[features], data_group.target)
            
    def validate(self):
        out = []
        data = self.data[self.data.data_type == 'validation']
        for key, model in self.models.items():
            data_group = data[data.era.isin(self.correlations.reset_index().era[self.era_clusters == key])].copy()
            data["prediction"] = model.predict(data[features])
            out.append(data)
        val = pd.concat(out)
        return evaluate(val, features)

In [181]:
def model_factory():
    return LGBMRegressor(**param)

In [192]:
cc = CLusterModel(full, features, get_correlations, KMeans(n_clusters=3), model_factory)

In [193]:
cc.cluster()

In [194]:
cc.labels

array([2, 1, 0], dtype=int32)

In [195]:
cc.train()



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [196]:
cc.validate()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["prediction"] = model.predict(data[features])


era
era121    0.018341
era122    0.014944
era123    0.013762
era124    0.002236
era125    0.019051
era126    0.015341
era127    0.019056
era128    0.038309
era129    0.001941
era130    0.044727
era131    0.012312
era132    0.060971
era197    0.001336
era198   -0.006004
era199   -0.016551
era200    0.011000
era201    0.002800
era202    0.007348
era203    0.026638
era204   -0.002631
era205   -0.005570
era206    0.039856
era207    0.008605
era208   -0.009667
era209    0.012407
era210   -0.019532
era211   -0.031392
era212    0.011675
dtype: float64
Spearman Correlation: 0.0104
Average Payout: 0.052
Sharpe Ratio: 0.5231
Mean Absolute Error (MAE): 0.1556
Max drawdown: 0.05856941609719213
Feature exposure: 0.07659312627362276, Max Feature Exposure: 0.24179176802822683, Square Sum: 1.8343259464961021


(0.0104, 0.052, 0.5231, 0.1556)

In [179]:
features = [f for f in full.columns if re.match('feature.+\d', f)]
len(features)

310

In [85]:
def get_group_stats(df: pd.DataFrame) -> pd.DataFrame:
        for group in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]:
            cols = [col for col in df.columns if group in col]
            df[f"feature_{group}_mean"] = df[cols].mean(axis=1)
            df[f"feature_{group}_std"] = df[cols].std(axis=1)
            df[f"feature_{group}_skew"] = df[cols].skew(axis=1)
        return df

full = get_group_stats(full)

In [86]:
def get_correlations(df):
    df = df[[c for c in df.columns if 'mean' in c]]
    corr_matrix = df.corr()
    df_corr = corr_matrix.stack().reset_index()
    df_corr.columns = ['FEATURE_1', 'FEATURE_2', 'CORRELATION']
    mask_dups = (df_corr[['FEATURE_1', 'FEATURE_2']].apply(frozenset, axis=1).duplicated()) | (df_corr['FEATURE_1']==df_corr['FEATURE_2']) 
    df_corr = df_corr[~mask_dups]
    return df_corr.CORRELATION

In [88]:
correlations = full.groupby('era').apply(get_correlations)

In [89]:
correlations

CORRELATION,1,2,3,4,5,8,9,10,11,15,16,17,22,23,29
era,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001,-0.261828,0.224596,0.357775,0.207713,-0.023901,-0.120759,-0.113478,-0.244795,0.094442,0.499943,0.358846,0.188486,0.249967,0.155829,0.449269
0002,-0.263603,0.230903,0.356329,0.223538,-0.028203,-0.160691,-0.160422,-0.280032,0.084017,0.536873,0.401641,0.202604,0.289249,0.172553,0.436288
0003,-0.244919,0.211500,0.381217,0.217929,-0.031521,-0.177733,-0.188151,-0.286470,0.093519,0.546119,0.431544,0.232220,0.299271,0.164915,0.440053
0004,-0.242308,0.220869,0.428731,0.231354,-0.024993,-0.131175,-0.191416,-0.273598,0.104669,0.544824,0.442586,0.253280,0.306586,0.163820,0.425144
0005,-0.270523,0.213598,0.490674,0.222092,-0.023230,-0.107948,-0.198479,-0.258038,0.134461,0.539266,0.434232,0.269917,0.287871,0.113592,0.437874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
era946,-0.332790,0.144782,0.616933,0.128513,-0.062108,0.114745,-0.215496,-0.043828,0.127156,0.391002,0.316447,0.192391,0.246393,0.057418,0.541553
era947,-0.331398,0.141403,0.616277,0.132292,-0.063996,0.110464,-0.205904,-0.053859,0.135194,0.395125,0.328159,0.184938,0.242241,0.046950,0.532691
era948,-0.322252,0.169005,0.618337,0.140119,-0.061699,0.080524,-0.168035,-0.064328,0.149849,0.418248,0.353738,0.176174,0.185376,-0.016323,0.524429
era949,-0.298917,0.179455,0.621964,0.138514,-0.070953,0.076793,-0.074735,-0.073326,0.167498,0.420415,0.360570,0.161499,0.114696,-0.084190,0.511555


In [90]:
clusterer = hdbscan.HDBSCAN(metric='l1')

In [91]:
clusterer = clusterer.fit(correlations)

In [92]:
clusterer.labels_

array([-1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  8,  8,  8,  8,  8,  8,  8, -1,  8,
       -1, -1,  8, -1, -1, -1, -1, -1, -1, -1,  4,  4,  4, -1,  4,  4, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  2, 13, 13, 13, 13, 13,
       13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, 13, -1, -1, 17, -1,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
       17, 17, 17, 16, -1, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16

In [109]:
clustering = KMeans(n_clusters=2).fit(correlations)

In [110]:
clustering.labels_

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [111]:
full.data_type.value_counts()

test          1528119
train          501808
validation     275558
live             5411
Name: data_type, dtype: Int64

In [98]:
train = full[full.data_type == 'train']
validation = full[full.data_type == 'validation']

In [112]:
correlations_train = train.groupby('era').apply(get_correlations)
train_labels = clustering.predict(correlations_train)

In [113]:
train1 = train[train.era.isin(correlations_train.reset_index().era[train_labels == 0])].copy()
train2 = train[train.era.isin(correlations_train.reset_index().era[train_labels == 1])].copy()
train3 = train[train.era.isin(correlations_train.reset_index().era[train_labels == 2])].copy()

In [102]:
param = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "l2",
    "num_iterations": 2000,
    "learning_rate": 0.006,
    "lambda_l1": 1.4,
    "lambda_l2": 1.0,
    "bagging_fraction": 0.55,
    "bagging_freq": 1,
    "num_leaves": 107,
    "max_depth": 15,
    "verbose": 0,
    "random_state": 0,
}

In [104]:
from lightgbm import LGBMRegressor

model1 = LGBMRegressor(**param)
model1.fit(train1[features], train1.target)

model2 = LGBMRegressor(**param)
model2.fit(train2[features], train2.target)

model3 = LGBMRegressor(**param)
model3.fit(train3[features], train3.target)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


LGBMRegressor(bagging_fraction=0.55, bagging_freq=1, lambda_l1=1.4,
              lambda_l2=1.0, learning_rate=0.006, max_depth=15, metric='l2',
              num_iterations=2000, num_leaves=107, objective='regression',
              random_state=0, verbose=0)

In [105]:
correlations_validation = validation.groupby('era').apply(get_correlations)
validation_labels = clustering.predict(correlations_validation)

In [106]:
val1 = validation[validation.era.isin(correlations_validation.reset_index().era[validation_labels == 0])].copy()
val2 = validation[validation.era.isin(correlations_validation.reset_index().era[validation_labels == 1])].copy()
val3 = validation[validation.era.isin(correlations_validation.reset_index().era[validation_labels == 2])].copy()

In [107]:
val1["prediction"] = model1.predict(val1[features])
val2["prediction"] = model2.predict(val2[features])
val3["prediction"] = model3.predict(val3[features])
val = pd.concat([val1, val2, val3])

In [108]:
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr


def spearmanr(target, pred):
    return np.corrcoef(target, pred.rank(pct=True, method="first"))[0, 1]


def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
        Calculate the Sharpe ratio for Numerai by using grouped per-era data

        :param corrs: A Pandas Series containing the Spearman correlations for each era
        :return: A float denoting the Sharpe ratio of your predictions.
        """
    return corrs.mean() / corrs.std()


# https://parmarsuraj99.medium.com/evaluating-financial-machine-learning-models-on-numerai-3562da8fd90
def calculate_feature_exposure(df, feature_names, prediction_name="prediction") -> list:
    """
    Example:
    -----
    feature_exposure, max_feat_exposure, square_sum_feature_exposure = calculate_feature_exposure(df, feature_names)
    """

    exposures = []
    for feature_name in feature_names:
        exposures.append(spearmanr(df[feature_name], df[prediction_name]))

    max_feat_exposure = np.max(np.abs(exposures))
    square_sum_feature_exposure = np.sum([e ** 2 for e in exposures])
    feature_exposure = np.std(exposures)

    return [feature_exposure, max_feat_exposure, square_sum_feature_exposure]


# Calculating Max Drawdown
def max_drawdown(df, prediction_name="prediction", target_name="target"):
    scores_per_era = df.groupby("era").apply(
        lambda x: spearmanr(x[prediction_name], x[target_name])
    )

    rolling_max = (
        (scores_per_era + 1).cumprod().rolling(window=100, min_periods=1).max()
    )
    daily_value = (scores_per_era + 1).cumprod()
    max_drawdown = (rolling_max - daily_value).max()

    return max_drawdown


def evaluate(df: pd.DataFrame, features) -> tuple:
    """
        Evaluate and display relevant metrics for Numerai 

        :param df: A Pandas DataFrame containing the columns "era", "target" and a column for predictions
        :param pred_col: The column where the predictions are stored
        :return: A tuple of float containing the metrics
        """

    def _score(sub_df: pd.DataFrame) -> np.float32:
        """Calculates Spearman correlation"""
        return spearmanr(sub_df["target"], sub_df["prediction"])

    # Calculate metrics
    corrs = df.groupby("era").apply(_score)
    print(corrs)
    payout_raw = (corrs / 0.2).clip(-1, 1)
    spearman = round(corrs.mean(), 4)

    payout = round(payout_raw.mean(), 4)
    numerai_sharpe = round(sharpe_ratio(corrs), 4)
    mae = mean_absolute_error(df["target"], df["prediction"]).round(4)
    drawdown = max_drawdown(df)
    fe, max_fe, square_sum_fe = calculate_feature_exposure(df, features)

    # Display metrics
    print(f"Spearman Correlation: {spearman}")
    print(f"Average Payout: {payout}")
    print(f"Sharpe Ratio: {numerai_sharpe}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Max drawdown: {drawdown}")
    print(
        f"Feature exposure: {fe}, Max Feature Exposure: {max_fe}, Square Sum: {square_sum_fe}"
    )
    return spearman, payout, numerai_sharpe, mae


spearman, payout, numerai_sharpe, mae = evaluate(val, features)

era
era121   -0.018353
era122    0.026287
era123    0.049315
era124    0.051818
era125    0.018260
era126    0.022001
era127    0.009962
era128    0.034088
era129   -0.036523
era130    0.044706
era131    0.026467
era132    0.055581
era197    0.001118
era198   -0.008374
era199   -0.011497
era200    0.004092
era201   -0.008557
era202    0.016605
era203    0.009686
era204    0.039270
era205   -0.000015
era206    0.044208
era207   -0.002118
era208    0.001970
era209    0.026041
era210   -0.012382
era211   -0.026737
era212    0.001656
dtype: float64
Spearman Correlation: 0.0128
Average Payout: 0.064
Sharpe Ratio: 0.5185
Mean Absolute Error (MAE): 0.1555
Max drawdown: 0.05085234446654363
Feature exposure: 0.06719175458095991, Max Feature Exposure: 0.2462870347510918, Square Sum: 1.5390171873490497
