In [None]:
import os
import gc
import csv
import scipy

import numerapi
import numpy as np
import pandas as pd

import lightgbm as lgb

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.ensemble import RandomForestRegressor
import joblib

from sklearn.preprocessing import PowerTransformer
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error


# Module-level NumerAPI client; download_current_data() and load_data() below
# use it to look up the current round and to download the dataset.
NAPI = numerapi.NumerAPI(verbosity="info")


In [None]:
def download_current_data():
    """
    Download the dataset for the current round unless it is already on disk.

    The dataset directory is looked up as ``numerai_dataset_<round>/`` relative
    to the current working directory, which is the same relative location that
    ``load_data`` reads the CSV files from. (The previous check used an absolute
    ``/numerai_dataset_<round>/`` path, so it never matched the relative download
    location and the data was downloaded again on every run.)

    Takes no arguments and returns nothing; progress is reported via ``print``.
    """
    current_round = NAPI.get_current_round()
    # Relative path on purpose: must agree with the path built in load_data().
    dataset_dir = f'numerai_dataset_{current_round}/'
    if os.path.isdir(dataset_dir):
        print(f"You already have the newest data! Current round is: {current_round}")
    else:
        print(f"Downloading new data for round: {current_round}!")
        NAPI.download_current_dataset(unzip=True)

def load_data(reduce_memory: bool=True) -> tuple:
    """
    Read the current round's training and tournament CSV files and split them.

    :param reduce_memory: When True, every column whose name starts with
        "feature" (as found in the training file) is cast to 32-bit floats
        in both frames after loading.
    :return: A tuple ``(train, val, test)`` where ``val`` holds the tournament
        rows whose ``data_type`` is "validation" and ``test`` holds the rest.
    """
    print('Loading the data')
    dataset_dir = f'numerai_dataset_{NAPI.get_current_round()}/'
    train = pd.read_csv(dataset_dir + 'numerai_training_data.csv')
    tournament = pd.read_csv(dataset_dir + 'numerai_tournament_data.csv')

    if reduce_memory:
        # Feature column names are taken from the training frame and reused
        # for the tournament frame.
        feature_cols = [name for name in train.columns if name.startswith("feature")]
        train[feature_cols] = train[feature_cols].astype(np.float32)
        tournament[feature_cols] = tournament[feature_cols].astype(np.float32)

    data_type = tournament['data_type']
    val = tournament[data_type == 'validation']
    test = tournament[data_type != 'validation']
    return train, val, test

# Fetch the current round's dataset if needed, then load it into three frames:
# `train`, `val` (validation rows) and `test` (remaining tournament rows).
# Features are cast to float32 because reduce_memory=True.
download_current_data()
train, val, test = load_data(reduce_memory=True)

In [None]:
# Tournament identifier and the column names used for targets / predictions.
TOURNAMENT_NAME = "nomi"
TARGET_NAME, PREDICTION_NAME = "target", "prediction"

# Parameters of payout(): scores are shifted by BENCHMARK and scaled by BAND.
BENCHMARK, BAND = 0, 0.2

#-----------------------------------------------------

# Submissions are scored by spearman correlation
def score(df):
    """
    Correlate the target column with the percentile rank of the predictions.

    :param df: Frame holding the TARGET_NAME and PREDICTION_NAME columns.
    :return: Off-diagonal entry of the 2x2 correlation matrix between the raw
        targets and the ranked predictions.
    """
    targets = df[TARGET_NAME]
    # method="first" breaks ties based on order in array
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(targets, ranked_predictions)
    return correlation_matrix[0, 1]

def correlation(predictions, targets):
    """
    Correlate percentile-ranked predictions with the given targets.

    :param predictions: Series of predictions; ranked with ``pct=True`` and
        ties broken by order of appearance (``method="first"``).
    :param targets: Values to correlate the ranked predictions against.
    :return: Off-diagonal entry of the 2x2 correlation matrix.
    """
    return np.corrcoef(
        predictions.rank(pct=True, method="first"),
        targets,
    )[0, 1]

# The payout function
def payout(scores):
    """
    Convert correlation scores into payouts.

    Scores are shifted by BENCHMARK, divided by BAND and then clipped to the
    range [-1, 1].

    :param scores: Object supporting arithmetic and ``.clip(lower=, upper=)``
        (called with a pandas Series of per-era correlations in this notebook).
    """
    scaled = (scores - BENCHMARK) / BAND
    return scaled.clip(lower=-1, upper=1)



def read_csv(file_path):
    """
    Load a CSV with feature/target columns stored as 16-bit floats.

    Only the header row is read first, to find the columns whose names start
    with "feature" or "target"; those are then parsed as ``np.float16`` while
    every other column keeps pandas' default dtype inference.

    :param file_path: Path of the CSV file to read.
    :return: The loaded DataFrame.
    """
    with open(file_path, 'r') as handle:
        header = next(csv.reader(handle))

    narrow_prefixes = ('feature', 'target')
    dtypes = {}
    for name in header:
        if name.startswith(narrow_prefixes):
            dtypes[name] = np.float16

    return pd.read_csv(file_path, dtype=dtypes)


def get_group_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-row summary statistics for each feature group.

    For every group name, the columns whose name contains that group name are
    summarised row-wise into six new columns named
    ``feature_<group>_<stat>`` with stat in mean, median, std, skew, p25, p75.

    The columns this function itself generates are excluded from the input
    selection, so calling it again on an already-processed frame (e.g. when a
    notebook cell is re-run) recomputes the same values instead of folding the
    previous statistics into the new ones. Groups with no matching columns are
    skipped.

    NOTE: only this function's own outputs are excluded; any other derived
    column whose name contains a group name is still included in the stats.

    :param df: Frame to augment. It is modified in place.
    :return: The same ``df`` object, with the statistic columns added.
    """
    groups = ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]
    stat_names = ("mean", "median", "std", "skew", "p25", "p75")

    for group in groups:
        generated = {f"feature_{group}_{stat}" for stat in stat_names}
        cols = [col for col in df.columns if group in col and col not in generated]
        if not cols:
            # Nothing to summarise for this group.
            continue

        # Snapshot the inputs once so every statistic sees the same columns.
        block = df[cols]
        df.loc[:, f"feature_{group}_mean"] = block.mean(axis=1)
        df.loc[:, f"feature_{group}_median"] = block.median(axis=1)
        df.loc[:, f"feature_{group}_std"] = block.std(axis=1)
        df.loc[:, f"feature_{group}_skew"] = block.skew(axis=1)
        df.loc[:, f"feature_{group}_p25"] = block.quantile(0.25, axis=1)
        df.loc[:, f"feature_{group}_p75"] = block.quantile(0.75, axis=1)
    return df

    
def power_vars(df: pd.DataFrame, power: int, features=None) -> pd.DataFrame:
    """
    Add a column with each selected feature raised to ``power``.

    New columns are named ``<col>_squared`` when ``power`` is 2 (unchanged from
    the previous behaviour) and ``<col>_pow<power>`` for any other exponent, so
    the column name no longer claims "squared" for e.g. a cube.

    :param df: Frame to augment. It is modified in place.
    :param power: Exponent applied to every selected column.
    :param features: Optional iterable of column names to transform. When
        omitted, the module-level ``feature_names`` list is used, which must
        already be defined at call time.
    :return: The same ``df`` object, with the new columns added.
    """
    if features is None:
        features = feature_names
    wanted = set(features)
    suffix = "squared" if power == 2 else f"pow{power}"

    # Select the columns up front so the newly added ones are never revisited.
    for col in [c for c in df.columns if c in wanted]:
        df.loc[:, f"{col}_{suffix}"] = df[col] ** power

    return df



def squared_root_vars(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``<col>_squared_root`` column holding ``np.sqrt`` of each feature.

    Only columns listed in the module-level ``feature_names`` are transformed.

    :param df: Frame to augment. It is modified in place.
    :return: The same ``df`` object, with the new columns added.
    """
    selected = [name for name in df.columns if name in feature_names]
    for name in selected:
        df.loc[:, f"{name}_squared_root"] = np.sqrt(df[name])
    return df


def yeo_transformation(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append standardized Yeo-Johnson transformed copies of the features.

    A ``PowerTransformer`` is fitted on ``df[feature_names]`` (the module-level
    feature list) and its output is attached as ``<feature>_yeo`` columns.

    :param df: Input frame; after ``reset_index()`` it must expose an ``id``
        column, which becomes the index of the result.
    :return: A new frame indexed by ``id`` with the transformed columns added.
    """
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    df_trans = pd.DataFrame(transformer.fit_transform(df[feature_names]))

    # Map the positional output columns onto "<feature>_yeo" names.
    yeo_names = dict(zip(df_trans.columns, [f"{f}_yeo" for f in feature_names]))
    df_trans = df_trans.rename(columns=yeo_names)

    # Align both frames on a fresh positional index before joining side by side.
    combined = pd.concat([df.reset_index(), df_trans], axis=1)
    return combined.set_index('id')


def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
    Sharpe-style ratio of a series of per-era correlations.

    :param corrs: A Pandas Series of correlation values, one per era.
    :return: The mean of ``corrs`` divided by its standard deviation
        (as computed by ``Series.std``).
    """
    average = corrs.mean()
    spread = corrs.std()
    return average / spread


def evaluate(df: pd.DataFrame) -> tuple:
    """
    Evaluate and display relevant metrics for Numerai.

    :param df: A Pandas DataFrame containing the columns "era", "target" and
        "prediction".
    :return: A tuple ``(spearman, payout, numerai_sharpe, mae)``: the mean
        per-era Spearman correlation, the mean per-era payout, the Sharpe
        ratio of the per-era correlations and the mean absolute error over
        all rows, each rounded to 4 decimals.
    """
    def _score(sub_df: pd.DataFrame) -> np.float32:
        """Calculates Spearman correlation"""
        return spearmanr(sub_df["target"], sub_df["prediction"])[0]

    # Calculate metrics
    corrs = df.groupby("era").apply(_score)
    print(corrs)
    # Per-era payout: correlation scaled by a 0.2 band and clipped to [-1, 1].
    payout_raw = (corrs / 0.2).clip(-1, 1)
    spearman = round(corrs.mean(), 4)

    average_payout = round(payout_raw.mean(), 4)
    numerai_sharpe = round(sharpe_ratio(corrs), 4)
    # Built-in round() works whether the metric comes back as a numpy scalar
    # or a plain Python float (which has no .round() method).
    mae = round(mean_absolute_error(df["target"], df["prediction"]), 4)

    # Display metrics
    print(f"Spearman Correlation: {spearman}")
    print(f"Average Payout: {average_payout}")
    print(f"Sharpe Ratio: {numerai_sharpe}")
    print(f"Mean Absolute Error (MAE): {mae}")
    return spearman, average_payout, numerai_sharpe, mae
        


In [None]:
# Second NumerAPI client, built with the same arguments as NAPI; the cells
# below use it to look up the current round.
napi = numerapi.NumerAPI(verbosity="info")

In [None]:
# download current dataset
# napi.download_current_dataset(unzip=True)

In [None]:
# Look up the current round and derive its dataset directory name.
current_ds = napi.get_current_round()
latest_round = f'numerai_dataset_{current_ds}'

In [None]:
# Load the round's CSV files with read_csv(), which stores feature/target
# columns as float16, and index both frames by "id".
print("# Loading data...")
# The training data is used to train your model how to predict the targets.
training_data = read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
# The tournament data is the data that Numerai uses to evaluate your model.
tournament_data = read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
# example_preds = read_csv(os.path.join(latest_round, "example_predictions_target_kazutsugi.csv")
# Validation rows are the tournament rows whose data_type is "validation".
validation_data = tournament_data[tournament_data.data_type == "validation"]

In [None]:
# Names of the columns of `train` that start with "feature". power_vars(),
# squared_root_vars() and yeo_transformation() read this module-level list.
feature_names = [f for f in train.columns if f.startswith("feature")]
print(f"Loaded {len(feature_names)} features")
# Feature columns plus the target column (not referenced again in the visible cells).
cols = feature_names+[TARGET_NAME]

In [None]:
# Feature engineering, applied identically to the train / val / test frames.

# 1) Row-wise summary statistics per feature group.
train = get_group_stats(train)
val = get_group_stats(val)
test = get_group_stats(test)

# 2) Squares of the columns listed in feature_names.
train = power_vars(train, 2)
val = power_vars(val, 2)
test = power_vars(test, 2)

# 3) Square roots of the columns listed in feature_names.
train = squared_root_vars(train)
val = squared_root_vars(val)
test = squared_root_vars(test)

In [None]:
# Preview the first rows of the float16 training frame.
training_data.head()

In [None]:
# Search space for CatBoost: fixed lists are sampled uniformly, scipy
# distributions are sampled by RandomizedSearchCV.
params = {
    'iterations':[5000],
    'depth':sp_randint(3,15),
    'learning_rate': sp_uniform(0.005, 0.15),
    'reg_lambda': sp_randint(1, 5),
    'use_best_model':[True],
    'min_data_in_leaf': sp_randint(1, 5)
}

# Extra arguments forwarded to CatBoostRegressor.fit() for every candidate:
# early stopping is evaluated on the validation rows of the tournament data.
fit_params={"early_stopping_rounds":15,
            "eval_set" :[(validation_data.drop(['era','data_type','target'], axis=1).astype(np.float32), validation_data[TARGET_NAME].astype(np.float32))]}

reg = CatBoostRegressor()

# run randomized search
n_iter_search = 30
# random_state makes the sampled candidates (and therefore best_params_)
# reproducible across runs; 314 matches the seed used elsewhere in this notebook.
random_search = RandomizedSearchCV(reg,
                                   param_distributions=params,
                                   n_iter=n_iter_search,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=314,
                                   verbose=2)

random_search.fit(training_data.drop(['era','data_type','target'], axis=1).astype(np.float32), training_data[TARGET_NAME].astype(np.float32), **fit_params)

In [None]:
# Show the best parameter combination found by the search above.
random_search.best_params_

In [None]:
# Persist the fitted CatBoost search. `random_search` is the object fitted in
# the cells above; `gs` is only created much later in the notebook (and wraps
# a LightGBM estimator), so dumping it here fails on a fresh top-to-bottom run.
os.makedirs('models', exist_ok=True)  # joblib.dump does not create directories
joblib.dump(random_search, 'models/catboost_v2.pkl')

In [None]:
# Train a single CatBoost model with a fixed set of hyperparameters.
params = {
    'depth': 5,
    'iterations': 250,
    'learning_rate': 0.036878060835166426,
    'min_data_in_leaf': 1,
    'reg_lambda': 1,
}

model = CatBoostRegressor(**params)
# Inputs: every column except era / data_type / target, cast to float32.
model.fit(
    training_data.drop(columns=['era', 'data_type', 'target']).astype(np.float32),
    training_data[TARGET_NAME].astype(np.float32),
    verbose=True,
)

In [None]:
# Predict on the tournament frame using the same column selection and float32
# cast that was used for fitting; the result is kept in a variable only.
print("Generating predictions on tournament data...")
tournament_preds = model.predict(tournament_data.drop(['era','data_type','target'], axis=1).astype(np.float32))
# tournament_data[PREDICTION_NAME] = tournament_preds


In [None]:
# Tournament model inputs without the float32 cast (features stay float16).
aux = tournament_data.drop(['era','data_type','target'], axis=1)

In [None]:
# Predict from the pre-built `aux` input frame.
tournament_preds = model.predict(aux)


In [None]:
# Rebuild `aux`, this time cast to float32 like the training inputs.
aux = tournament_data.drop(['era','data_type','target'], axis=1).astype(np.float32)

In [None]:
# print("Generating predictions on training data...")
# training_preds = model.predict(training_data.drop(['era','data_type','target'], axis=1).astype(np.float32).astype(np.float32))
# training_data[PREDICTION_NAME] = training_preds
# gc.collect()

# NOTE(review): this cell adds a PREDICTION_NAME column to tournament_data, and
# the drop() below only removes era / data_type / target. Re-running the cell
# would therefore feed the previous predictions to the model as an extra column.
print("Generating predictions on tournament data...")
tournament_preds = model.predict(tournament_data.drop(['era','data_type','target'], axis=1).astype(np.float32))
tournament_data[PREDICTION_NAME] = tournament_preds

# # Check the per-era correlations on the training set (in sample)
# train_correlations = training_data.groupby("era").apply(score)
# print(f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}")
# print(f"On training the average per-era payout is {payout(train_correlations).mean()}")

# Check the per-era correlations on the validation set (out of sample)
# validation_data is re-sliced here so that it carries the new prediction column.
validation_data = tournament_data[tournament_data.data_type == "validation"]
validation_correlations = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")

#FEAT_EXPOSURE:
# Correlate every feature in feature_names with the predictions over all
# tournament rows, then report the standard deviation of those correlations.
corr_list = []
for feature in feature_names:
    corr_list.append(np.corrcoef(tournament_data[feature].values, tournament_data[PREDICTION_NAME])[0,1])
corr_series = pd.Series(corr_list, index=feature_names)
print("Feat. exposure: ", corr_series.describe()['std'])

In [None]:
# Search space for LightGBM (LGBMRegressor)
params = {
    'num_leaves': sp_randint(5,50),
    'max_depth': sp_randint(3,20),
    'learning_rate': sp_uniform(0.0005, 0.15),
    'reg_lambda': sp_uniform(0, 5),
    'n_estimators':[5000]
}

# Model inputs: every column of `train` except identifiers and the target.
feature_list = train.columns.drop(['id','era','data_type','target'])

# Extra arguments forwarded to LGBMRegressor.fit() for every candidate;
# early stopping is evaluated on the zero-filled validation frame.
fit_params={"early_stopping_rounds":15,
            "eval_set" :[(val[feature_list].fillna(0), val['target'])],
            "eval_metric": "None"}

reg = lgb.LGBMRegressor(random_state=314)

print("Running random search...")
# run randomized search
n_iter_search = 1
# Seed the search with the same value as the estimator so the sampled
# candidates are reproducible across runs.
random_search = RandomizedSearchCV(reg,
                                   param_distributions=params,
                                   n_iter=n_iter_search,
                                   cv=3,
                                   scoring='neg_mean_squared_error',
                                   random_state=314,
                                   verbose=2)

random_search.fit(train[feature_list].fillna(0), train['target'], **fit_params)

In [None]:
# Show the refitted best estimator found by the search above.
random_search.best_estimator_

In [None]:
# Train a LightGBM booster with a fixed configuration through the native API.
feature_list = train.columns.drop(['id','era','data_type','target'])
dtrain = lgb.Dataset(train[feature_list].fillna(0), label=train["target"])
dvalid = lgb.Dataset(val[feature_list].fillna(0), label=val["target"])

best_config ={"objective":"regression","learning_rate":0.01,"n_estimators":250,"max_depth":5,"metric":"mse","verbosity": 10, "random_state": 0}

# Pass the validation set so the configured "mse" metric is evaluated on it
# during training; previously dvalid was built but never used.
model = lgb.train(best_config, dtrain, valid_sets=[dvalid], valid_names=["valid"])


In [None]:
# Store the search's predictions next to the data. Inputs are zero-filled
# exactly like the frames the search was fitted and early-stopped on, so
# missing values are handled the same way at fit and predict time.
train.loc[:, "prediction"] = random_search.predict(train[feature_list].fillna(0))
val.loc[:, "prediction"] = random_search.predict(val[feature_list].fillna(0))

In [None]:
# Feature exposure on validation: spread of the Spearman correlations between
# the predictions and each model input column.
feature_spearman_val = [spearmanr(val["prediction"], val[f])[0] for f in feature_list]
feature_exposure_val = np.std(feature_spearman_val).round(4)
# Unpack into `avg_payout`, not `payout`: rebinding `payout` would replace the
# payout() function with a float and break the later payout(...) calls.
spearman, avg_payout, numerai_sharpe, mae = evaluate(val)

In [None]:
# Load a previously pickled object from disk.
# NOTE(review): the dump cell earlier writes 'models/catboost_v2.pkl', not this
# file name — confirm which artifact is intended. Only load pickles you trust.
cboost = joblib.load('models/catboost.pkl')

In [None]:
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# max_features = ['log2', 'sqrt']
# max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
# min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
# min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
# bootstrap = [True, False]
# param_dist = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# rfc = RandomForestRegressor()

# rs = RandomizedSearchCV(rfc, 
#                         param_dist, 
#                         n_iter = 100, 
#                         cv = 3, 
#                         verbose = 1, 
#                         random_state=314)
# rs.fit(training_data.drop(['era','data_type','target'], axis=1), training_data[TARGET_NAME])
# rs.best_params_


In [None]:
# NOTE(review): in top-to-bottom order `random_search` is the LightGBM search
# fitted on train[feature_list] (raw plus engineered columns), while this cell
# passes only the feature_names columns of training_data / tournament_data —
# verify that the column sets match what the model was fitted on.
print("Generating predictions on training data...")
training_preds = random_search.predict(training_data[feature_names])
training_data[PREDICTION_NAME] = training_preds
gc.collect()

print("Generating predictions on tournament data...")
tournament_preds = random_search.predict(tournament_data[feature_names])
tournament_data[PREDICTION_NAME] = tournament_preds

# Check the per-era correlations on the training set (in sample)
train_correlations = training_data.groupby("era").apply(score)
print(f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}")
print(f"On training the average per-era payout is {payout(train_correlations).mean()}")

# Check the per-era correlations on the validation set (out of sample)
# validation_data is re-sliced so that it carries the new prediction column.
validation_data = tournament_data[tournament_data.data_type == "validation"]
validation_correlations = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")

#FEAT_EXPOSURE:
# Correlate every feature in feature_names with the predictions over all
# tournament rows, then report the standard deviation of those correlations.
corr_list = []
for feature in feature_names:
    corr_list.append(np.corrcoef(tournament_data[feature].values, tournament_data[PREDICTION_NAME])[0,1])
corr_series = pd.Series(corr_list, index=feature_names)
print("Feat. exposure: ", corr_series.describe()['std'])

In [None]:
# Show the best estimator held by `random_search`.
random_search.best_estimator_

In [None]:
# Keyword arguments forwarded to the estimator's fit() by the `gs` search
# below; early stopping is evaluated on the validation rows (feature_names columns).
# NOTE(review): 'logloss' is a classification metric name, while the estimator
# these arguments are passed to is an LGBMRegressor — confirm the metric.
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'logloss', 
            "eval_set" : [(validation_data[feature_names],validation_data[TARGET_NAME])],
            'eval_names': ['valid'],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [None]:
# LightGBM search space: scipy distributions are sampled, lists are picked from.
param_test = {
    # tree shape
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # row / column subsampling
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    # regularisation
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}

In [None]:
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100


#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMRegressor(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
# 'logloss' is not a scorer name scikit-learn recognises, so the search could
# not be fitted with it. The estimator is a regressor, so candidates are ranked
# by negative mean squared error, as in the other searches in this notebook.
gs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='neg_mean_squared_error',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [None]:
# Run the search on the feature_names columns of the training data; fit_params
# (early stopping, eval set, ...) is forwarded to the estimator's fit().
gs.fit(training_data[feature_names], training_data[TARGET_NAME], **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
# LightGBM search space used by the g_lgbm search below.
param_test = {
    # tree shape
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # row / column subsampling
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    # regularisation
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}


# Second LightGBM search: 60 sampled candidates, 3-fold CV, 2 parallel jobs.
# Here early_stopping_rounds is given to the estimator constructor rather than
# to fit(), and the evaluation set is the validation rows (feature_names columns).
# NOTE(review): no `scoring` or `random_state` is passed to this search, unlike
# the `gs` search — confirm that is intended.
# NOTE(review): confirm that 'r2' is an evaluation metric name LightGBM accepts.
lgb_estimator= lgb.LGBMRegressor(max_depth=-1, random_state=314, metric='None', n_estimators=5000, early_stopping_rounds=30)
g_lgbm = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_test, n_iter=60 ,n_jobs = 2, cv= 3, verbose=10)
lgb_model = g_lgbm.fit(X=training_data[feature_names], y=training_data[TARGET_NAME], eval_set = (validation_data[feature_names],validation_data[TARGET_NAME]), eval_metric='r2')