In [1]:
import pandas as pd
import optuna

import lightgbm as lgb

In [2]:
df = pd.read_feather('./data/train.feather')

In [3]:
df.data_type.value_counts()

train         501808
validation    137779
Name: data_type, dtype: Int64

In [4]:
df.target.value_counts(dropna=False)

0.50    320631
0.25    127584
0.75    127578
1.00     31899
0.00     31895
Name: target, dtype: int64

In [14]:
def get_group_stats(df: pd.DataFrame) -> pd.DataFrame:
        for group in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]:
            cols = [col for col in df.columns if group in col]
            df[f"feature_{group}_mean"] = df[cols].mean(axis=1)
            df[f"feature_{group}_std"] = df[cols].std(axis=1)
            df[f"feature_{group}_skew"] = df[cols].skew(axis=1)
        return df

df = get_group_stats(df)

In [17]:
from sklearn import preprocessing
import random as rn
ft_corr_list=rn.sample(features, 30)# #Please try other features!
interactions = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

interactions.fit(train[ft_corr_list], train["target"])

df_interact = pd.DataFrame(interactions.transform(train[ft_corr_list]))

df=pd.concat([df,df_interact],axis=1)

In [18]:
train = df[df.data_type == 'train']
test = df[df.data_type == 'validation']

In [22]:
features = [f for f in train.columns if type(f) == int or f.startswith('feature')]
len(features)

793

In [9]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr


def spearman(y_true, y_pred, axis=0):
    """ Calculate Spearman correlation """
    return spearmanr(y_true, y_pred, axis=axis)[0]


def sharpe(df: pd.DataFrame, y_pred) -> np.float32:
    """
    Calculate the Sharpe ratio by using grouped per-era data
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: The Sharpe ratio for your predictions.
    """
    df = df.copy()
    df['prediction'] = y_pred
    def _score(sub_df: pd.DataFrame) -> np.float32:
        """ Calculate Spearman correlation for Pandas' apply method """
        return spearmanr(sub_df["target"], sub_df["prediction"])[0, 1]

    corrs = df.groupby("era").apply(_score)
    return corrs.mean() / corrs.std()

In [None]:
def objective(trial):
    dtrain = lgb.Dataset(train[features], label=train.target)
 
    param = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'l2',
        "device": "gpu",
        "learning_rate": trial.suggest_uniform('learning_rate', 0.004, 0.1),
        "num_iterations": 2000,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.1, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 10.0),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        "max_depth": trial.suggest_int('max_depth', 3, 20),
        'verbose': -1,
    }
 
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test[features])
    accuracy = spearman(test.target, preds)
    return accuracy
 
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)
 
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [10]:
study.best_trial.params

{'learning_rate': 0.006085005910213603,
 'lambda_l1': 1.460802617273486,
 'lambda_l2': 1.0611461754655795e-05,
 'feature_fraction': 0.9969345315260029,
 'bagging_fraction': 0.5585536563609045,
 'bagging_freq': 1,
 'min_child_samples': 83,
 'num_leaves': 107,
 'max_depth': 15}

In [23]:
dtrain = lgb.Dataset(train[features], label=train.target)

param = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    "device": "gpu",
    "num_iterations": 2000,
    'learning_rate': 0.006085005910213603,
 'lambda_l1': 1.460802617273486,
 'lambda_l2': 1.0611461754655795e-05,
 'feature_fraction': 0.9969345315260029,
 'bagging_fraction': 0.5585536563609045,
 'bagging_freq': 1,
 'min_child_samples': 83,
 'num_leaves': 107,
 'max_depth': 15,
    'verbose': 1,
    "random_state": 0,
}

gbm = lgb.train(param, dtrain)
preds = gbm.predict(test[features])
accuracy = spearman(test.target, preds)

accuracy



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 14610
[LightGBM] [Info] Number of data points in the train set: 501808, number of used features: 793
[LightGBM] [Info] Using GPU Device: GeForce GTX 1080 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 791 dense feature groups (379.02 MB) transferred to GPU in 0.105557 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.499997


0.014911407306901088

In [11]:
accuracy

0.026285507606149033