In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
import optuna   
import cudf
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load data
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
train = train.query('date > 85').reset_index(drop = True)  #去掉前85天
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #float64换成float32limit memory use
#tree和neural network的区别是tree里不要人为加入bias 所以fillna通常用很大的极限值 这样就把有missing的分到一个单独的brunch上
train.fillna(-9999,inplace=True)
train = train.query('weight > 0').reset_index(drop = True) #drop掉weight为零的
features = [c for c in train.columns if 'feature' in c]
X = train[features].values
y = (train['resp'] > 0).astype('int') #tree只能做single target 现在选的是resp这个指标 大于0为1 小于等于0为0 可调

In [4]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [5]:
FOLDS=5
import gc
gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)
splits = list(gkf.split(y, groups=train['date'].values))
del train
gc.collect()
for fold, (train_indices, test_indices) in enumerate(splits): #分5个folds 但后一个会覆盖前一个 最后只剩第5个 这样做因为time-series split最后的auc最大 因为最后的tree和validation数据多 见群里分析图
    X_train, X_valid = X[train_indices], X[test_indices]
    y_train, y_valid = y[train_indices], y[test_indices]

In [6]:
# Created the Xgboost specific DMatrix data format from the numpy array to optimise memory consumption
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [7]:
def objective(trial):
    
# params specifies the XGBoost hyperparameters to be tuned
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 600), #一共几个树叶：400-600
        'max_depth': trial.suggest_int('max_depth', 10, 20), #每个树的深度：10-20
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1), #每个树update的幅度
        'subsample': trial.suggest_uniform('subsample', 0.50, 1), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',  
        'objective': 'binary:logistic'
    }
    
    bst = xgb.train(params, dtrain) #用贝叶森选的parameter在trian上建model
    preds = bst.predict(dvalid) #在tree上做预测 预测结果为0-1的probability
    pred_labels = np.rint(preds) #把预测结果转换成0和1 
# trials will be evaluated based on their accuracy on the test set
    accuracy = sklearn.metrics.accuracy_score(y_valid, pred_labels) #预测结果与真实值比较
    return accuracy

In [8]:
study = optuna.create_study(direction='maximize') #用optuna这个包做贝叶森 跑得更快
study.optimize(objective,n_trials=20) #trial越多学的越多 跑的时间越长
#You can increase n_trials parameter

[32m[I 2021-01-24 11:39:00,579][0m A new study created in memory with name: no-name-389db1e8-2d82-427a-941b-8baa8a9b880a[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-01-24 11:39:03,419][0m Trial 0 finished with value: 0.5193769932792303 and parameters: {'n_estimators': 448, 'max_depth': 10, 'learning_rate': 0.09173534908043456, 'subsample': 0.9875714129412798, 'colsample_bytree': 0.5373008049241719, 'gamma': 6}. Best is trial 0 with value: 0.5193769932792303.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-01-24 11:39:07,953][0m Trial 1 finished with value: 0.5146363075112039 and parameters: {'n_estimators': 419, 'max_depth': 15, 'learning_rate': 0.08438579306565921, 'subsample': 0.8849743693191983, 'colsample_bytree': 0.6179600233449092, 'gamma': 6}. Best is trial 1 with value: 0.5146363075112039.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-01-24 11:39:12,591][0m Trial 2 finished with value: 0.5134847239238292 and parameters: {'n_estimators': 549, 'max_depth': 15, 'learning_rate': 0.08393104415003413, 'subsample': 0.5197202552686686, 'colsample_bytree': 0.6010766309189856, 'gamma': 4}. Best is trial 2 with value: 0.5134847239238292.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-01-24 11:39:19,553][0m Trial 3 finished with value: 0.5161941441974582 and parameters: {'n_estimators': 540, 'max_depth': 17, 'learning_rate': 0.05564343345380777, 'subsample': 0.9572176444592828, 'colsample_bytree': 0.5062623208298063, 'gamma': 7}. Best is trial 2 with value: 0.5134847239238292.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-01-24 11:39:21,778][0m Trial 4 finished with value: 0.5163508875190731 and parameters: {'n_estimators': 452, 'max_depth': 12, 'learning_rate': 0.043597142177279655, 'subsample': 0.5398298129783408, 'colsample_bytree': 0.8965255891090069, 'gamma': 4}. Best is trial 2 with value: 0.5134847239238292.[0m


In [9]:
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))
#希望score是在0.52-0.53
#inference要用到params

Best trial: score 0.5134847239238292, params {'n_estimators': 549, 'max_depth': 15, 'learning_rate': 0.08393104415003413, 'subsample': 0.5197202552686686, 'colsample_bytree': 0.6010766309189856, 'gamma': 4}
