# How big are lgb trees in rf mode? Any smaller than sk trees?
#   - Abort! -- lgb rf mode doesn't support multiclass output (see bottom)
#   - [-] Figure out repr to evaluate model_size ~ (n_species, n_recs)

# lgb reference
#   - https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
#   - https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
#   - https://github.com/Microsoft/LightGBM/blob/master/docs/Features.rst
#   - https://lightgbm.readthedocs.io/en/latest/Python-Intro.html
#   - https://sites.google.com/view/lauraepp/parameters

In [None]:
from notebooks import *

In [None]:
# Generate data: n rows of f random features, with str labels drawn from the lowercase ascii letters
n, f = 100, 10
random_state = np.random.RandomState(0)
X = random_state.rand(n, f)
classes = list(string.ascii_lowercase)
y = np_sample(classes, n=n, replace=True)


def yi(y):
    """Map each str label in y to its position in `classes` (lgb api wants num labels, not str labels)."""
    return np.array([classes.index(label) for label in y])


# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

# Sanity-check shapes (and peek at the first few labels)
display(
    (X.shape, y.shape),
    y[:5],
    (X_train.shape, y_train.shape),
    (X_test.shape, y_test.shape),
)

((100, 10), (100,))

array(['f', 'q', 'g', 'b', 'l'], dtype='<U1')

((80, 10), (80,))

((20, 10), (20,))

In [None]:
# Make lgb data
#   - Labels go through yi() because the lgb api wants num labels, not str labels
lgb_train = lgb.Dataset(data=X_train, label=yi(y_train), free_raw_data=False)
lgb_test = lgb.Dataset(data=X_test, label=yi(y_test), free_raw_data=False)

In [None]:
# Keyword args splatted into lgb.train(...)
#   - These are lgb.train args that feel like they belong in lgb_params instead
lgb_train_kwargs = {

    'num_boost_round': 10,
    #   - Default: 100
    #   - "Note: internally, LightGBM constructs num_class * num_iterations trees for multi-class classification problems"

    # 'early_stopping_rounds': None,
    #   - Default: None (disabled)
    #   - "Activates early stopping. The model will train until the validation score stops improving."
    #   - "Requires at least one validation data and one metric. If there's more than one, will check all of them except
    #     the training data."
    #   - "If early stopping occurs, the model will add ``best_iteration`` field"

    # 'learning_rates': None,
    #   - Default: None
    #   - Dynamic learning rate
    #   - learning_rates: list, callable or None, optional (default=None)
    #       List of learning rates for each boosting round or a customized function that calculates ``learning_rate``
    #       in terms of current number of round (e.g. yields learning rate decay).

}
# Booster params passed as lgb.train(params=...)
#   - https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
lgb_params = {

    # Core Parameters
    'objective': 'multiclass',
    #   - Must also set num_class
    # 'boosting': 'gbdt',  # Default
    'boosting': 'rf',
    #   - XXX Ah crap, rf mode doesn't allow multiclass:
    #       - https://github.com/Microsoft/LightGBM/blob/v2.1.2/src/boosting/rf.hpp#L40
    #       - https://github.com/Microsoft/LightGBM/issues/881
    #   - References for rf mode
    #       - https://github.com/Microsoft/LightGBM/blob/master/docs/FAQ.rst -> search "forest"
    #       - https://github.com/Microsoft/LightGBM/issues/691
    #       - Discussion on sampling without (lgb rf) vs. with (typical rf) replacement:
    #           - https://github.com/Microsoft/LightGBM/pull/884
    #           - https://github.com/Microsoft/LightGBM/issues/883
    #           - https://github.com/Microsoft/LightGBM/issues/1038
    #       - https://github.com/Microsoft/LightGBM/issues/47
    #       - https://github.com/Microsoft/LightGBM/issues/1431
    #       - Code
    #           - https://github.com/Microsoft/LightGBM/pull/678
    #           - https://github.com/Microsoft/LightGBM/blob/v2.1.2/src/boosting/rf.hpp
    # 'num_iterations': 100,
    #   - [Deprecated: moved to .train]
    # 'learning_rate': 0.1,  # Default
    #   - Overridden to 1.0 in rf mode [https://github.com/Microsoft/LightGBM/blob/v2.1.2/src/boosting/rf.hpp#L43]
    # 'num_leaves': 31,
    # 'tree_learner': 'serial',
    #   - Single machine, multicore: 'serial' (default)
    #   - [ignore] Distributed training: 'feature' | 'data' | 'voting'
    #   - https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst
    # 'num_threads': 0,
    # 'device_type': 'cpu',
    #   - 'cpu' (default) | 'gpu'
    #   - Would gpu need rebuild? https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-gpu-version
    #   - "Note: it is recommended to use the smaller max_bin (e.g. 63) to get the better speed up" (below)
    #   - "Note: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the
    #     accuracy for some tasks. You can set gpu_use_dp=true to enable 64-bit float point, but it will slow down
    #     the training"
    'seed': 0,
    #   - Generates all seeds (e.g. data_random_seed)

    # Learning Control Parameters
    # 'max_depth': -1,
    #   - -1 = no limit
    'min_data_in_leaf': 1,
    #   - 20 (default) is too big when data is small [https://github.com/Microsoft/LightGBM/issues/907]
    # 'bagging_fraction': 1.0,  # Default
    'bagging_fraction': 0.632,  # Simulate data exposure from sampling with replacement, like rf
    #   - Requires bagging_freq > 0
    #   - Sample data per tree, without replacement; see these issues for with/without replacement discussion:
    #       - https://github.com/Microsoft/LightGBM/pull/884
    #       - https://github.com/Microsoft/LightGBM/issues/883
    #       - https://github.com/Microsoft/LightGBM/issues/1038
    # 'bagging_freq': 0,  # Default
    'bagging_freq': 1,
    #   - Bag at every kth iteration (0 = disable)
    #   - Requires bagging_fraction < 1
    # 'feature_fraction': 1.0,  # Default
    'feature_fraction': np.sqrt(f) / f,  # Like sk's default max_features='auto' behavior
    #   - Default: 1.0
    #   - Sample features per tree
    # 'early_stopping_round': 0,
    #   - [Deprecated: moved to .train]
    # 'max_delta_step': 0.0,
    #   - "used to limit the max output of tree leaves"
    #   - "<= 0 means no constraint"
    #   - "the final max output of leaves is learning_rate * max_delta_step"
    # 'lambda_l1': 0.0,
    #   - L1 regularization
    # 'lambda_l2': 0.0,
    #   - L2 regularization
    # 'min_gain_to_split': 0.0,
    #   - "the minimal gain to perform split"

    # IO Parameters
    #   - TODO Grok more of these: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst#io-parameters
    'verbosity': 1,
    'max_bin': 63,
    #   - Default: 255
    #   - "it is recommended to use the smaller max_bin (e.g. 63) to get the better speed up" (from cpu/gpu)
    #   - "max number of bins that feature values will be bucketed in"
    #   - "small number of bins may reduce training accuracy but may increase general power (deal with over-fitting)"
    #   - "LightGBM will auto compress memory according to max_bin. For example, LightGBM will use uint8_t for
    #     feature value if max_bin=255"
    # 'min_data_in_bin': 3,
    #   - "use this to avoid one-data-one-bin (potential over-fitting)"
    # 'bin_construct_sample_cnt': 200000,
    #   - "number of data that sampled to construct histogram bins"
    #   - "setting this to larger value will give better training result, but will increase data loading time"
    #   - "set this to larger value if data is very sparse"
    #   - TODO Tune with len(X)
    # 'two_round': False,
    #   - "set this to true if data file is too big to fit in memory"
    # 'save_binary': False,
    #   - "save the dataset (including validation data) to a binary file. This speed ups the data loading for the next time"
    # 'enable_load_from_binary_file': True,
    #   - "set this to true to enable autoloading from previous saved binary datasets"

    # Objective Parameters
    'num_class': len(classes),
    #   - Default: 1
    #   - Required for objective='multiclass'

    # Metric Parameters
    #   - [ignore]

    # Network Parameters
    #   - [ignore]

    # GPU Parameters
    #   - [ignore]

}

In [None]:
# Train the booster, scoring both splits each round
#   - evals_result is filled in by lgb.train with the per-round metrics of each valid set
evals_result = {}
gbm = lgb.train(
    lgb_params,
    lgb_train,
    **lgb_train_kwargs,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'test'],
    evals_result=evals_result,
    verbose_eval=1,
    #   - Print every n rounds
    keep_training_booster=True,
    #   - Default: False
    #   - Whether to retain memory for further training [https://github.com/Microsoft/LightGBM/issues/668]
    #   - FIXME .save_model only includes params if keep_training_booster=True
    #       - Seems like it was intended to include params in both cases? [https://github.com/Microsoft/LightGBM/issues/1364]
)
display(
    # evals_result,  # evals_result['test']['multi_logloss']: np.ndarray, same as the verbose output
)

LightGBMError: Check failed: num_tree_per_iteration_ == 1 at /Users/travis/miniconda3/conda-bld/lightgbm_1530780821674/work/compile/src/boosting/rf.hpp, line 41 .


In [None]:
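# XXX Ah crap, rf mode doesn't allow multiclass:
#   - rf mode checks num_tree_per_iteration_ == 1 (the check that failed above), but multiclass builds num_class
#     trees per iteration
#   - https://github.com/Microsoft/LightGBM/blob/v2.1.2/src/boosting/rf.hpp#L40
#   - https://github.com/Microsoft/LightGBM/issues/881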