In [3]:
import psutil
import os

process = psutil.Process(os.getpid())
print('Initial memory (in MB) =', process.memory_info().rss / 1e6)

import lightgbm as lgb
import numpy as np
import resource
import time
import pandas as pd

print('After imports memory (in MB) =', process.memory_info().rss / 1e6)

# Should take up 7*10MB*4 (7 columns of 10 million numbers in 8 byte float64 format) = 560 MB
y_train = np.random.random(10_000_000)
X_train = pd.DataFrame(
    data = np.random.random((10_000_000, 6)),
    # If more separete numpy arrays are used, lightgbm performs extra data copy and uses twice more memory.
    #'col1': np.random.random(10_000_000),
    #'col2': np.random.random(10_000_000),
    columns = ['col'+str(i) for i in range(6)],
)


print('With numpy arrays memory =', process.memory_info().rss / 1e6)

# Create dataset for lightgbm. 
# The dataset uses less memory than numpy, as features are somehow compressed/bucketed by quantlies or something :D. 
# `construct` actually performs the copy and compression, before that, no memory is initialized.
lgb_train = lgb.Dataset(X_train.values, y_train).construct()
del X_train, y_train

print('After constructing lgb.Dataset and freeing numpy memory =', process.memory_info().rss / 1e6)
# 300 MB on my PC means that the dataset takes around 120MB which means 
# features are probably bucketed by quantiles into 255 buckets taking 1 byte of memory per number (7x smaller) 
# and labels are kept as precise 8 byte floats.

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 2,
}
gbm = lgb.train(params,
    lgb_train,
    num_boost_round = 1,
)

print('After training memory =', process.memory_info().rss / 1e6)
print('Peak memory (when both numpy and Dataset are allocated) memory = ', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e3)

Initial memory (in MB) = 49.733632
After imports memory (in MB) = 122.273792
With numpy arrays memory = 682.84416
After constructing lgb.Dataset and freeing numpy memory = 281.919488
After training memory = 282.84928
Peak memory (when both numpy and Dataset are allocated) memory =  861.036
