In [54]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, Booster
import lightgbm as lgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Show which LightGBM release is running this notebook; later cells save a
# model to disk and load it back, so the version is worth recording in the output.
print(f"lightgbm version: {lgb.__version__}")

lightgbm version: 3.2.1


In [55]:

# Read the training and test tables from disk.
# Point these two paths at your own CSV files if they live somewhere else.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# The label column is assumed to be called 'target'; every other column is
# treated as a feature. Adjust if your schema differs.
y = train_data['target']
X = train_data.drop(columns='target')

# Hold out 20% of the training rows for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [56]:

# Configure the LightGBM classifier and fit it in one expression.
# fit() returns the estimator itself, so `model` is the fitted classifier.
# The validation split is passed as eval_set so its metric is tracked per iteration.
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=31,
    random_state=42,
).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

# Evaluate the binary classifier on the held-out validation split.
# Accuracy is computed from hard class labels.
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

# ROC AUC is a ranking metric: it must be fed continuous scores, not thresholded
# labels (hard labels collapse the ROC curve to a single operating point).
# Column 1 of predict_proba is the probability of the second (positive) class.
val_scores = model.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val, val_scores)

# These are validation metrics, so label them as such (they were printed as "Test").
print()
print(f"Val Acc: {val_accuracy:.4f}")
print(f"Val AUC: {val_roc_auc:.4f}")



# Separate features and label for the test table (same 'target' column convention
# as the training data).
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Hard class predictions for accuracy, class probabilities for ranking metrics.
# model1_proba is kept so a later cell can compare it against a reloaded model.
test_predictions = model.predict(X_test)
model1_proba = model.predict_proba(X_test)

# Evaluate the model on the test set.
test_accuracy = accuracy_score(y_test, test_predictions)
# ROC AUC needs continuous scores; passing thresholded labels reduces the curve to
# one point. Column 1 holds the probability of the second (positive) class.
test_roc_auc = roc_auc_score(y_test, model1_proba[:, 1])


print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

# Peek at the first few probability rows only; never dump the whole array.
print(model1_proba[:10])

[1]	valid_0's binary_logloss: 0.61994
[2]	valid_0's binary_logloss: 0.559848
[3]	valid_0's binary_logloss: 0.510197
[4]	valid_0's binary_logloss: 0.468044
[5]	valid_0's binary_logloss: 0.432372
[6]	valid_0's binary_logloss: 0.4022
[7]	valid_0's binary_logloss: 0.376218
[8]	valid_0's binary_logloss: 0.353479
[9]	valid_0's binary_logloss: 0.33374
[10]	valid_0's binary_logloss: 0.316868
[11]	valid_0's binary_logloss: 0.302091
[12]	valid_0's binary_logloss: 0.289117
[13]	valid_0's binary_logloss: 0.277986
[14]	valid_0's binary_logloss: 0.268068
[15]	valid_0's binary_logloss: 0.2593
[16]	valid_0's binary_logloss: 0.251863
[17]	valid_0's binary_logloss: 0.245192
[18]	valid_0's binary_logloss: 0.239468
[19]	valid_0's binary_logloss: 0.234108
[20]	valid_0's binary_logloss: 0.229936
[21]	valid_0's binary_logloss: 0.225798
[22]	valid_0's binary_logloss: 0.222088
[23]	valid_0's binary_logloss: 0.218774
[24]	valid_0's binary_logloss: 0.216127
[25]	valid_0's binary_logloss: 0.21406
[26]	valid_0's b

In [57]:
# Inspect the fitted classifier: list every non-dunder attribute that is plain
# data (not a method or other callable) together with its current value.
for p in dir(model):
    if p.startswith("__"):
        continue
    if callable(getattr(model, p)):
        continue
    print(p, getattr(model, p))

_Booster <lightgbm.basic.Booster object at 0xffff173015d0>
_base_doc 
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Input feature matrix.
    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in regression).
    sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
        Weights of training data.
    init_score : array-like of shape = [n_samples] or None, optional (default=None)
        Init score of training data.
    eval_set : list or None, optional (default=None)
        A list of (X, y) tuple pairs to use as validation sets.
    eval_names : list of strings or None, optional (default=None)
        Names of eval_set.
    eval_sample_weight : list of arrays or None, optional (default=None)
        Weights of eval data.
    eval_class_weight : list or None, optional 

In [58]:
# Persist the fitted sklearn-style classifier as a pickle file.
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# Also write the underlying booster (the tree structure) to its own file,
# independent of the Python wrapper object.
model.booster_.save_model('model.txt')
print("Model saved as model.txt")

Model saved as model.txt


In [59]:
# Rebuild a bare Booster from the model file written by the previous cell.
booster_model = Booster(model_file='model.txt')

# Restore the original classifier object from its pickle.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pkl', 'rb') as pkl_file:
    orig_model = pickle.load(pkl_file)


# Create a stub classifier (unfitted, default parameters).
model2 = LGBMClassifier()

# Populate the stub classifier with the non-callable attributes of the original
# model, so it carries the same fitted state without being trained.
for p in dir(orig_model):
    if p.startswith("__"):
        continue
    value = getattr(orig_model, p)
    if callable(value):
        continue
    print(p, value)
    try:
        setattr(model2, p, value)
    except AttributeError as exc:
        # Some attributes cannot be assigned (e.g. read-only properties). Skip
        # those, but catch only AttributeError and report why: a bare `except:`
        # would also hide KeyboardInterrupt and genuine bugs.
        print(f">>>>>>Error setting {p}: {exc}")


# Replace the copied booster with the one loaded from model.txt, and mark the
# stub as fitted so predict()/predict_proba() are allowed to run.
model2._Booster = booster_model
model2.fitted_ = True

# Separate features and label for the test table.
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Hard class predictions (for accuracy) and class probabilities (for AUC and for
# the comparison against the original model) from the rebuilt classifier.
test_predictions = model2.predict(X_test)
model2_proba = model2.predict_proba(X_test)

# Evaluate the rebuilt model on the test set.
test_accuracy = accuracy_score(y_test, test_predictions)
# ROC AUC must be computed from continuous scores; thresholded labels reduce the
# ROC curve to a single point. Column 1 is the second (positive) class probability.
test_roc_auc = roc_auc_score(y_test, model2_proba[:, 1])

print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

# Check that the rebuilt classifier reproduces the original model's probabilities
# (model1_proba comes from the cell that evaluated the original model).
print(f"probabilities the same {np.allclose(model1_proba, model2_proba)}")

_Booster <lightgbm.basic.Booster object at 0xffff1694b050>
_base_doc 
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Input feature matrix.
    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in regression).
    sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
        Weights of training data.
    init_score : array-like of shape = [n_samples] or None, optional (default=None)
        Init score of training data.
    eval_set : list or None, optional (default=None)
        A list of (X, y) tuple pairs to use as validation sets.
    eval_names : list of strings or None, optional (default=None)
        Names of eval_set.
    eval_sample_weight : list of arrays or None, optional (default=None)
        Weights of eval data.
    eval_class_weight : list or None, optional 