In [1]:
import pandas as pd
import numpy as np
from lightgbm import  Booster, LGBMClassifier
import pickle
import lightgbm as lgb

from sklearn.metrics import accuracy_score, roc_auc_score

print(f"lightgbm version: {lgb.__version__}")

lightgbm version: 4.6.0


In [2]:

# Load the data
test_data = pd.read_csv('data/test.csv')


## Rebuild the LGBM Claasifier

In [3]:
booster_model = Booster(
    model_file='model.txt'
)

# restore model from pickle file
with open('pipeline.pkl', 'rb') as f:
    pipeline2 = pickle.load(f)

print(pipeline2)


Pipeline(steps=[('scaler', StandardScaler()),
                ('lgbm',
                 LGBMClassifier(max_depth=5, n_jobs=-1, random_state=42))])


In [4]:
pipeline2.__dict__

{'steps': [('scaler', StandardScaler()),
  ('lgbm', LGBMClassifier(max_depth=5, n_jobs=-1, random_state=42))],
 'memory': None,
 'verbose': False}

In [5]:
pipeline2.named_steps

{'scaler': StandardScaler(),
 'lgbm': LGBMClassifier(max_depth=5, n_jobs=-1, random_state=42)}

In [6]:

# create stub classier
model2 = LGBMClassifier()

# populae the stub classifier with the attributes of the original model
mdl = pipeline2.named_steps['lgbm']
for p in dir(mdl):
    if not p.startswith("__") and not callable(getattr(mdl, p)):
        print(p, getattr(mdl, p))
        try:
            setattr(model2, p, getattr(mdl, p))
        except:
            print(f">>>>>>Error setting {p}")


# Load the booster model into the new classifier
model2._Booster = booster_model
model2.fitted_ = True

# overlay the classifier in the pipeline with the reconstructed model
# pipeline2.named_steps['lgbm'] = None #model2
pipeline2.__dict__["steps"][1] = ("lgbm", model2)

print(pipeline2)


_Booster <lightgbm.basic.Booster object at 0xffff29f75bd0>
_LGBMClassifier__is_multiclass False
>>>>>>Error setting _LGBMClassifier__is_multiclass
_base_doc 
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]
        Input feature matrix.
    y : numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]
        The target values (class labels in classification, real numbers in regression).
    sample_weight : numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)
        Weights of training data. Weights should be non-negative.
    init_score : numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_sampl

In [7]:

X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions on test set
test_predictions = pipeline2.predict(X_test)

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)

print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

# Get probability predictions
model2_proba = pipeline2.predict_proba(X_test)

print(model2_proba[:10])

# Save the model as pickle file
with open('pipeline3.pkl', 'wb') as f:
    pickle.dump(pipeline2, f)


Test Acc: 0.9510
Test AUC: 0.9510
[[0.00803537 0.99196463]
 [0.05653244 0.94346756]
 [0.04401297 0.95598703]
 [0.99335159 0.00664841]
 [0.84520952 0.15479048]
 [0.00725711 0.99274289]
 [0.00394439 0.99605561]
 [0.97750684 0.02249316]
 [0.03604028 0.96395972]
 [0.13477993 0.86522007]]


## Reuse the rebuilt LGBM Classifier

In [8]:
# restore model from pickle file
with open('pipeline3.pkl', 'rb') as f:
    pipeline3 = pickle.load(f)


# Make predictions on test set
test_predictions = pipeline3.predict(X_test)

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)

print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

# Get probability predictions
model3_proba = pipeline3.predict_proba(X_test)

print(model3_proba[:10])


print(f"probability match: {np.all(model2_proba == model3_proba)}")



Test Acc: 0.9510
Test AUC: 0.9510
[[0.00803537 0.99196463]
 [0.05653244 0.94346756]
 [0.04401297 0.95598703]
 [0.99335159 0.00664841]
 [0.84520952 0.15479048]
 [0.00725711 0.99274289]
 [0.00394439 0.99605561]
 [0.97750684 0.02249316]
 [0.03604028 0.96395972]
 [0.13477993 0.86522007]]
probability match: True
