In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, Booster
import lightgbm as lgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

print(f"lightgbm version: {lgb.__version__}")

lightgbm version: 3.2.1


In [2]:

# Load the data
# Replace 'train.csv' and 'test.csv' with your actual file paths
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Assuming 'target' is your target variable
# Modify these according to your actual feature and target columns
X = train_data.drop('target', axis=1)
y = train_data['target']

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [3]:
# make pipline with lgbm classifier
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("lgbm", LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, num_leaves=31, random_state=42))                      
])

In [4]:

# Train the model
pipeline.fit(
    X_train, y_train,
    lgbm__eval_set=[(X_val, y_val)],
    lgbm__verbose=False,

)

# Make predictions on validation set
val_predictions = pipeline.predict(X_val)

# Evaluate binary classification model
val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_predictions)

print()
print(f"Test Acc: {val_accuracy:.4f}")
print(f"Test AUC: {val_roc_auc:.4f}")



X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions on test set
test_predictions = pipeline.predict(X_test)

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)


print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

model1_proba = pipeline.predict_proba(X_test)

print(model1_proba[:10])


Test Acc: 0.9350
Test AUC: 0.9349

Test Acc: 0.9510
Test AUC: 0.9510
[[0.00803537 0.99196463]
 [0.05653244 0.94346756]
 [0.04401297 0.95598703]
 [0.99335159 0.00664841]
 [0.84520952 0.15479048]
 [0.00725711 0.99274289]
 [0.00394439 0.99605561]
 [0.97750684 0.02249316]
 [0.03604028 0.96395972]
 [0.13477993 0.86522007]]


In [5]:
print(pipeline)
print(pipeline.named_steps['lgbm'])

Pipeline(steps=[('scaler', StandardScaler()),
                ('lgbm', LGBMClassifier(max_depth=5, random_state=42))])
LGBMClassifier(max_depth=5, random_state=42)


In [6]:
lgbm_model = pipeline.named_steps['lgbm']

for p in dir(lgbm_model):
    if not p.startswith("__") and not callable(getattr(lgbm_model, p)):
        print(p, getattr(lgbm_model, p))

_Booster <lightgbm.basic.Booster object at 0xffff0d95f610>
_base_doc 
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Input feature matrix.
    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in regression).
    sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
        Weights of training data.
    init_score : array-like of shape = [n_samples] or None, optional (default=None)
        Init score of training data.
    eval_set : list or None, optional (default=None)
        A list of (X, y) tuple pairs to use as validation sets.
    eval_names : list of strings or None, optional (default=None)
        Names of eval_set.
    eval_sample_weight : list of arrays or None, optional (default=None)
        Weights of eval data.
    eval_class_weight : list or None, optional 

In [7]:
# save model to pickle file
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

#save the model tree structure                  
pipeline.named_steps["lgbm"].booster_.save_model('model.txt')
print("Model saved as model.txt")

Model saved as model.txt


In [8]:
booster_model = Booster(
    model_file='model.txt'
)

# restore pipeline from pickle file
with open('pipeline.pkl', 'rb') as f:
    pipeline2 = pickle.load(f)


# create stub classier
model2 = LGBMClassifier()

# populae the stub classifier with the attributes of the original model
mdl = pipeline2.named_steps['lgbm']
for p in dir(mdl):
    if not p.startswith("__") and not callable(getattr(mdl, p)):
        print(p, getattr(mdl, p))
        try:
            setattr(model2, p, getattr(mdl, p))
        except:
            print(f">>>>>>Error setting {p}")


# Load the booster model into the new classifier
model2._Booster = booster_model
model2.fitted_ = True

pipeline2.set_params(lgbm=model2)


X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

# Make predictions on test set
test_predictions = pipeline2.predict(X_test)

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)

print()
print(f"Test Acc: {test_accuracy:.4f}")
print(f"Test AUC: {test_roc_auc:.4f}")

# Get probability predictions
model2_proba = pipeline2.predict_proba(X_test)

# Check if the probability predictions are the same
print(f"probabilities the same {np.allclose(model1_proba, model2_proba)}")

_Booster <lightgbm.basic.Booster object at 0xffff07c79a50>
_base_doc 
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Input feature matrix.
    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in regression).
    sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
        Weights of training data.
    init_score : array-like of shape = [n_samples] or None, optional (default=None)
        Init score of training data.
    eval_set : list or None, optional (default=None)
        A list of (X, y) tuple pairs to use as validation sets.
    eval_names : list of strings or None, optional (default=None)
        Names of eval_set.
    eval_sample_weight : list of arrays or None, optional (default=None)
        Weights of eval data.
    eval_class_weight : list or None, optional 