In [16]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin , clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.utils.validation import check_is_fitted



In [13]:


class BinaryClassifier(BaseEstimator, TransformerMixin):
    """Thin estimator wrapper that delegates to a binary classifier.

    Parameters
    ----------
    model : estimator or None, default=None
        Classifier that does the actual work. When omitted, a
        default-configured ``LGBMClassifier`` is created, so
        ``BinaryClassifier()`` behaves as it did before this parameter
        existed. Because ``model`` is a constructor parameter, it is visible
        to ``get_params`` / ``set_params`` (e.g. ``model__n_estimators``).

    Notes
    -----
    NOTE(review): ``TransformerMixin`` is inherited although no ``transform``
    method is defined here, so the mixin's ``fit_transform`` presumably cannot
    succeed. The bases are left untouched to keep ``isinstance`` checks stable.
    """

    def __init__(self, model=None):
        # Resolved eagerly so ``self.model`` is always a usable estimator,
        # exactly as callers of the zero-argument constructor expect.
        self.model = model if model is not None else LGBMClassifier()

    def fit(self, X, y):
        """Train the wrapped model in place on ``X`` / ``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)`` output.

        Raises ``AttributeError`` if the wrapped model has no
        ``predict_proba`` method.
        """
        return self.model.predict_proba(X)

class MulticlassClassifier(BaseEstimator, TransformerMixin):
    """Thin estimator wrapper that delegates to a multiclass classifier.

    Parameters
    ----------
    model : estimator or None, default=None
        Classifier that does the actual work. When omitted, a
        default-configured ``ExtraTreesClassifier`` is created, so
        ``MulticlassClassifier()`` behaves as it did before this parameter
        existed. Because ``model`` is a constructor parameter, it is visible
        to ``get_params`` / ``set_params`` (e.g. ``model__n_estimators``).

    Notes
    -----
    NOTE(review): ``TransformerMixin`` is inherited although no ``transform``
    method is defined here, so the mixin's ``fit_transform`` presumably cannot
    succeed. The bases are left untouched to keep ``isinstance`` checks stable.
    """

    def __init__(self, model=None):
        # Resolved eagerly so ``self.model`` is always a usable estimator,
        # exactly as callers of the zero-argument constructor expect.
        self.model = model if model is not None else ExtraTreesClassifier()

    def fit(self, X, y):
        """Train the wrapped model in place on ``X`` / ``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)`` output.

        Raises ``AttributeError`` if the wrapped model has no
        ``predict_proba`` method.
        """
        return self.model.predict_proba(X)


In [3]:
# Load the modelling table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='final.csv')

In [14]:

class ProbabilityExtractor(BaseEstimator, TransformerMixin):
    """Expose one column of a classifier's ``predict_proba`` output as a feature.

    ``model`` is stored as given and is never trained by this class, so it
    must already be able to answer ``predict_proba`` when ``transform`` runs.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """No-op: nothing is learned from ``X`` / ``y``; returns ``self``."""
        return self

    def transform(self, X):
        """Return column 1 of ``model.predict_proba(X)`` reshaped to ``(n_rows, 1)``."""
        second_column = self.model.predict_proba(X)[:, 1]
        # reshape(-1, 1) turns the 1-D slice into a single-column 2-D array.
        return second_column.reshape(-1, 1)

# Targets: a binary status flag and a multiclass status code.
y_binary = df['status_binary']
y_multiclass = df['status_encoded_multiclass']

# Features are every remaining column once both targets are removed.
X = df.drop(columns=['status_binary', 'status_encoded_multiclass'])

# One shared 80/20 split so features and both targets stay row-aligned.
# NOTE(review): the split is not stratified; the recorded class supports are
# heavily imbalanced, so consider stratifying on the multiclass target.
(
    X_train, X_test,
    y_binary_train, y_binary_test,
    y_multiclass_train, y_multiclass_test,
) = train_test_split(X, y_binary, y_multiclass, test_size=0.2, random_state=42)



In [15]:
class AugmentWithBinaryProb(BaseEstimator, TransformerMixin):
    """Append a binary classifier's probability column to the feature matrix.

    A clone of ``estimator`` is trained on the auxiliary binary target passed
    to ``fit`` as ``y_binary``. ``transform`` returns the input columns
    followed by one extra column holding ``predict_proba(X)[:, 1]``.

    Parameters
    ----------
    estimator : classifier exposing ``predict_proba``, default=None
        Template for the binary model; ``LGBMClassifier()`` when omitted.
    cv : int, cross-validation splitter or None, default=None
        Controls the probability column returned by ``fit_transform`` (the
        call a ``Pipeline`` makes on intermediate steps while fitting).
        ``None`` keeps the original behaviour: the fitted model scores the
        very rows it was trained on. Any other value is forwarded to
        ``cross_val_predict`` so every training row is scored by a model that
        did not see it, which keeps the downstream estimator from learning on
        in-sample probabilities. ``transform`` is unaffected by this setting.

    Attributes
    ----------
    estimator_ : estimator
        Clone of ``estimator`` fitted on all rows given to ``fit``.
    """

    def __init__(self, estimator=None, cv=None):
        self.estimator = estimator if estimator is not None else LGBMClassifier()
        self.cv = cv

    def fit(self, X, y=None, y_binary=None):
        """Fit a fresh clone of ``estimator`` on ``(X, y_binary)``.

        ``y`` is accepted for pipeline compatibility and ignored.

        Raises
        ------
        ValueError
            If ``y_binary`` is not supplied.
        """
        if y_binary is None:
            raise ValueError("y_binary must be provided via fit(..., y_binary=...)")
        self.estimator_ = clone(self.estimator).fit(X, y_binary)
        return self

    def fit_transform(self, X, y=None, y_binary=None):
        """Fit on ``(X, y_binary)`` and return the augmented training matrix.

        With ``cv=None`` this equals ``fit(...).transform(X)``. Otherwise the
        appended column holds out-of-fold probabilities, while ``estimator_``
        is still fitted on all rows for later ``transform`` calls.
        """
        # Fit first: this validates y_binary and populates estimator_.
        self.fit(X, y, y_binary=y_binary)
        if self.cv is None:
            return self.transform(X)
        # cross_val_predict clones the estimator for every fold itself.
        out_of_fold = cross_val_predict(
            self.estimator, X, y_binary, cv=self.cv, method="predict_proba"
        )
        return self._append_probability(X, out_of_fold)

    def transform(self, X):
        """Return ``X`` with the fitted model's probability column appended."""
        # NotFittedError subclasses both ValueError and AttributeError, so
        # callers that caught the old AttributeError keep working.
        check_is_fitted(self, "estimator_")
        return self._append_probability(X, self.estimator_.predict_proba(X))

    @staticmethod
    def _append_probability(X, proba):
        """Stack column 1 of ``proba`` to the right of ``X`` as one new column."""
        return np.hstack([X, proba[:, 1].reshape(-1, 1)])

# Stage 1 appends the binary model's probability as an extra feature;
# stage 2 is the multiclass classifier trained on the widened matrix.
augment_step = AugmentWithBinaryProb(estimator=LGBMClassifier())
multiclass_step = ExtraTreesClassifier(random_state=42)

combined_pipeline = Pipeline(steps=[
    ("augment", augment_step),
    ("multiclass_clf", multiclass_step),
])

# The "<step>__<param>" key routes the binary labels to the augment step's
# fit(); the multiclass labels are the pipeline's ordinary ``y``.
# NOTE(review): during fit the augment step scores the same rows its binary
# model was trained on, so the multiclass model sees in-sample probabilities.
augment_fit_params = {"augment__y_binary": y_binary_train}
combined_pipeline.fit(X_train, y_multiclass_train, **augment_fit_params)


[LightGBM] [Info] Number of positive: 35770, number of negative: 2324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 38094, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.938993 -> initscore=2.733820
[LightGBM] [Info] Start training from score 2.733820


In [8]:
# Score the fitted pipeline on the held-out split.
# NOTE(review): this cell's recorded execution count (8) predates the pipeline
# cell's (15); re-run it after refitting so the metrics match the current model.
y_pred = combined_pipeline.predict(X_test)

# Headline metrics for the multiclass target.
acc = accuracy_score(y_multiclass_test, y_pred)
bacc = balanced_accuracy_score(y_multiclass_test, y_pred)
f1m, f1w = (
    f1_score(y_multiclass_test, y_pred, average=averaging)
    for averaging in ("macro", "weighted")
)

# The trailing "\n" on the last entry leaves one blank line before the report.
summary_lines = [
    f"Accuracy           : {acc:.4f}",
    f"Macro F1           : {f1m:.4f}",
    f"Weighted F1        : {f1w:.4f}",
    f"Balanced Accuracy  : {bacc:.4f}\n",
]
print("\n".join(summary_lines))

print("Classification report:")
print(classification_report(y_multiclass_test, y_pred))

Accuracy           : 0.9150
Macro F1           : 0.2944
Weighted F1        : 0.8978
Balanced Accuracy  : 0.2830

Classification report:
              precision    recall  f1-score   support

           0       0.15      0.06      0.08       398
           1       0.14      0.06      0.09       208
           2       0.10      0.04      0.05        28
           3       0.94      0.98      0.96      8890

    accuracy                           0.91      9524
   macro avg       0.33      0.28      0.29      9524
weighted avg       0.88      0.91      0.90      9524

