In [1]:
# ...existing code...
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset and print its schema summary.
Kepler_data = pd.read_csv('Combined.csv')
Kepler_data.info()

# Separate the target column from the feature columns.
y = Kepler_data['koi_disposition']
X = Kepler_data.drop(columns=['koi_disposition'])

# Encode the target as integer class indices; fit() returns the encoder itself.
le = LabelEncoder().fit(y)
y_trf = le.transform(y)
print(le.classes_)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_trf,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

class OOFStackingClassifier(ClassifierMixin, BaseEstimator):
    """
    Out-of-fold (OOF) stacking classifier for binary targets.

    Each base model is trained on K-1 stratified folds and scores the held-out
    fold; the resulting out-of-fold score columns are the training features of
    the meta-model. After that, every base model is refit on the full training
    set and kept for inference.

    Parameters
    ----------
    base_models : list of estimators or None
        Level-0 learners. When None, a default set (random forest, XGBoost,
        gradient boosting, LightGBM, RBF SVC) is built in ``__init__``.
    meta_model : estimator or None
        Level-1 learner. When None, a 200-tree random forest is used.
        NOTE: this object itself is fitted in place by ``fit`` (it is not cloned).
    n_splits : int
        Number of stratified folds used to build the OOF features.
    random_state : int or None
        Seed for the fold shuffling and for the default learners.

    Attributes set by ``fit``
    -------------------------
    classes_ : the two distinct labels seen in ``y``, sorted.
    fitted_base_models : clones of ``base_models`` refit on the full data.
    """

    # The mixin is listed before BaseEstimator, which is the base order
    # scikit-learn documents for custom estimators.
    def __init__(self, base_models=None, meta_model=None, n_splits=5, random_state=42):
        self.n_splits = n_splits
        self.random_state = random_state

        # default base learners
        if base_models is None:
            rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                        max_samples=0.8, min_samples_leaf=1, n_estimators=500,
                                        oob_score=True, random_state=self.random_state)
            # `use_label_encoder` is intentionally not passed: the XGBoost build
            # this notebook ran against reports it as an unused parameter.
            xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                subsample=1, colsample_bytree=1,
                                eval_metric='logloss', random_state=self.random_state)
            gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=500,
                                            subsample=1, random_state=self.random_state)
            lgb = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                                 subsample=0.8, colsample_bytree=0.8, random_state=self.random_state)
            svc = SVC(C=2.0, kernel='rbf', probability=True, random_state=self.random_state)
            base_models = [rf, xgb, gb, lgb, svc]

        # default meta learner
        if meta_model is None:
            meta_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=self.random_state)

        self.base_models = base_models
        self.meta_model = meta_model
        self.fitted_base_models = []

    @staticmethod
    def _positive_class_score(model, X):
        """Return one score per row for the second (positive) class of a fitted model.

        Preference order: ``predict_proba`` column 1, then a logistic squashing
        of ``decision_function``, then the raw ``predict`` output.
        """
        if hasattr(model, "predict_proba"):
            return model.predict_proba(X)[:, 1]
        if hasattr(model, "decision_function"):
            # map unbounded margins to (0, 1) with a logistic function
            margins = model.decision_function(X)
            return 1 / (1 + np.exp(-margins))
        return model.predict(X)

    def fit(self, X, y):
        """Build OOF meta-features, fit the meta-model, then refit all base models.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), exactly two distinct labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``base_models`` is empty or ``y`` does not contain exactly two classes.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if not self.base_models:
            raise ValueError("OOFStackingClassifier needs at least one base model.")

        # Work on 0/1 indices internally so column 1 of every predict_proba is
        # always the second entry of classes_, whatever the original labels are.
        classes, y_idx = np.unique(y, return_inverse=True)
        if classes.shape[0] != 2:
            raise ValueError(
                "OOFStackingClassifier supports binary targets only; "
                f"got {classes.shape[0]} distinct classes."
            )
        self.classes_ = classes

        n_samples = X.shape[0]
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        # Materialise the folds once so every base model is scored on the same splits.
        folds = list(skf.split(X, y_idx))

        oof_preds = np.zeros((n_samples, len(self.base_models)))

        # Out-of-fold predictions for each base model
        for col, model in enumerate(self.base_models):
            for train_idx, val_idx in folds:
                fold_model = clone(model)
                fold_model.fit(X[train_idx], y_idx[train_idx])
                oof_preds[val_idx, col] = self._positive_class_score(fold_model, X[val_idx])

        # Train meta-model on OOF predictions
        self.meta_model.fit(oof_preds, y_idx)

        # Retrain base models on full dataset and save them
        self.fitted_base_models = [clone(m).fit(X, y_idx) for m in self.base_models]

        return self

    def predict_proba(self, X):
        """Return meta-model class probabilities of shape (n_samples, 2).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit`` (this is also a ``ValueError`` subclass).
        """
        if not self.fitted_base_models:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This OOFStackingClassifier instance is not fitted yet; call fit() first."
            )

        X = np.asarray(X)
        # build meta-features from fully trained base models
        meta_features = np.column_stack([
            self._positive_class_score(m, X) for m in self.fitted_base_models
        ])
        # return meta-model probabilities if available, else wrap single-column scores
        if hasattr(self.meta_model, "predict_proba"):
            return self.meta_model.predict_proba(meta_features)
        scores = self.meta_model.predict(meta_features)
        # ensure shape (n_samples, 2)
        return np.column_stack([1 - scores, scores])

    def predict(self, X):
        """Return hard labels: ``classes_[1]`` where P(positive) > 0.5, else ``classes_[0]``."""
        probs = self.predict_proba(X)
        positive = (probs[:, 1] > 0.5).astype(int)
        # Fall back to raw 0/1 indices for instances that carry no classes_
        # (e.g. objects restored from a pickle made before classes_ existed).
        classes = getattr(self, "classes_", None)
        return positive if classes is None else classes[positive]

# Example usage (keeps previous notebook variables/flow)
# ...existing code...
# Build the stacker with its built-in default learners and train it;
# fit() returns the estimator itself, so the calls can be chained.
stack_clf = OOFStackingClassifier(n_splits=5, random_state=42).fit(X_train, y_train)

# Column-1 probabilities and hard labels for the held-out split.
y_pred_proba = stack_clf.predict_proba(X_test)[:, 1]
y_pred = stack_clf.predict(X_test)

accuracy_score(y_test, y_pred)
# ...existing code...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9716 entries, 0 to 9715
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   koi_period       9716 non-null   float64
 1   koi_duration     9716 non-null   float64
 2   koi_depth        9716 non-null   float64
 3   koi_model_snr    9716 non-null   float64
 4   koi_prad         9716 non-null   float64
 5   koi_steff        9716 non-null   float64
 6   koi_srad         9716 non-null   float64
 7   koi_disposition  9716 non-null   float64
dtypes: float64(8)
memory usage: 607.4 KB
[0. 1.]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3668, number of negative: 2549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6217, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589995 -> initscore=0.363945
[LightGBM] [Info] Start training from score 0.363945




[LightGBM] [Info] Number of positive: 3668, number of negative: 2549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6217, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589995 -> initscore=0.363945
[LightGBM] [Info] Start training from score 0.363945




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 4585, number of negative: 3187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 7772, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589938 -> initscore=0.363710
[LightGBM] [Info] Start training from score 0.363710




0.8955761316872428

In [3]:
# ...existing code...
import pickle

# Serialise the fitted stacker to disk through an explicit binary file handle.
# NOTE: pickle stores classes by reference, so OOFStackingClassifier must be
# defined/importable in the session that later loads this file.
with open('Stacked.pkl', 'wb') as pkl_file:
    pickle.dump(stack_clf, pkl_file)
# ...existing code...