# Stacking 

In [4]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


class MyStackingClassifier:
    """Hand-rolled two-level stacking ensemble.

    Level 0 is a fixed set of base classifiers; level 1 is a logistic
    regression trained on the base classifiers' out-of-fold (OOF) label
    predictions.  Call ``StackingClassifier()`` to run the whole pipeline;
    afterwards ``meta_model`` holds the fitted level-1 model,
    ``test_meta_model`` holds the stacked test-set predictions it should be
    scored on, and ``base_models`` holds the fitted level-0 estimators.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the train/test split.

        Inputs are coerced to NumPy arrays because the K-fold step selects
        rows with integer index arrays (``X[idx]``); that fails on plain
        lists and selects *columns* on a pandas DataFrame.
        """
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)
        self.X_test = np.asarray(X_test)
        self.y_test = np.asarray(y_test)

        # Fitted level-1 model (set by train_level_1).
        self.meta_model = None
        # Stacked test-set predictions, i.e. the meta-model's test features
        # (set by train_level_1).  Name kept for backward compatibility.
        self.test_meta_model = None
        # Level-0 estimators fitted on the full training set
        # (set by StackingClassifier).
        self.base_models = []

    # 1. K-Fold cross-validation to generate OOF predictions
    def k_fold_cross_validation(self, model, k=5):
        """Return out-of-fold predictions of ``model`` on the training set.

        The training rows are split into ``k`` shuffled folds (seed 42).
        For each fold the model is refit on the other folds and its
        predictions for the held-out rows are written into the result, so
        every training row is predicted by a model that never saw it.

        Note: ``model`` is fitted in place; after the call it holds the fit
        from the last fold.
        """
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # One slot per training row, filled fold by fold.
        oof_preds = np.zeros(len(self.X_train))

        for train_idx, valid_idx in kf.split(self.X_train):
            X_tr, X_val = self.X_train[train_idx], self.X_train[valid_idx]
            y_tr = self.y_train[train_idx]

            # Train on the k-1 remaining folds
            model.fit(X_tr, y_tr)

            # Predict the held-out fold
            oof_preds[valid_idx] = model.predict(X_val)
        return oof_preds

    # 2. Train base model on full training data (level-0 training)
    def train_level_0(self, model):
        """Fit ``model`` on the full training set and return its test predictions."""
        model.fit(self.X_train, self.y_train)
        return model.predict(self.X_test)

    # 3. Train meta-model (level 1)
    def train_level_1(self, meta_model, train_meta_X, test_meta_X):
        """Fit ``meta_model`` on the stacked OOF predictions.

        ``train_meta_X`` is the meta-model's training matrix (fitted against
        ``self.y_train``); ``test_meta_X`` is stored untouched so the caller
        can later evaluate the meta-model on it.
        """
        meta_model.fit(train_meta_X, self.y_train)

        # Keep the fitted meta-model and its test features for final use
        self.meta_model = meta_model
        self.test_meta_model = test_meta_X

    def StackingClassifier(self):
        """Run the full stacking pipeline on the stored train/test split.

        For every base learner: collect OOF predictions on the training set
        (meta-model training features), then refit on the full training set
        and predict the test set (meta-model test features).  Finally the
        logistic-regression meta-model is fitted via ``train_level_1``.
        Returns ``None``; results are left on the instance.
        """
        # 1. Define base learners (level-0 models)
        base_learners = [
            ('dt', DecisionTreeClassifier()),
            ('knn', KNeighborsClassifier()),
            ('rf', RandomForestClassifier()),
            ('gb', GradientBoostingClassifier()),
            ('gn', GaussianNB()),
        ]

        # Final learner (meta-model / level-1 model)
        meta_learner = LogisticRegression()

        # train_meta_X -> OOF predictions from each base model
        # test_meta_X  -> test predictions from each base model
        train_meta_X = []
        test_meta_X = []

        # 2. Loop through each base learner
        for _model_name, model in base_learners:
            # 2a. OOF predictions on the training set: these TRAIN the
            #     meta-model.  Shape of each entry: (n_train_samples,)
            train_meta_X.append(self.k_fold_cross_validation(model))

            # 2b. Refit on the full training data and predict the test set:
            #     these are the meta-model's TEST features.  This refit also
            #     leaves `model` in its final, fully-trained state.
            test_meta_X.append(self.train_level_0(model))

        # 3. Stack predictions so each base model becomes one column
        train_meta_X = np.column_stack(train_meta_X)  # (n_train_samples, n_models)
        test_meta_X = np.column_stack(test_meta_X)    # (n_test_samples, n_models)

        # Retain the fitted level-0 estimators so new data can be scored.
        self.base_models = [model for _model_name, model in base_learners]

        # 4. Train meta-model (level-1 model)
        self.train_level_1(meta_learner, train_meta_X, test_meta_X)

    def predict(self, X):
        """Predict labels for ``X`` with the fitted ensemble.

        Each retained base model predicts ``X``; those predictions are
        stacked column-wise (same column order as during training) and fed
        to the meta-model.

        Raises:
            RuntimeError: if the ensemble has not been trained yet.
        """
        if self.meta_model is None or not self.base_models:
            raise RuntimeError(
                "Ensemble is not fitted; call StackingClassifier() first."
            )
        X = np.asarray(X)
        meta_X = np.column_stack([model.predict(X) for model in self.base_models])
        return self.meta_model.predict(meta_X)


from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Iris: feature matrix X and integer class labels y
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; stratify on y with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Build the hand-rolled stacking ensemble and run its full training pipeline
model = MyStackingClassifier(X_train, y_train, X_test, y_test)
model.StackingClassifier()

# Score the stacked test-set predictions with the fitted meta-model
y_pred = model.meta_model.predict(model.test_meta_model)

# Show the first ten predictions next to the truth, then overall accuracy
print("Predictions:", y_pred[:10])
print("Actual:", y_test[:10])
print("Accuracy:", accuracy_score(y_test, y_pred))


Predictions: [0 2 1 1 0 1 0 0 2 1]
Actual: [0 2 1 1 0 1 0 0 2 1]
Accuracy: 0.9666666666666667


In [7]:
# Sklearn API
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Train-test split (20% held out, stratified on the labels, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Base models (level-0 learners)
base_learners = [
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('gn', GaussianNB())
]

# Meta-model (level-1 learner)
meta_model = LogisticRegression()

# Build stacking classifier
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5  # K-Fold to generate OOF predictions
)

# Train
stack_model.fit(X_train, y_train)

# Predict. The result must be captured here: scoring a `y_pred` left over
# from an earlier cell would report another model's accuracy, not this one's.
y_pred = stack_model.predict(X_test)
print("ACCURACY", accuracy_score(y_test, y_pred))

ACCURACY 0.9666666666666667


# Blending

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Iris: feature matrix X and integer class labels y
data = load_iris()
X, y = data.data, data.target

# Outer split: 20% of the rows become the final test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Inner split: blending holds out 25% of the training rows as a validation
# set whose base-model predictions become the meta-model's training data
X_train_blend, X_val_blend, y_train_blend, y_val_blend = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
    stratify=y_train,
)

# Level-0 learners
base_learners = [
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('gn', GaussianNB()),
]

# Level-1 learner
meta_model = LogisticRegression()

# Per-model prediction vectors, collected in base_learners order
val_meta_X = []   # on the validation split
test_meta_X = []  # on the test split

# Step 1: fit every base model on the blending-train split only, then
# record its predictions for both the validation and the test rows
for name, model in base_learners:
    model.fit(X_train_blend, y_train_blend)
    val_meta_X.append(model.predict(X_val_blend))
    test_meta_X.append(model.predict(X_test))

# One column per base model
val_meta_X = np.column_stack(val_meta_X)     # shape: (n_val, n_models)
test_meta_X = np.column_stack(test_meta_X)   # shape: (n_test, n_models)

# Step 2: the meta-model learns from the validation-set predictions
meta_model.fit(val_meta_X, y_val_blend)

# Step 3: final answer = meta-model applied to the test-set predictions
final_pred = meta_model.predict(test_meta_X)

# Step 4: report accuracy on the held-out test labels
print("\nBlending Accuracy:", accuracy_score(y_test, final_pred))



Blending Accuracy: 0.9666666666666667


In [None]:
# scikit-learn does not provide a built-in blending API


# Hard Voting Classifier

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Iris: feature matrix X and integer class labels y
data = load_iris()
X, y = data.data, data.target

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The three estimators that take part in the vote
clf1 = DecisionTreeClassifier()
clf2 = KNeighborsClassifier()
clf3 = RandomForestClassifier()

# voting='hard' -> the ensemble outputs the majority class label
voting_clf = VotingClassifier(
    estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3)],
    voting='hard',
)

# Fit the ensemble, predict the held-out rows, and report accuracy
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print("Hard Voting Accuracy:", accuracy_score(y_test, y_pred))


Hard Voting Accuracy: 0.9666666666666667


In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Iris: feature matrix X and integer class labels y
data = load_iris()
X, y = data.data, data.target

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Soft voting works on class probabilities, so every estimator here
# must support predict_proba
clf1 = DecisionTreeClassifier()
clf2 = KNeighborsClassifier()
clf3 = RandomForestClassifier()

# voting='soft' -> probability-based voting across the estimators
voting_clf = VotingClassifier(
    estimators=[('dt', clf1), ('knn', clf2), ('rf', clf3)],
    voting='soft',
)

# Fit the ensemble, predict the held-out rows, and report accuracy
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print("Soft Voting Accuracy:", accuracy_score(y_test, y_pred))


Soft Voting Accuracy: 0.9333333333333333
