In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv(r'..\data\train.csv')
test = pd.read_csv(r'..\data\test.csv')


In [15]:
base_models = [
    ("rf", RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)),
    ("gb", GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)),
    ("lgbm", LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, num_leaves=30, random_state=42, n_jobs=-1))
]


In [21]:

# Initialize meta-features
meta_features_train = np.zeros((X_train.shape[0], len(base_models)))
meta_features_test = np.zeros((X_test.shape[0], len(base_models)))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_scores = {}

for i, (name, model) in enumerate(base_models):
    scores = []
    test_fold_preds = []  # Collect test set predictions from each fold
    
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        # Train model
        model.fit(X_tr, y_tr)

        # Store out-of-fold predictions for training meta-model
        meta_features_train[val_idx, i] = model.predict(X_val)

        # Collect predictions for test set (to be averaged later)
        test_fold_preds.append(model.predict(X_test))

        # Correct accuracy calculation
        acc = accuracy_score(y_val, model.predict(X_val))  # Fix here
        scores.append(acc)

    # Compute average accuracy across folds
    model_scores[name] = np.mean(scores)

    # Compute mean test set predictions over all folds
    meta_features_test[:, i] = np.mean(np.column_stack(test_fold_preds), axis=1)

print("Model Scores:", model_scores)

[LightGBM] [Info] Number of positive: 2800, number of negative: 2763
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 5563, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503326 -> initscore=0.013302
[LightGBM] [Info] Start training from score 0.013302
[LightGBM] [Info] Number of positive: 2800, number of negative: 2763
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 5563, number of used features: 35
[LightGBM] [Info] [binary:

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define Neural Network
meta_model_nn = Sequential([
    Dense(16, activation='relu', input_shape=(meta_features_train.shape[1],)),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile Model
meta_model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
meta_model_nn.fit(meta_features_train, y_train, epochs=30, batch_size=16, verbose=0)

# Predictions
final_probs = meta_model_nn.predict(meta_features_test).flatten()
final_preds = (final_probs > 0.5).astype(int)

# Evaluate
accuracy = accuracy_score(y_test, final_preds)
print(f"Neural Network Stacking Accuracy: {accuracy:.4f}")


Neural Network Stacking Accuracy: 0.7959


In [24]:
# Assign weights (higher weight for better models)
weights = [0.1, 0.1, 0.8]  # Example: If LGBM is performing best, give it more weight

# Weighted sum of probabilities
final_probs = (
    weights[0] * meta_features_test[:, 0] +
    weights[1] * meta_features_test[:, 1] +
    weights[2] * meta_features_test[:, 2]
)

# Convert probabilities to binary predictions
final_preds = (final_probs > 0.5).astype(int)

# Evaluate
accuracy = accuracy_score(y_test, final_preds)
print(f"Weighted Stacking Accuracy: {accuracy:.4f}")


Weighted Stacking Accuracy: 0.8028
