In [4]:
!pip install lightgbm xgboost catboost


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.6 MB/

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tqdm import tqdm

# Load Dataset
print("Loading dataset...")
data = pd.read_csv('/content/SMOTETomek_data.csv')
print(f"Original dataset shape: {data.shape}")

# Stratified Sampling (40% of data)
print("Performing stratified sampling (40%)...")
data_sampled, _ = train_test_split(data, test_size=0.6, stratify=data['outcome_group'], random_state=42)
print(f"Sampled dataset shape: {data_sampled.shape}")

# Label Encoding for Specific Columns
print("Applying label encoding...")
label_encode_cols = ['age_group_intake', 'outcome_group']
label_encoders = {}
for col in tqdm(label_encode_cols, desc="Label Encoding"):
    le = LabelEncoder()
    data_sampled[col] = le.fit_transform(data_sampled[col])
    label_encoders[col] = le
print("Label encoding completed.")

# One-Hot Encoding for Categorical Variables
print("Applying one-hot encoding...")
one_hot_cols = ['animal_type', 'month_of_outcome', 'breed_type', 'color_group', 'intake_condition_group']
encoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), one_hot_cols)],
    remainder='passthrough'
)

encoded_data = encoder.fit_transform(data_sampled)
encoded_feature_names = encoder.get_feature_names_out()
encoded_df = pd.DataFrame(encoded_data, columns=encoded_feature_names, index=data_sampled.index)
print("One-hot encoding completed.")

# Fix Column Names
encoded_df.columns = encoded_df.columns.str.replace('remainder__', '')

# Define Target Variable
print("Defining target variable...")
X = encoded_df.drop(columns=['outcome_group'])  # Ensure 'outcome_group' is present in encoded_df
y = encoded_df['outcome_group']

# Convert categorical columns to category dtype before splitting
print("Converting categorical columns...")
for col in tqdm(one_hot_cols, desc="Categorical Conversion"):
    if col in X.columns:
        X[col] = X[col].astype('category')
print("Categorical conversion completed.")

# Stratified K-Fold Cross-Validation
print("Applying Stratified K-Fold splitting...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(skf.split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
print("Data split completed.")

# Best Model Parameters
print("Initializing model hyperparameters...")
best_lgbm_params = {'boosting_type': 'gbdt', 'num_leaves': 804, 'learning_rate': 0.20067763503155892, 'feature_fraction': 0.47078436497155735, 'bagging_fraction': 0.8647396028800656, 'bagging_freq': 3, 'min_child_samples': 5}
best_xgb_params = {'learning_rate': 0.43924978681535976, 'n_estimators': 173, 'max_depth': 12, 'min_child_weight': 2, 'gamma': 0.050045804877533295, 'subsample': 0.6562434935218406, 'enable_categorical': True}
best_catboost_params = {'learning_rate': 0.18366553939455046, 'depth': 9, 'l2_leaf_reg': 1.1598842691079305, 'bagging_temperature': 0.33121618267553227, 'iterations': 783}
print("Model hyperparameters set.")

# Best Stacking Model Configuration
print("Initializing base models...")
base_models = [
    ('lgbm', LGBMClassifier(**best_lgbm_params, random_state=42, verbose=-1)),
    ('xgb', XGBClassifier(**best_xgb_params, random_state=42, verbosity=0)),
    ('catboost', CatBoostClassifier(**best_catboost_params, random_seed=42, logging_level='Silent'))
]
print("Base models initialized.")

meta_model = SVC(probability=True, random_state=42)
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    stack_method='predict_proba'
)

# Train Stacking Model
print("Training the final stacking model...")
with tqdm(total=1, desc="Training Stacking Model") as pbar:
    stacking_clf.fit(X_train, y_train)
    pbar.update(1)
print("Model training completed.")

# Evaluate Model
print("Evaluating model...")
stacking_preds = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_preds)
print(f"Stacking Model Accuracy: {stacking_accuracy:.4f}")

# Save Model
print("Saving final stacking model...")
model_filename = 'final_stacking_model.pkl'
joblib.dump(stacking_clf, model_filename)
print(f"Final Stacking Model Saved as {model_filename}")


Loading dataset...
Original dataset shape: (240533, 11)
Performing stratified sampling (40%)...
Sampled dataset shape: (96213, 11)
Applying label encoding...


Label Encoding: 100%|██████████| 2/2 [00:00<00:00, 70.96it/s]


Label encoding completed.
Applying one-hot encoding...
One-hot encoding completed.
Defining target variable...
Converting categorical columns...


Categorical Conversion: 100%|██████████| 5/5 [00:00<00:00, 27025.15it/s]


Categorical conversion completed.
Applying Stratified K-Fold splitting...
Data split completed.
Initializing model hyperparameters...
Model hyperparameters set.
Initializing base models...
Base models initialized.
Training the final stacking model...


Training Stacking Model: 100%|██████████| 1/1 [10:37<00:00, 637.38s/it]


Model training completed.
Evaluating model...
Stacking Model Accuracy: 0.8678
Saving final stacking model...
Final Stacking Model Saved as final_stacking_model.pkl
