# MUSHROOM3a
## 0.98440

In [2]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from packaging import version
import sklearn
from sklearn import preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PowerTransformer, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import FunctionTransformer

assert sys.version_info >= (3, 7)
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

# Custom transformer to shift values
class ShiftTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, shift=1.0):
        self.shift = shift

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X + self.shift

    def inverse_transform(self, X):
        return X - self.shift

# Load train.csv and apply sample size since the dataset is huge
mushroom_df = pd.read_csv('train.csv').sample(frac=0.1, random_state=42)

# Removing Duplicates
mushroom_df.drop_duplicates(inplace=True)

# Modifying Column Names for Better Readability
mushroom_df.columns = mushroom_df.columns.str.replace('-', '_')

# Assign Feature variables and target variable
X = mushroom_df.drop('class', axis=1)
y = mushroom_df['class']

# Encode the target variable
label_encoder = preprocessing.LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define attributes
surrogate_attribs = ["id"]
cat_attribs = ['cap_shape', 'cap_surface', 'cap_color', 'does_bruise_or_bleed', 'gill_attachment', 'gill_spacing', 'gill_color', 'stem_root', 'stem_surface', 'stem_color', 'veil_type', 'veil_color', 'has_ring', 'ring_type', 'spore_print_color', 'habitat', 'season']
power_attribs = ['cap_diameter', 'stem_height', 'stem_width']

cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=True)),  # Keep sparse matrix output
    ("to_dense", FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),  # Convert to dense format
    ("imputer", KNNImputer(n_neighbors=15)),  # Apply KNN imputation on the dense data
])

power_pipeline = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("shift_up", ShiftTransformer(shift=abs(mushroom_df[power_attribs].min().min()) + 1)),
    ("log_transform", PowerTransformer(method='box-cox')),
    ("shift_down", ShiftTransformer(shift=-(abs(mushroom_df[power_attribs].min().min()) + 1))),
    ("scaler", RobustScaler())
])

# Define the ColumnTransformer to apply different transformations to different columns
preprocessing = ColumnTransformer([
    ("surrogate", "passthrough", surrogate_attribs),
    ("categorical", cat_pipeline, cat_attribs),
    ("power_transformer", power_pipeline, power_attribs),
])

# Define models with updated hyperparameters
catboost_model = CatBoostClassifier(depth=12, iterations=500, learning_rate=0.07, verbose=0)
random_forest_model = RandomForestClassifier()
lgbm_model = LGBMClassifier(num_leaves=31, n_estimators=500, learning_rate=0.05)

# Split data into training and validation sets for proper fitting
X_train, X_valid, y_train, y_valid = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Apply preprocessing to training and validation data
X_train_preprocessed = preprocessing.fit_transform(X_train)
X_valid_preprocessed = preprocessing.transform(X_valid)

# Load test data
X_test = pd.read_csv('test.csv')

# Modifying Column Names
X_test.columns = X_test.columns.str.replace('-', '_')

# Apply preprocessing to test data
X_test_preprocessed = preprocessing.transform(X_test)


# Define the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('random_forest', random_forest_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=RandomForestClassifier(),
    passthrough=True
)

# Fit the stacking classifier to the training data
stacking_clf.fit(X_train_preprocessed, y_train)


# Directly apply the fitted stacking model to make predictions
y_pred_stack_encoded = stacking_clf.predict(X_test_preprocessed)
y_pred_stack = label_encoder.inverse_transform(y_pred_stack_encoded)

# Save predictions to CSV file
predictions_stack_df = pd.DataFrame({'id': X_test['id'], 'class': y_pred_stack})
predictions_stack_df.to_csv('Mushroom3a.csv', index=False)

print("Predictions saved to Mushroom3a.csv")
    

[LightGBM] [Info] Number of positive: 136372, number of negative: 112983
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1270
[LightGBM] [Info] Number of data points in the train set: 249355, number of used features: 129
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546899 -> initscore=0.188149
[LightGBM] [Info] Start training from score 0.188149
[LightGBM] [Info] Number of positive: 109098, number of negative: 90386
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1270
[LightGBM] [Info] Number of data points in the train set: 199484, number of used features: 129
[LightGBM] [I