In [1]:
import os
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import matthews_corrcoef, make_scorer
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Custom print function for logging in Kaggle
__print__ = print
def print(string):
    """Print *string* normally and also echo it through a child process.

    The message is first written by an external ``printf`` process (so it
    shows up on the process-level stdout) and then by the built-in ``print``
    saved in ``__print__``.

    Args:
        string: The object to log; it is converted with ``str()`` for the
            child process and passed unchanged to the built-in ``print``.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    try:
        # The text is handed over as a single argv entry and no shell is
        # involved, so quotes, `$`, backticks or `;` inside the message are
        # printed literally instead of being interpreted as shell syntax.
        subprocess.run(['printf', '%s\n', str(string)], check=False)
    except (OSError, ValueError):
        # Best-effort mirror of the old os.system() call, which never raised:
        # a missing binary or an un-passable argument must not stop the run.
        pass
    __print__(string)

# Matthews correlation coefficient, exposed as a custom XGBoost evaluation metric
def mcc_eval_metric(preds, dtrain):
    """Score predictions against the labels stored in *dtrain* using MCC.

    Args:
        preds: Array of model outputs; values above 0.5 count as class 1.
            (Presumably probabilities from the logistic objective.)
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        A ``(name, value)`` tuple: the string ``'mcc'`` and the computed score.
    """
    hard_preds = (preds > 0.5).astype(int)
    score = matthews_corrcoef(dtrain.get_label(), hard_preds)
    return 'mcc', score

# Read the sample submission, training table and test table from the Kaggle input directory
print('1/7: Loading datasets...')
df_sub = pd.read_csv('/kaggle/input/playground-series-s4e8/sample_submission.csv')
df_train = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')
print('1/7: Datasets loaded successfully.\n')

# Remove the 'id' column from both feature tables so it is not used as a feature
df_train, df_test = (frame.drop('id', axis=1) for frame in (df_train, df_test))

# Ordinal-encode every object-typed feature column; the target is encoded separately below
print('2/7: Encoding categorical variables...')
cat_cols_train = df_train.select_dtypes(include=['object']).columns
cat_cols_train = cat_cols_train[~cat_cols_train.isin(['class'])]

# Categories that appear only at transform time are encoded as -1 instead of raising.
# Values are cast to str first, so every cell is treated as a string category.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
df_train[cat_cols_train] = ordinal_encoder.fit_transform(
    df_train[cat_cols_train].astype(str)
)
df_test[cat_cols_train] = ordinal_encoder.transform(
    df_test[cat_cols_train].astype(str)
)

# Integer-encode the target; `le` is kept so predictions can be mapped back to labels
le = LabelEncoder()
df_train['class'] = le.fit_transform(df_train['class'])
print('2/7: Categorical variables encoded.\n')

# Separate the target column from the feature columns
y = df_train['class']
X = df_train.drop(columns=['class'])

# Draw a stratified subsample of the training data
print('3/7: Sampling data for hyperparameter tuning...')
sample_size = 100000  # sample size for tuning
X_sample, _, y_sample, _ = train_test_split(
    X,
    y,
    train_size=sample_size,
    stratify=y,
    random_state=42,
)
print(f'3/7: Sampled {sample_size} records for hyperparameter tuning.\n')

# Carve the subsample into an 80/20 stratified train/validation pair
print('4/7: Splitting sampled data into training and validation sets...')
train_X_sample, val_X_sample, train_y_sample, val_y_sample = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
    stratify=y_sample,
)
print('4/7: Data splitting completed.\n')

# Define the base model
# NOTE(review): `base_model` and `mcc_scorer` are not referenced again in the visible
# code — presumably left over from a hyperparameter search; confirm before removing.
base_model = XGBClassifier(
    objective='binary:logistic',  # Binary classification objective
    tree_method='hist',       # Histogram-based tree construction
    # NOTE(review): the previous comment said this enables the GPU; 'hist' by itself
    # does not request a GPU device — confirm whether GPU training was intended.
    random_state=42
)

# Scorer wrapping matthews_corrcoef for use with scikit-learn model selection tools
mcc_scorer = make_scorer(matthews_corrcoef)

# Hard-coded hyperparameters used for the final model
# NOTE(review): presumably the best values from an earlier tuning run — confirm.
print('5/7: Declare params...')
params = {'colsample_bytree': 0.6, 'max_depth': 14, 'min_child_weight': 7, 'n_estimators': 100}
print(f'5/7: Declare params completed.\n')

# Train the final model on the entire training data (X, y) with the declared params
print('6/7: Training the best model on full dataset...')
start_time = time.time()
# Both evaluation sets come from the sampled split of X/y, i.e. they are subsets of the
# data passed to fit() below. The logged metrics therefore track fit on already-seen
# rows and are NOT a held-out estimate of generalisation.
eval_set = [(train_X_sample, train_y_sample), (val_X_sample, val_y_sample)]
best_model = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    colsample_bytree=params['colsample_bytree'],
    max_depth=params['max_depth'],
    min_child_weight=params['min_child_weight'],
    n_estimators=params['n_estimators'],
    random_state=42
)
# verbose=True logs the metrics for each eval set per boosting round.
# NOTE(review): passing `eval_metric` to fit() depends on the installed XGBoost
# version (newer releases expect it on the constructor) — confirm before upgrading.
best_model.fit(X, y, eval_metric=mcc_eval_metric, eval_set=eval_set, verbose=True)
end_time = time.time()
print(f'6/7: Model training completed in {end_time - start_time:.2f} seconds.\n')

# Score the test table and cast the predicted class indices to plain integers
print('7/7: Making predictions on test set...')
test_preds = best_model.predict(df_test).astype(int)

# Translate the integer predictions back into the original class labels
test_pred_class = le.inverse_transform(test_preds)

# Place the labels into the submission frame (rows are taken in test-set order)
df_sub['class'] = test_pred_class

# Preview the submission before writing it out
print('Submission data sample after mapping:')
print(df_sub.head())

# Write the submission file without the DataFrame index
print('7/7: Saving predictions to submission.csv...')
df_sub.to_csv('submission.csv', index=False)
print('7/7: Predictions saved successfully to submission.csv.')

# Re-read the file that was just written as a sanity check
submission = pd.read_csv('submission.csv')
print('Saved submission file sample:')
print(submission.head())


1/7: Loading datasets...
1/7: Loading datasets...
1/7: Datasets loaded successfully.

1/7: Datasets loaded successfully.

2/7: Encoding categorical variables...
2/7: Encoding categorical variables...
2/7: Categorical variables encoded.

2/7: Categorical variables encoded.

3/7: Sampling data for hyperparameter tuning...
3/7: Sampling data for hyperparameter tuning...
3/7: Sampled 100000 records for hyperparameter tuning.

3/7: Sampled 100000 records for hyperparameter tuning.

4/7: Splitting sampled data into training and validation sets...
4/7: Splitting sampled data into training and validation sets...
4/7: Data splitting completed.

4/7: Data splitting completed.

5/7: Declare params...
5/7: Declare params...
5/7: Declare params completed.

5/7: Declare params completed.

6/7: Training the best model on full dataset...
6/7: Training the best model on full dataset...
[0]	validation_0-logloss:0.47184	validation_0-mcc:0.90167	validation_1-logloss:0.47022	validation_1-mcc:0.90778
[1]	va

## Conclusion
In this notebook, we successfully developed a model to classify mushrooms as edible or poisonous using the XGBoost algorithm. The model's performance was evaluated using the Matthews correlation coefficient (MCC), and the predictions were saved in the correct format for submission. 

By following this process, we ensured that the data was properly preprocessed, the model was accurately trained and evaluated, and the predictions were correctly formatted and saved.

Thank you for following along with this notebook. If you found this helpful, please consider giving an upvote. Your support is greatly appreciated!