In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ==========================================
# 1. SETUP & PREPROCESSING
# ==========================================
print("Loading Data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

TARGET = 'personality_cluster'
ID_COL = 'participant_id'

# Separate features and target; the ID column is removed from the features.
X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET]
# errors='ignore' keeps this line from failing if the test file has no ID column.
X_test = test.drop(columns=[ID_COL], errors='ignore')

# Encode the target labels as integers 0..n_classes-1 (e.g. Cluster A-E -> 0-4).
# `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Define Preprocessing
# - Numeric: Median Impute + Scale
# - Categorical: Frequent Impute + OneHotEncode
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not just
# int64/float64. Columns that appear in neither list are discarded by
# remainder='drop' below, so a narrower selection would silently lose features.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop'
)

# Helper function to Train, Tune, and Save
def run_model(name, model, param_dist):
    """Tune ``model`` with a randomized search, predict the test set, save a CSV.

    The model is appended to the module-level ``preprocessor`` in a pipeline,
    tuned on the module-level ``X`` / ``y_encoded``, and the best estimator is
    used to predict ``X_test``. Predictions are mapped back to the original
    labels with ``le`` and written to ``submission_<name>.csv``.

    Args:
        name: Display name; also used (lower-cased, spaces replaced by
            underscores) to build the submission file name.
        model: Unfitted estimator placed in the pipeline's ``'model'`` step.
        param_dist: Parameter distributions for ``RandomizedSearchCV``; keys
            address pipeline steps (e.g. ``'model__depth'``).

    Returns:
        None. Results are printed and the submission is written to disk.
    """
    print(f"\n=== Training {name} ===")

    # Create Pipeline
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fast Hyperparameter Tuning
    search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=10,             # 10 random combinations
        scoring='f1_weighted', # Suitable for Multiclass
        cv=3,                  # 3-Fold CV
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    # Fit
    search.fit(X, y_encoded)
    print(f"  Best Params: {search.best_params_}")
    print(f"  Best CV Score: {search.best_score_:.4f}")

    # Predict on Test
    best_model = search.best_estimator_
    # CatBoost may return shape (N, 1); asarray + ravel gives a 1-D array for
    # any array-like return value, not only ndarrays.
    preds_encoded = np.asarray(best_model.predict(X_test)).ravel()

    # Map integer class ids back to the original target labels.
    preds_labels = le.inverse_transform(preds_encoded.astype(int))

    # Save Submission; column names come from the shared constants so they
    # cannot drift from the ones used when loading the data.
    filename = f"submission_{name.lower().replace(' ', '_')}.csv"
    sub = pd.DataFrame({
        ID_COL: test[ID_COL],
        TARGET: preds_labels
    })
    sub.to_csv(filename, index=False)
    print(f"Saved: {filename}")

# ==========================================
# 2. CATBOOST DEFINITION (Multiclass)
# ==========================================

# Search space for CatBoost. Each key carries the 'model__' prefix so the
# value is applied to the pipeline step named 'model' inside run_model.
cat_params = dict(
    model__learning_rate=[0.01, 0.03, 0.05, 0.1],
    model__depth=[4, 6, 8],
    model__iterations=[500, 1000],
    model__l2_leaf_reg=[1, 3, 5, 7],  # Regularization helps prevent overfitting
)

# Tune, predict and write the submission file for the CatBoost classifier.
run_model(
    name='CatBoost',
    model=CatBoostClassifier(
        loss_function='MultiClass',
        eval_metric='Accuracy',
        thread_count=-1,
        random_state=42,
        allow_writing_files=False,
        verbose=0,
    ),
    param_dist=cat_params,
)

print("\nCatBoost training complete!")

Loading Data...

=== Training CatBoost ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits
  Best Params: {'model__learning_rate': 0.05, 'model__l2_leaf_reg': 5, 'model__iterations': 500, 'model__depth': 4}
  Best CV Score: 0.7227
✅ Saved: submission_catboost.csv

CatBoost training complete!
