1) Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optuna
import optuna
from optuna import TrialPruned

# Sklearn utilities
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import RobustScaler
# ML models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier



2) Load Data

In [None]:
# Load the two competition splits; paths are relative to the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

TARGET = "segment"

# Identifier columns are removed from both feature frames. `id_cols` is derived
# from the TEST frame because the IDs are only needed to label test predictions.
# Deriving it from train (as before) made `test[id_cols]` raise KeyError when an
# ID column existed only in train, and dropped the IDs when it existed only in test.
ID_CANDIDATES = ["id", "player_id"]
id_cols = [col for col in ID_CANDIDATES if col in test.columns]
train = train.drop(columns=ID_CANDIDATES, errors="ignore")
test_ids = test[id_cols] if id_cols else None
test_X = test.drop(columns=ID_CANDIDATES, errors="ignore")



In [None]:
# Preview the first rows of the training frame (ID columns were dropped when loading).
train.head()

3) Outlier Removal — Isolation Forest

In [None]:
# Numeric feature columns, excluding the target. `errors="ignore"` matters:
# when the target is a string column it is not part of the numeric selection,
# and a plain `Index.drop(TARGET)` would raise KeyError.
numeric_cols = train.select_dtypes(include=["float64", "int64"]).columns
numeric_cols = numeric_cols.drop(TARGET, errors="ignore")

# Fit the detector on a median-filled copy so missing values (imputed properly
# in a later step) cannot break it. `train` itself is left untouched.
# NOTE(review): IsolationForest rejects NaN input in many scikit-learn releases;
# confirm against the installed version.
iso_input = train[numeric_cols].fillna(train[numeric_cols].median())

iso = IsolationForest(contamination=0.03, random_state=42)
outlier_pred = iso.fit_predict(iso_input)  # 1 = inlier, -1 = outlier

# Keep inliers only; reset the index so later positional and label indexing agree.
train_clean = train[outlier_pred == 1].reset_index(drop=True)
print("Outliers removed:", (outlier_pred == -1).sum())

4) Encode target if categorical

In [None]:
# Take the target from the outlier-filtered frame. String labels are converted
# to integer codes; the fitted encoder stays in `le` so predictions can be
# mapped back to the original labels later.
y = train_clean[TARGET]
target_is_text = y.dtype == 'object'
if target_is_text:
    le = LabelEncoder().fit(y)
    y = le.transform(y)


5) Feature Selection via Correlation with Target (plus Heatmap)

In [None]:
# Minimum absolute correlation with the (encoded) target that a numeric feature
# needs in order to be kept.
CORR_THRESHOLD = 0.15

X_num = train_clean[numeric_cols]
# Build the target Series on X_num's own index so corrwith pairs rows explicitly
# rather than relying on both sides happening to share a RangeIndex.
target_series = pd.Series(np.asarray(y), index=X_num.index)
corr_with_target = X_num.corrwith(target_series)
selected_numeric = corr_with_target[corr_with_target.abs() > CORR_THRESHOLD].index.tolist()
print("Selected numeric features:", selected_numeric)

# All string-typed columns except the target are treated as categorical features.
categorical_features = [
    c for c in train_clean.select_dtypes(include=['object']).columns if c != TARGET
]

X = pd.concat([train_clean[selected_numeric], train_clean[categorical_features]], axis=1)
# reindex instead of direct column selection: a feature that is absent from the
# test file becomes an all-NaN column (imputed later) rather than a KeyError.
X_test = test_X.reindex(columns=X.columns)

# Pairwise correlations among the selected numeric features.
corr_matrix = X_num[selected_numeric].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Heatmap of Selected Numeric Features")
plt.show()

# Correlation of each selected feature with the target.
selected_corr = corr_with_target[selected_numeric]
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=selected_corr.index, y=selected_corr.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel("Correlation with Target")
ax.set_title("Numeric Features Correlation with Target")
plt.show()

6) Fill missing values AFTER feature selection

In [None]:
# Give the test frame exactly the training columns, in the same order; a feature
# missing from test becomes an all-NaN column that is imputed below.
X_test = X_test.reindex(columns=X.columns)

numeric_cols_selected = selected_numeric
categorical_cols_selected = categorical_features

# Imputation statistics are computed on the training frame only and reused for test.
X_numeric_means = X[numeric_cols_selected].mean()

# mode() produces no rows when there are no categorical columns (or none has a
# non-null value); `.iloc[0]` on that raised IndexError. Fall back to an empty
# Series, which turns the categorical fill into a no-op.
_mode_rows = X[categorical_cols_selected].mode() if categorical_cols_selected else pd.DataFrame()
X_categorical_modes = _mode_rows.iloc[0] if len(_mode_rows) else pd.Series(dtype="object")

if numeric_cols_selected:
    X[numeric_cols_selected] = X[numeric_cols_selected].fillna(X_numeric_means)
    X_test[numeric_cols_selected] = X_test[numeric_cols_selected].fillna(X_numeric_means)

if categorical_cols_selected:
    X[categorical_cols_selected] = X[categorical_cols_selected].fillna(X_categorical_modes)
    X_test[categorical_cols_selected] = X_test[categorical_cols_selected].fillna(X_categorical_modes)


7) Scale numeric features

In [None]:


# Learn the scaling statistics from the training features only, then apply the
# same fitted transform to both frames.
scaler = RobustScaler()
scaler.fit(X[selected_numeric])
X[selected_numeric] = scaler.transform(X[selected_numeric])
X_test[selected_numeric] = scaler.transform(X_test[selected_numeric])

# Cast every categorical feature to pandas' 'category' dtype in both frames.
for frame in (X, X_test):
    for col in categorical_features:
        frame[col] = frame[col].astype('category')

In [None]:
# Show only the first rows of the prepared test features instead of dumping the whole frame.
X_test.head()

8) Define Optuna Objective (CatBoost)

In [None]:
def objective_cat(trial):
    """Optuna objective: mean macro-F1 of a CatBoost classifier over 5 folds.

    Reads the notebook-level `X`, `y` and `categorical_cols_selected`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `iterations`, `depth`, `learning_rate` and
        `l2_leaf_reg`.

    Returns
    -------
    float
        Average macro-averaged F1 score across the validation folds.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'verbose': 0
    }

    # Stratified folds keep the class proportions of `y` in every split, so a
    # rare class cannot vanish from a validation fold and distort macro-F1.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # `y` is an ndarray when it was label-encoded and a Series otherwise; a
    # plain array makes the fold indices positional in both cases.
    y_arr = np.asarray(y)
    f1_scores = []

    for train_idx, val_idx in skf.split(X, y_arr):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_arr[train_idx], y_arr[val_idx]

        model = CatBoostClassifier(**params)
        # NOTE(review): the fold used for early stopping is also the fold that
        # is scored, so the reported F1 is likely somewhat optimistic.
        model.fit(
            X_train, y_train,
            cat_features=categorical_cols_selected,
            eval_set=(X_val, y_val),
            early_stopping_rounds=50,
            verbose=False
        )
        preds = model.predict(X_val)
        f1_scores.append(f1_score(y_val, preds, average='macro'))

    return np.mean(f1_scores)

9) Run Optuna Study

In [None]:
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every Restart & Run All; an unseeded study gives different results per run.
study_cat = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_cat, n_trials=12)

print("Best Optuna params:", study_cat.best_params)
print("Best CV F1-score (macro):", study_cat.best_value)

10) Train Final CatBoost Model

In [None]:
# Refit CatBoost on the full training set using the best hyper-parameters found
# by the study (only the four tuned settings are forwarded).
best_params = study_cat.best_params
tuned_keys = ('iterations', 'depth', 'learning_rate', 'l2_leaf_reg')
final_kwargs = {key: best_params[key] for key in tuned_keys}

final_model = CatBoostClassifier(verbose=0, **final_kwargs)
final_model.fit(X, y, cat_features=categorical_cols_selected)

In [None]:

# Predict on the prepared test features; flatten() collapses whatever shape
# predict returns into a 1-D array.
predictions = final_model.predict(X_test)
flat_predictions = predictions.flatten()

# `le` only exists when the target was string-typed and label-encoded earlier;
# in that case map the integer codes back to the original labels.
label_encoder = globals().get("le")
if label_encoder is not None:
    predictions_decoded = label_encoder.inverse_transform(flat_predictions.astype(int))
else:
    predictions_decoded = flat_predictions

submission_df = pd.DataFrame({'prediction': predictions_decoded})

# Put the first stored ID column in front of the predictions when one is available.
has_ids = test_ids is not None and len(id_cols) > 0
if has_ids:
    id_column_name = id_cols[0]
    submission_df.insert(0, id_column_name, test_ids[id_column_name].to_numpy())

submission_df.to_csv("submission.csv", index=False)

if has_ids:
    print(f"Submission file 'submission.csv' created with columns: {list(submission_df.columns)}")
else:
    print("No ID column detected. Submission file 'submission.csv' created with a single 'prediction' column.")