In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree top-down and echo the full path of every file found,
# so the available datasets are visible in the cell output.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition's training and test tables into DataFrames (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ob-data/train.csv",
        "/kaggle/input/ob-data/test.csv",
    )
)

In [None]:
# Report (rows, columns) for both tables as a quick sanity check after loading.
for caption, frame in zip(("Train shape:", "Test shape:"), (train, test)):
    print(caption, frame.shape)

In [None]:
# Structural overview of the training frame (columns, non-null counts, dtypes).
print("\n--- Basic Information ---")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() appended a stray "None" line after the report.
train.info()

In [None]:
# Descriptive statistics from DataFrame.describe(), transposed so each described
# column becomes one row.
numeric_summary = train.describe()
print("\n--- Summary Statistics (Numerical Columns) ---", numeric_summary.T, sep="\n")

In [None]:
# Feature-Target Correlation
# Map each target category to an integer code so a Pearson correlation can be computed.
# Codes follow the order in which categories first appear in the data, so they are
# arbitrary labels; treat the resulting correlations as a rough screen only.
target_map = {cat: i for i, cat in enumerate(train["WeightCategory"].unique())}

# `num_cols` was referenced here without being defined anywhere in the notebook, which
# raises NameError on a fresh kernel. Derive it from the frame: every numeric column
# except the row identifier and any leftover numeric target column.
num_cols = [
    col
    for col in train.select_dtypes(include="number").columns
    if col not in ("id", "TargetNum")
]

# Attach the numeric target to a temporary frame instead of writing it into `train`.
# Mutating `train` made "TargetNum" flow into the feature matrix built from
# `train.drop(columns=["WeightCategory", "id"])`, i.e. the label leaked into the features.
corr_frame = train[num_cols].assign(TargetNum=train["WeightCategory"].map(target_map))

print("\n--- Feature Correlation with Target ---")
print(corr_frame.corr()["TargetNum"].sort_values(ascending=False))

In [None]:
# Split the training frame into the feature table X and the label series y.
# Besides the label and the row id, also drop "TargetNum" when it is present: it is a
# numeric encoding of the label that the correlation cell may have added to `train`.
# Keeping it would hand the model the answer during training/CV, while the test matrix
# (which has no such column) would get it filled with a constant.
X = train.drop(columns=["WeightCategory", "id"]).drop(columns=["TargetNum"], errors="ignore")
y = train["WeightCategory"]

print("Features shape:", X.shape, "Target shape:", y.shape)

In [None]:
# One-hot encode every object-typed feature column. dtype=float keeps the indicator
# columns numeric; newer pandas versions default to bool indicators, and a frame mixing
# bool with numeric columns converts to an object-dtype array that scikit-learn has to
# re-convert on every fit call.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=False, dtype=float)
test_encoded = pd.get_dummies(
    test.drop(columns=["id"], errors="ignore"),
    columns=cat_cols,
    drop_first=False,
    dtype=float,
)

# Align test to the training layout: same columns in the same order; indicator columns
# for categories absent from test are added as zeros, test-only columns are discarded.
test_encoded = test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Encode the string labels as integers; `le` is kept to map predictions back to labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Plain float matrices for scikit-learn (fails fast here if a non-numeric column remains).
X_np = X_encoded.to_numpy(dtype=float)
y_np = y_encoded
test_np = test_encoded.to_numpy(dtype=float)

print("Train shape:", X_encoded.shape, "Test shape:", test_encoded.shape)

In [None]:
def objective(trial):
    """Optuna objective: mean stratified 5-fold CV accuracy of a random forest.

    Samples one random-forest configuration from ``trial``, fits it on each training
    fold of the module-level ``X_np`` / ``y_np`` arrays and scores it on the matching
    validation fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: accuracy averaged over the five validation folds.
    """
    # Hyperparameters are sampled in a fixed order: n_estimators, max_depth,
    # min_samples_split, min_samples_leaf, max_features.
    n_estimators = trial.suggest_int("n_estimators", 200, 800)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    hyperparams = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
    )

    # Fixed seed: every trial is evaluated on exactly the same five folds.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def fold_accuracy(fit_rows, eval_rows):
        # Fit on one training fold and return accuracy on its validation fold.
        forest = RandomForestClassifier(**hyperparams, n_jobs=-1, random_state=42)
        forest.fit(X_np[fit_rows], y_np[fit_rows])
        return accuracy_score(y_np[eval_rows], forest.predict(X_np[eval_rows]))

    fold_scores = [
        fold_accuracy(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(X_np, y_np)
    ]
    return float(np.mean(fold_scores))




In [None]:
# Hyperparameter search: maximise the mean CV accuracy returned by `objective`.
# The sampler is seeded explicitly; without a seed the sequence of sampled
# configurations (and therefore the best params and the top-K ensemble built from the
# study) changes on every Restart & Run All.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=80, show_progress_bar=True)  # can increase trials

print("Best CV score:", round(study.best_value * 100, 2), "%")
print("Best params:", study.best_params)

In [None]:
# Number of best trials whose configurations will form the ensemble.
K = 5

# Keep only trials that actually produced a score, rank them best-first and take the
# first K (ties keep their original trial order).
scored_trials = filter(lambda trial: trial.value is not None, study.trials)
top_trials = sorted(scored_trials, key=lambda trial: trial.value, reverse=True)[:K]

print(f"\nTop {K} Optuna Trials to be used in ensemble:")
for rank, chosen in enumerate(top_trials, start=1):
    print(f"\nModel {rank}:")
    print(f"  CV Score: {chosen.value:.6f}")
    print("  Parameters:")
    for key in chosen.params:
        print(f"    {key}: {chosen.params[key]}")

In [None]:
K = 5

# Stratified 85/15 split of the training data, used to sanity-check the ensemble.
# NOTE: the hyperparameter search scored trials with CV over all of X_np, so these
# hold-out rows already influenced which configurations were picked; read the accuracy
# below as an optimistic estimate.
X_tr_full, X_hold, y_tr_full, y_hold = train_test_split(
    X_np,
    y_np,
    test_size=0.15,
    random_state=99,
    stratify=y_np,
)

# One forest per selected trial; each member gets its own seed (42, 43, ...).
models = []
for member_idx, chosen in enumerate(top_trials):
    forest = RandomForestClassifier(n_jobs=-1, random_state=42 + member_idx, **chosen.params)
    forest.fit(X_tr_full, y_tr_full)
    models.append(forest)
    print(f" Trained ensemble member {member_idx+1} (trial value={chosen.value:.4f})")

# Evaluate ensemble on holdout: average the members' class probabilities (soft voting)
# and predict the class with the highest mean probability.
member_probas = [forest.predict_proba(X_hold) for forest in models]
probs = np.mean(member_probas, axis=0)
ensemble_preds = probs.argmax(axis=1)
ensemble_acc = accuracy_score(y_hold, ensemble_preds)
print(f"Ensemble holdout accuracy: {ensemble_acc*100:.4f}%")

In [None]:
print("Retraining ensemble members on full training data and predicting test...")

# Refit every selected configuration on all training rows, with the same per-member
# seeds (42, 43, ...) used for the hold-out evaluation.
final_models = []
for member_idx, chosen in enumerate(top_trials):
    forest = RandomForestClassifier(n_jobs=-1, random_state=42 + member_idx, **chosen.params)
    forest.fit(X_np, y_np)
    final_models.append(forest)
    print(f" Final model {member_idx+1} trained.")

# Soft voting on the test set: mean class probabilities across members, then the
# highest-probability class index is mapped back to its original label via `le`.
member_test_probas = [forest.predict_proba(test_np) for forest in final_models]
test_probas = np.mean(member_test_probas, axis=0)
test_preds = test_probas.argmax(axis=1)
test_labels = le.inverse_transform(test_preds)

In [None]:

# Build the submission table (one predicted label per test id) and write it to disk.
# The file name lives in one variable so the confirmation message always names the file
# that was actually written (the message previously referred to a different file name).
submission_path = "Submission_randomForest.csv"
submission = pd.DataFrame({"id": test["id"], "WeightCategory": test_labels})
submission.to_csv(submission_path, index=False)
print(f"Saved {submission_path}. Ensemble test distribution:")
# Share of each predicted class in percent. Scale first and round last so the printed
# values do not show floating-point artefacts such as 14.299999999999999.
print((submission["WeightCategory"].value_counts(normalize=True) * 100).round(1))
