In [None]:
import numpy as np
import joblib

# importing necessary libraries
import optuna
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
from feature_engineering import create_features


# Load the merged dataset and run it through the project's feature engineering.
train = create_features(pd.read_csv("merged_dataset.csv"))

# Split off the target; the "id" column is removed from the features as well.
y = train["NObeyesdad"]
X = train.drop(columns=["NObeyesdad", "id"])

# Group the feature columns by dtype: object columns are handled as
# categorical, int64/float64 columns as numerical.
categorical_cols = [cname for cname, dtype in X.dtypes.items() if dtype == "object"]
numerical_cols = [
    cname for cname, dtype in X.dtypes.items() if dtype in ("int64", "float64")
]

# Numerical branch: mean-impute missing values, then standardize.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode (levels unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
# NOTE: `categorical_cols` and `preprocessor` are reassigned further down,
# before the Optuna search runs.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Random-forest model chained behind the preprocessing above.
rf_model = RandomForestClassifier(random_state=42)
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model),
    ]
)

# Map the class labels to integer codes; the fitted encoder is kept in
# `label_encoder`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 20% of the rows as a validation split (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Start the W&B run that the logging calls below write to.
wandb.init(project="my-space", entity="herczeg-gyrgy", sync_tensorboard=True)

# Callback for logging Optuna optimization to W&B
def optuna_callback(study, trial):
    """Log the study's best value and the given trial's value to W&B.

    Passed to ``study.optimize`` through ``callbacks``.

    Args:
        study: The Optuna study; its ``best_value`` is logged as "Best Value".
        trial: The trial whose ``value`` is logged as "Current Value".
    """
    metrics = {
        "Best Value": study.best_value,
        "Current Value": trial.value,
    }
    wandb.log(metrics)


# Rebuild the preprocessing for the XGBoost search: every object- or
# category-typed column is one-hot encoded, all remaining columns are passed
# through unchanged.
categorical_cols = [
    cname
    for cname, dtype in X.dtypes.items()
    if dtype == "object" or dtype.name == "category"
]
preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
    remainder="passthrough",
)

# Fixed XGBoost settings, recorded in the W&B run config; the class count is
# taken from the labels present in the training split.
config = dict(
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(np.unique(y_train)),
)

wandb.config.update(config)

# Defining the objective function for Optuna study with WandB logging
def objective(trial):
    """Optuna objective: mean 5-fold cross-validation score of an XGBoost model.

    Samples one hyperparameter set from ``trial``, records it in the W&B run
    config, fits the module-level ``preprocessor`` on ``X_train`` and scores
    an ``XGBClassifier`` built from the sampled values with ``cross_val_score``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the five fold scores (``cross_val_score`` default
        scoring). The value is also logged to W&B under "score".
    """
    param = {
        # Fixed settings shared by every trial.
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "nthread": -1,
        "num_class": len(np.unique(y_train)),
        "use_label_encoder": False,
        # Search space.
        # "eta" is XGBoost's alias for "learning_rate"; sampling both added a
        # redundant dimension and left the effective step size ambiguous, so
        # only "learning_rate" is tuned.
        "n_estimators": trial.suggest_int("n_estimators", 200, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.6, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.6, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "max_delta_step": trial.suggest_int("max_delta_step", 1, 10),
        "max_leaves": trial.suggest_int("max_leaves", 32, 512),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "gamma": trial.suggest_float("gamma", 1e-8, 5.0, log=True),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
    }

    # Every trial overwrites the values stored by the previous one, so value
    # changes must be allowed explicitly; without the flag W&B can reject the
    # update when the code runs outside a notebook session.
    wandb.config.update(param, allow_val_change=True)

    # NOTE: the encoder is fitted on all of X_train before the folds are
    # drawn, so each fold's encoder has also seen that fold's held-out rows.
    X_preprocessed = preprocessor.fit_transform(X_train)
    clf = xgb.XGBClassifier(**param, enable_categorical=True)

    score = cross_val_score(clf, X_preprocessed, y_train, cv=5, n_jobs=-1).mean()

    wandb.log({"score": score})  # Log the score

    return score


# Run the hyperparameter search; `optuna_callback` logs the best and current
# values to W&B after each trial.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, callbacks=[optuna_callback])

best_trial = study.best_trial

# Refit on the full training split. The fixed settings in `config`
# (objective, eval_metric, num_class) are merged in so the final model is
# configured like the models scored during the search, not from the tuned
# values alone.
best_model = xgb.XGBClassifier(**{**config, **best_trial.params})
X_preprocessed = preprocessor.fit_transform(X_train)
best_model.fit(X_preprocessed, y_train)

# Score the refitted model on the held-out split, which the search never
# saw; the cross-validation score below was selected over all trials.
valid_score = best_model.score(preprocessor.transform(X_valid), y_valid)
wandb.log({"validation_score": valid_score})

# Saving the best model
# NOTE: only the classifier is pickled; the fitted `preprocessor` and
# `label_encoder` are not part of 'best_model.pkl'.
joblib.dump(best_model, 'best_model.pkl')

# Log the best model to WandB
wandb.save('best_model.pkl')

# Printing best trial info
print(f"Best trial score: {best_trial.value}")
for key, value in best_trial.params.items():
    print(f"{key}: {value}")
print(f"Validation score: {valid_score}")

# Close the W&B run
wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
2024-03-18 21:11:52.769819: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-18 21:11:53.352541: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-18 21:11:53.359843: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: 

[I 2024-03-18 21:12:06,389] A new study created in memory with name: no-name-10a6c3de-8e7e-4edc-a972-5d341d78a085
[I 2024-03-18 21:13:47,851] Trial 0 finished with value: 0.8982235583492757 and parameters: {'n_estimators': 1244, 'learning_rate': 0.06722730803702337, 'subsample': 0.994128493589817, 'colsample_bytree': 0.929561652271103, 'colsample_bylevel': 0.6028016239163578, 'colsample_bynode': 0.9240595645149696, 'reg_lambda': 0.18449321351385206, 'reg_alpha': 3.7138576370153657e-06, 'max_delta_step': 7, 'max_leaves': 302, 'min_child_weight': 269, 'max_depth': 2, 'eta': 4.766815627074624e-08, 'gamma': 3.7979381167660786e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8982235583492757.
[I 2024-03-18 21:15:20,267] Trial 1 finished with value: 0.901448483192129 and parameters: {'n_estimators': 1994, 'learning_rate': 0.016933207802496356, 'subsample': 0.9451844628808974, 'colsample_bytree': 0.6984739387585234, 'colsample_bylevel': 0.949645234012213, 'colsample_bynode': 0.

0,1
Best Value,▁▂▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
Current Value,▇▁█▇▇███████▅█▇█▇█▇▇▆▅▇▇██████▇█████████
score,▇▁█▇▇███████▅█▇█▇█▇▇▆▅▇▇██████▇█████████

0,1
Best Value,0.91397
Current Value,0.91276
score,0.91276


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8973d6a1-4867-433a-bd13-9dab8e184951' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>