In [16]:
from pathlib import Path

import joblib
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [17]:
import sys

# Make the parent directory importable so the project package `src` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')
from src.data.preprocessing import DataPreprocessor

# Load the raw table and separate the 'quality' target from the feature columns.
df = pd.read_csv('../data/raw/data.csv')
X = df.drop(columns='quality')
y = df['quality']

# DataPreprocessor is project code; its fit_transform returns the four
# train/test pieces unpacked below.
dp = DataPreprocessor()
X_train, X_test, y_train, y_test = dp.fit_transform(X, y)

In [18]:
# Grid-search outcomes, keyed by estimator class name.
gc_results = dict()

In [19]:
# Candidate values for the random forest; GridSearchCV evaluates every combination.
rf_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 4, 5, 6, 8],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 4, 6],
}

# fit() returns the fitted search object, so construction and fitting are chained.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the refitted best estimator, its parameters and its mean CV accuracy.
rf_summary = {}
rf_summary['model'] = rf_grid_search.best_estimator_
rf_summary['best_params'] = rf_grid_search.best_params_
rf_summary['accuracy_score'] = rf_grid_search.best_score_
gc_results['RandomForestClassifier'] = rf_summary

In [20]:
# Candidate values for XGBoost; GridSearchCV evaluates every combination.
xgb_param_grid = {
    'max_depth': [3, 4, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [100, 150, 200],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0.0, 0.1, 0.2, 0.4],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# fit() returns the fitted search object, so construction and fitting are chained.
xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=xgb_param_grid,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Keep the refitted best estimator, its parameters and its mean CV accuracy.
xgb_summary = {}
xgb_summary['model'] = xgb_grid_search.best_estimator_
xgb_summary['best_params'] = xgb_grid_search.best_params_
xgb_summary['accuracy_score'] = xgb_grid_search.best_score_
gc_results['XGBClassifier'] = xgb_summary

In [21]:
# Display the best estimator, parameters and CV accuracy stored for each grid search.
gc_results

{'RandomForestClassifier': {'model': RandomForestClassifier(max_depth=8, min_samples_split=4, random_state=42),
  'best_params': {'max_depth': 8,
   'max_features': 'sqrt',
   'max_leaf_nodes': None,
   'min_samples_leaf': 1,
   'min_samples_split': 4,
   'n_estimators': 100},
  'accuracy_score': np.float64(0.8866421568627452)},
 'XGBClassifier': {'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=1.0, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                feature_weights=None, gamma=0.1, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.15, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
                max_leaves=None, min_child_weight=1, missing=nan,
         

## Optuna

In [23]:
# Finished Optuna studies, keyed by estimator class name.
optuna_results = dict()

### Random Forest

In [36]:
def RF(trial):
    """Optuna objective for the random forest.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``RandomForestClassifier`` with it (``random_state=42``) and scores it
    with 5-fold cross-validation on ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean cross-validated accuracy over the 5 folds (to be maximised).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    clf = RandomForestClassifier(**params, random_state=42)

    fold_scores = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=5, scoring='accuracy')

    return fold_scores.mean()


# Seed the sampler so the sequence of suggested trials is repeatable across
# kernel restarts; an unseeded sampler draws different trials on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(RF, n_trials=50)

optuna_results['RandomForestClassifier'] = study

[I 2026-01-02 01:12:41,781] A new study created in memory with name: no-name-1721e074-7818-4ca2-bda4-fa05a2e1f5b5
[I 2026-01-02 01:12:41,883] Trial 0 finished with value: 0.8717830882352942 and parameters: {'n_estimators': 41, 'max_depth': 23, 'max_features': 'sqrt', 'max_leaf_nodes': 7, 'min_samples_split': 14, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8717830882352942.
[I 2026-01-02 01:12:42,138] Trial 1 finished with value: 0.8702236519607844 and parameters: {'n_estimators': 109, 'max_depth': 32, 'max_features': 'log2', 'max_leaf_nodes': 11, 'min_samples_split': 13, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8717830882352942.
[I 2026-01-02 01:12:42,413] Trial 2 finished with value: 0.8710049019607844 and parameters: {'n_estimators': 124, 'max_depth': 19, 'max_features': 'log2', 'max_leaf_nodes': 14, 'min_samples_split': 19, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8717830882352942.
[I 2026-01-02 01:12:42,784] Trial 3 finished with value: 0.869442

In [None]:
def XGB(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``XGBClassifier`` with it (``random_state=42``) and scores it with
    5-fold cross-validation on ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean cross-validated accuracy over the 5 folds (to be maximised).
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        # learning_rate and gamma are sampled on a log scale (log=True).
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    clf = XGBClassifier(**params, random_state=42)

    fold_scores = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=5, scoring='accuracy')

    return fold_scores.mean()


# Seed the sampler so the sequence of suggested trials is repeatable across
# kernel restarts; an unseeded sampler draws different trials on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(XGB, n_trials=50)

optuna_results['XGBClassifier'] = study

[I 2026-01-02 01:10:54,192] A new study created in memory with name: no-name-adc6fcd9-76b2-4028-9f98-c75011c756fb
[I 2026-01-02 01:10:54,269] Trial 0 finished with value: 0.8749080882352942 and parameters: {'max_depth': 6, 'learning_rate': 0.03621691005971648, 'n_estimators': 119, 'min_child_weight': 12, 'gamma': 5.745567428477994e-06, 'subsample': 0.4935941083684259, 'colsample_bytree': 0.6930215870933208}. Best is trial 0 with value: 0.8749080882352942.
[I 2026-01-02 01:10:54,318] Trial 1 finished with value: 0.8639583333333334 and parameters: {'max_depth': 8, 'learning_rate': 0.026135281985661674, 'n_estimators': 25, 'min_child_weight': 7, 'gamma': 8.892532470293681e-05, 'subsample': 0.7356983606119052, 'colsample_bytree': 0.7339749257901653}. Best is trial 0 with value: 0.8749080882352942.
[I 2026-01-02 01:10:54,357] Trial 2 finished with value: 0.8639583333333334 and parameters: {'max_depth': 16, 'learning_rate': 0.029219639998598293, 'n_estimators': 20, 'min_child_weight': 5, 'ga

In [42]:
# Hyperparameters of the best trial in the random-forest Optuna study.
optuna_results['RandomForestClassifier'].best_params


{'n_estimators': 63,
 'max_depth': 26,
 'max_features': 'log2',
 'max_leaf_nodes': 19,
 'min_samples_split': 4,
 'min_samples_leaf': 1}

In [43]:
# Objective value (mean CV accuracy returned by RF) of the best random-forest trial.
optuna_results['RandomForestClassifier'].best_value

0.8811672794117648

In [47]:
# Refit each model on the training split with the best hyperparameters found by Optuna.
# NOTE(review): in the saved outputs the grid-search random forest reached a higher
# CV accuracy (0.8866) than the Optuna one (0.8812) - confirm that the Optuna
# parameters are the ones intended for persistence.
RF_best_params = optuna_results['RandomForestClassifier'].best_params
RF_model = RandomForestClassifier(**RF_best_params, random_state=42)
RF_model.fit(X_train, y_train)

XGB_best_params = optuna_results['XGBClassifier'].best_params
XGB_model = XGBClassifier(**XGB_best_params, random_state=42)
XGB_model.fit(X_train, y_train)

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout may not contain it).
Path('../models').mkdir(parents=True, exist_ok=True)

joblib.dump(RF_model, '../models/RandomForestClassifier.pkl')
joblib.dump(XGB_model, '../models/XGBClassifier.pkl')

['../models/XGBClassifier.pkl']