In [None]:
import polars as pl
import xgboost
from calc_metrics import calculate_basic_metrics
from setup.constants import PROJECT_ROOT
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Column dtype overrides used when reading the wine CSV files:
# both sulfur dioxide measurements and the quality score are parsed as Float64.
wine_schema = dict.fromkeys(
    ("free sulfur dioxide", "total sulfur dioxide", "quality"),
    pl.Float64,
)

In [None]:
# Load the red wine data and split it into training and test sets
DATA_PATH = PROJECT_ROOT / "data"
red_wine_data = pl.read_csv(
    DATA_PATH / "winequality-red.csv", separator=";", schema_overrides=wine_schema
)
# Features are every column except the target; "quality" is the class label
r_X = red_wine_data.drop(pl.col("quality"))
r_y = red_wine_data.get_column("quality")
# Stratified 80/20 split. The fixed random_state makes the partition (and therefore
# every metric computed from it) identical after Restart Kernel -> Run All.
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(
    r_X, r_y, train_size=0.8, test_size=0.2, stratify=r_y, random_state=42
)
print(len(r_X_train), len(r_y_train), len(r_X_test), len(r_y_test))

In [None]:
# Load the white wine data and split it into training and test sets
white_wine_data = pl.read_csv(
    DATA_PATH / "winequality-white.csv", separator=";", schema_overrides=wine_schema
)
# Features are every column except the target; "quality" is the class label
w_X = white_wine_data.drop(pl.col("quality"))
w_y = white_wine_data.get_column("quality")
# Stratified 80/20 split. The fixed random_state makes the partition (and therefore
# every metric computed from it) identical after Restart Kernel -> Run All.
w_X_train, w_X_test, w_y_train, w_y_test = train_test_split(
    w_X, w_y, train_size=0.8, test_size=0.2, stratify=w_y, random_state=42
)
print(len(w_X_train), len(w_y_train), len(w_X_test), len(w_y_test))

In [None]:
# Per-model metric frames are collected in this list first and are concatenated
# into `results_df` once the models have been evaluated
results = []

# Column layout of the results table: two string identifiers followed by Float64 metrics
results_schema = {
    "model": pl.String,
    "dataset": pl.String,
    **dict.fromkeys(
        (
            "accuracy",
            "precision",
            "recall",
            "f1",
            "balanced_accuracy",
            "overall_test_set_performance",
        ),
        pl.Float64,
    ),
}

# Empty, typed dataframe that will hold the results of all models
results_df = pl.DataFrame(schema=results_schema)

In [None]:
# Hyperparameter grid explored for AdaBoost (4 x 4 = 16 combinations)
ab_param_grid = dict(
    n_estimators=[10, 25, 50, 100],
    learning_rate=[0.001, 0.01, 0.1, 1],
)

# Unfitted AdaBoost template that the grid searches are run on.
# NOTE(review): the base DecisionTreeClassifier() is created with default settings,
# i.e. without an explicit max_depth -- confirm this is intended.
ab_model = AdaBoostClassifier(estimator=DecisionTreeClassifier())

In [None]:
# Train the AB model for red wine.
# cv=3 matches the fold count passed to every other grid search in this notebook, so
# all models are selected under the same cross-validation protocol.
r_ab_model = (
    GridSearchCV(
        estimator=ab_model,
        param_grid=ab_param_grid,
        scoring="balanced_accuracy",
        refit=True,
        cv=3,
    )
    .fit(r_X_train, r_y_train)
    .best_estimator_
)

# Calculate statistics for red wine AB model on the held-out test set
r_ab_model_stats = calculate_basic_metrics(
    estimator=r_ab_model,
    X=r_X_test,
    y=r_y_test,
    dataset="red_wine",
    model_name="AdaBoostClassifier",
)
print(r_ab_model_stats)
results.append(r_ab_model_stats)

In [None]:
# Grid-search AdaBoost on the white wine training split (3-fold CV, balanced accuracy)
w_ab_search = GridSearchCV(
    ab_model,
    ab_param_grid,
    scoring="balanced_accuracy",
    refit=True,
    cv=3,
)
w_ab_search.fit(w_X_train, w_y_train)
# refit=True: the best parameter combination has been re-trained on the full training split
w_ab_model = w_ab_search.best_estimator_

# Evaluate the selected white wine AB model on the held-out test split
w_ab_model_stats = calculate_basic_metrics(
    model_name="AdaBoostClassifier",
    dataset="white_wine",
    estimator=w_ab_model,
    X=w_X_test,
    y=w_y_test,
)
print(w_ab_model_stats)
results.append(w_ab_model_stats)

In [None]:
# Hyperparameter grid explored for gradient boosting
gb_param_grid = dict(
    learning_rate=[0.001, 0.01, 0.1, 1],
    n_estimators=[10, 25, 50],
    criterion=["friedman_mse", "squared_error"],
    min_samples_leaf=[1, 2],
    max_depth=[1, 3],
    max_features=[None],
)

# Unfitted GradientBoosting template that the grid searches are run on
gb_model = GradientBoostingClassifier()

In [None]:
# Grid-search gradient boosting on the red wine training split
# (3-fold CV, balanced accuracy, all available cores)
r_gb_search = GridSearchCV(
    gb_model,
    gb_param_grid,
    scoring="balanced_accuracy",
    refit=True,
    cv=3,
    n_jobs=-1,
)
r_gb_search.fit(r_X_train, r_y_train)
# refit=True: the best parameter combination has been re-trained on the full training split
r_gb_model = r_gb_search.best_estimator_

# Evaluate the selected red wine GB model on the held-out test split
r_gb_model_stats = calculate_basic_metrics(
    model_name="GradientBoostingClassifier",
    dataset="red_wine",
    estimator=r_gb_model,
    X=r_X_test,
    y=r_y_test,
)
results.append(r_gb_model_stats)
print(r_gb_model_stats)

In [None]:
# Grid-search gradient boosting on the white wine training split
# (3-fold CV, balanced accuracy, all available cores)
w_gb_search = GridSearchCV(
    gb_model,
    gb_param_grid,
    scoring="balanced_accuracy",
    refit=True,
    cv=3,
    n_jobs=-1,
)
w_gb_search.fit(w_X_train, w_y_train)
# refit=True: the best parameter combination has been re-trained on the full training split
w_gb_model = w_gb_search.best_estimator_

# Evaluate the selected white wine GB model on the held-out test split
w_gb_model_stats = calculate_basic_metrics(
    model_name="GradientBoostingClassifier",
    dataset="white_wine",
    estimator=w_gb_model,
    X=w_X_test,
    y=w_y_test,
)
results.append(w_gb_model_stats)
print(w_gb_model_stats)

In [None]:
# NOTE(review): `xgboost.config_context` is documented as a context manager. Called as a
# bare statement (outside a `with` block) the returned object is discarded, so this line
# presumably does not change any XGBoost setting for the cells below -- confirm whether
# `with xgboost.config_context(use_rmm=True):` or `xgboost.set_config(...)` was intended.
xgboost.config_context(use_rmm=True)

In [None]:
# Hyperparameter grid explored for XGBoost.
# Kept as a dict literal because "lambda" is a Python keyword and cannot be
# written as a keyword argument.
xgb_param_grid = {
    "learning_rate": [0.1, 0.3, 0.5],
    "gamma": [0, 1],
    "max_depth": [3, 6, 12],
    "lambda": [0, 1],
    "max_leaves": [0, 6],
}

# Unfitted XGBoost template (tree booster, CPU) that the grid searches are run on
xgb_model = XGBClassifier(booster="gbtree", device="cpu")

In [None]:
# Transform the targets so that XGBoost can use them.
# Each dataset gets its own encoder, fitted on the TRAINING labels only; the test labels
# are then mapped with `transform` so both splits share exactly the same label mapping
# (re-fitting on the test labels could assign different integers to the same quality score).
r_label_encoder = LabelEncoder()
r_y_train_xgb = r_label_encoder.fit_transform(r_y_train)
r_y_test_xgb = r_label_encoder.transform(r_y_test)

w_label_encoder = LabelEncoder()
w_y_train_xgb = w_label_encoder.fit_transform(w_y_train)
w_y_test_xgb = w_label_encoder.transform(w_y_test)

In [None]:
# Grid-search XGBoost on the red wine training split, using the label-encoded targets
# (3-fold CV, balanced accuracy, all available cores)
r_xgb_search = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    scoring="balanced_accuracy",
    refit=True,
    cv=3,
    n_jobs=-1,
)
r_xgb_search.fit(r_X_train, r_y_train_xgb)
# refit=True: the best parameter combination has been re-trained on the full training split
r_xgb_model = r_xgb_search.best_estimator_

# Evaluate the selected red wine XGBoost model against the encoded test labels
r_xgb_model_stats = calculate_basic_metrics(
    model_name="XGBoost",
    dataset="red_wine",
    estimator=r_xgb_model,
    X=r_X_test,
    y=r_y_test_xgb,
)
results.append(r_xgb_model_stats)
print(r_xgb_model_stats)

In [None]:
# Grid-search XGBoost on the white wine training split, using the label-encoded targets
# (3-fold CV, balanced accuracy, all available cores)
w_xgb_search = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    scoring="balanced_accuracy",
    refit=True,
    cv=3,
    n_jobs=-1,
)
w_xgb_search.fit(w_X_train, w_y_train_xgb)
# refit=True: the best parameter combination has been re-trained on the full training split
w_xgb_model = w_xgb_search.best_estimator_

# Evaluate the selected white wine XGBoost model against the encoded test labels
w_xgb_model_stats = calculate_basic_metrics(
    model_name="XGBoost",
    dataset="white_wine",
    estimator=w_xgb_model,
    X=w_X_test,
    y=w_y_test_xgb,
)
results.append(w_xgb_model_stats)
print(w_xgb_model_stats)

In [None]:
# Concat results into one dataframe.
# A single concat replaces growing the frame inside a loop, and starting from a fresh
# empty frame (instead of the current `results_df`) keeps this cell idempotent:
# re-running it no longer appends every result a second time.
results_df = pl.concat([pl.DataFrame(schema=results_schema), *results], how="vertical")

In [None]:
# Red wine leaderboard: models ordered by overall test-set performance, best first
red_wine_results = results_df.filter(pl.col("dataset") == "red_wine")
print(red_wine_results.sort("overall_test_set_performance", descending=True))

In [None]:
# White wine leaderboard: models ordered by overall test-set performance, best first
white_wine_results = results_df.filter(pl.col("dataset") == "white_wine")
print(white_wine_results.sort("overall_test_set_performance", descending=True))