In [1]:
from helpers.book_matrix import S21UserBookMatrixBuilder
from helpers.pipeline_manager import S21AgePipelineManager

In [2]:
# --- Experiment configuration ----------------------------------------------
# Every tunable value is named once here. In particular the seed is shared by
# the data split and the pipeline manager, so the two cannot drift apart.
SEED = 42
RATINGS_CSV = '../datasets/data/Ratings.csv'
USERS_CSV = '../datasets/data/Users.csv'
MIN_AGE = 5
MAX_AGE = 100
TOP_N_BOOKS = 20000
N_ITER = 5

# Three fractions handed to build_split.
# NOTE(review): presumably the train / val / test shares, in that order,
# since the split object exposes X_train, X_val and X_test -- confirm in
# helpers.book_matrix.
ratios = (0.7, 0.15, 0.15)

# --- Build the feature matrices ---------------------------------------------
builder = S21UserBookMatrixBuilder(RATINGS_CSV, USERS_CSV)
split = builder.build_split(
    ratios,
    seed=SEED,
    min_age=MIN_AGE,
    max_age=MAX_AGE,
    top_n_books=TOP_N_BOOKS,
)

# Pull the pieces used by the later cells out of the split object.
# NOTE(review): `y` is a single object passed to both fitting and evaluation;
# its exact structure is defined by the builder -- confirm before slicing it.
X_train, X_val, X_test, y = split.X_train, split.X_val, split.X_test, split.y

# One manager instance is reused for every fit below.
pm = S21AgePipelineManager(random_state=SEED, n_iter=N_ITER)

In [3]:
# Fit the linear model on the training matrix without the optional third
# argument (other cells pass 'PCA' / 'UMAP' there), then capture the result.
# The evaluation cell labels this model "Ridge".
# NOTE(review): `pm` is shared by every fit cell, so get_linear() presumably
# returns whatever the most recent fit_linear() produced -- keep the two calls
# together and confirm in helpers.pipeline_manager.
pm.fit_linear(X_train, y)
lin_model, lin_params = pm.get_linear()

In [4]:
# Fit the linear model again on the same training matrix, this time passing
# 'PCA' as the third argument, and capture the result under its own names.
# The evaluation cell labels this model "Ridge+PCA".
# NOTE(review): 'PCA' presumably selects a dimensionality-reduction step inside
# the pipeline; the accepted values are defined by S21AgePipelineManager --
# confirm there.
pm.fit_linear(X_train, y, 'PCA')
lin_model_pca, lin_params_pca = pm.get_linear()

In [5]:
# Fit the linear model with 'UMAP' as the third argument and capture the
# result. The evaluation cell labels this model "Ridge+UMAP".
# NOTE(review): 'UMAP' presumably selects a dimensionality-reduction step
# inside the pipeline -- confirm in S21AgePipelineManager. The notebook's
# closing remark says this variant takes the longest to train.
pm.fit_linear(X_train, y, 'UMAP')
lin_model_umap, lin_params_umap = pm.get_linear()

In [6]:
# Fit the forest model on the training matrix without the optional third
# argument, then capture the result. The evaluation cell labels it "RF".
# NOTE(review): get_forest() presumably returns the result of the most recent
# fit_forest() on the shared `pm` -- keep the two calls together.
# NOTE(review): this cell uses the `rf_` prefix while the PCA / UMAP forest
# cells use `forest_`; the names are read by the evaluation cell, so rename
# them only together with it.
pm.fit_forest(X_train, y)
rf_model, rf_params = pm.get_forest()

In [None]:
# Fit the forest model with 'PCA' as the third argument and capture the
# result. The evaluation cell labels this model "RF+PCA".
# NOTE(review): 'PCA' presumably selects a dimensionality-reduction step inside
# the pipeline -- confirm in S21AgePipelineManager.
pm.fit_forest(X_train, y, 'PCA')
forest_model_pca, forest_params_pca = pm.get_forest()

In [None]:
# Fit the forest model with 'UMAP' as the third argument and capture the
# result. The evaluation cell labels this model "RF+UMAP".
# NOTE(review): 'UMAP' presumably selects a dimensionality-reduction step
# inside the pipeline -- confirm in S21AgePipelineManager.
pm.fit_forest(X_train, y, 'UMAP')
forest_model_umap, forest_params_umap = pm.get_forest()

In [None]:
# Display labels and the fitted models they describe, listed in matching order.
model_labels = ("Ridge", "Ridge+PCA", "Ridge+UMAP", "RF", "RF+PCA", "RF+UMAP")
fitted_models = (
    lin_model,
    lin_model_pca,
    lin_model_umap,
    rf_model,
    forest_model_pca,
    forest_model_umap,
)

# Tuple of (label, model) pairs, one per fitted model.
models = tuple(zip(model_labels, fitted_models))

# Report metrics for every model on the training and validation matrices;
# X_test is not passed to this call.
pm.evaluate(models, X_train, X_val, y)

Ridge [train] -> MAE: 7.745, RMSE: 10.396, R2: 0.415
Ridge [val]   -> MAE: 17.325, RMSE: 56.023, R2: -15.988
Ridge+PCA [train] -> MAE: 11.074, RMSE: 13.577, R2: 0.002
Ridge+PCA [val]   -> MAE: 11.085, RMSE: 13.588, R2: 0.001
Ridge+UMAP [train] -> MAE: 11.052, RMSE: 13.553, R2: 0.006
Ridge+UMAP [val]   -> MAE: 11.111, RMSE: 13.625, R2: -0.005
RF [train] -> MAE: 11.004, RMSE: 13.485, R2: 0.016
RF [val]   -> MAE: 11.073, RMSE: 13.579, R2: 0.002
RF+PCA [train] -> MAE: 9.403, RMSE: 11.756, R2: 0.252
RF+PCA [val]   -> MAE: 10.822, RMSE: 13.637, R2: -0.007
RF+UMAP [train] -> MAE: 10.216, RMSE: 12.612, R2: 0.139
RF+UMAP [val]   -> MAE: 11.094, RMSE: 13.591, R2: 0.000


Дольше всего обучается UMAP, затем идет PCA, а меньше всего времени занимает обучение без уменьшения размерности.