In [None]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

from srai.datasets import AirbnbMulticityDataset
from srai.models import Evaluator, RegressionBaseModel, Trainer, Vectorizer

In [None]:
# Load the multi-city Airbnb dataset, passing along whatever the HF_TOKEN
# environment variable holds (os.getenv yields None when it is unset).
gdf_airbnb = AirbnbMulticityDataset().load(os.getenv("HF_TOKEN"))

# Restrict the frame to rows whose "city" value is "paris".
paris_rows = gdf_airbnb["city"].isin(["paris"])
gdf_airbnb = gdf_airbnb.loc[paris_rows]

In [None]:
# Columns handed to the Vectorizer as `numerical_columns`; the order is kept
# as-is because it may determine the order of the resulting features.
numerical_columns = [
    "number_of_reviews",
    "minimum_nights",
    "availability_365",
    "calculated_host_listings_count",
    "number_of_reviews_ltm",
]

# Configure the Vectorizer: "price" is the target column, and the spatial part
# is set to the "Hex2VecEmbedder" embedder type at H3 resolution 8.
vectorizer = Vectorizer(
    gdf_dataset=gdf_airbnb,
    numerical_columns=numerical_columns,
    target_column_name="price",
    h3_resolution=8,
    embedder_type="Hex2VecEmbedder",
)

In [None]:
# Ask the configured vectorizer for the dataset it produces.
dataset_airbnb = vectorizer.get_dataset()
# Size of the second dimension of the "X" entry; it is passed to
# RegressionBaseModel further down.
# NOTE(review): presumably "X" is a 2-D (samples, features) array/tensor — confirm.
embedding_size = dataset_airbnb["X"].shape[1]

In [None]:
# Hold out 20% of the sample indices for evaluation. A fixed random_state pins
# the shuffle so the same rows land in train and test on every
# Restart & Run All (without it the split changes per run).
train_indices, test_indices = train_test_split(
    range(len(dataset_airbnb)),
    test_size=0.2,
    random_state=42,
)

# Wrap the two index lists in Subset views over the same underlying dataset.
train_split = Subset(dataset_airbnb, train_indices)
test_split = Subset(dataset_airbnb, test_indices)

In [None]:
# Seed torch's global RNG before the model is built so that any torch-based
# random initialisation starts from the same state on every run.
# NOTE(review): this does not by itself guarantee bit-identical GPU results.
torch.manual_seed(42)

# The model is constructed from the feature dimension computed earlier.
regression_model = RegressionBaseModel(embedding_size)

In [None]:
# Use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# L1 loss, optimised with Adam over the model's parameters at lr=0.001.
loss_fn = nn.L1Loss()
optimizer = optim.Adam(regression_model.parameters(), lr=0.001)

# Settings passed to the Trainer as `training_args`.
# NOTE(review): "metric2look4" presumably names the metric the Trainer
# monitors — confirm against the Trainer implementation.
args = {
    "batch_size": 32,
    "task": "regression",
    "epochs": 10,
    "device": device,
    "metric2look4": "MAE",
}

# Bundle model, data splits, optimisation objects and settings into a Trainer.
trainer = Trainer(
    model=regression_model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_dataset=train_split,
    eval_dataset=test_split,
    training_args=args,
)

In [None]:
# Start training through the Trainer. The call's result is unpacked into three
# values; only the first is kept (as `model`), the other two are discarded.
# NOTE(review): the discarded values are presumably training/evaluation
# records — confirm against Trainer.train().
model, _, _ = trainer.train()

In [None]:
# Build an Evaluator for the regression task on the same device object that
# was placed in the training settings.
evaluator = Evaluator(device=device, task="regression")

In [None]:
# Evaluate the trained model on the held-out split. As the last expression of
# the cell, the returned value is shown as the cell's output.
evaluator.evaluate(model, test_split)