In [None]:
import os

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from srai.datasets import AirbnbMulticityDataset
from srai.models import RegressionBaseModel, Vectorizer

In [None]:
# Load the multi-city Airbnb dataset (the value of the HF_TOKEN environment
# variable is passed to `load`) and keep only rows whose "city" is "paris".
gdf_airbnb = (
    AirbnbMulticityDataset()
    .load(os.getenv("HF_TOKEN"))
    .loc[lambda listings: listings["city"].isin(["paris"])]
)

In [None]:
# Columns handed to the vectorizer as `numerical_columns`.
# The order is kept as-is because it may determine feature order downstream.
numerical_columns = [
    "number_of_reviews",
    "minimum_nights",
    "availability_365",
    "calculated_host_listings_count",
    "number_of_reviews_ltm",
]

# Vectorizer over the filtered listings: "price" is the target column, the
# embedder is selected by name, and H3 resolution 8 is requested.
vectorizer = Vectorizer(
    gdf_dataset=gdf_airbnb,
    embedder_type="Hex2VecEmbedder",
    h3_resolution=8,
    numerical_columns=numerical_columns,
    target_column_name="price",
)

In [None]:
# Build the dataset from the vectorizer; later cells read its "X" and "y" entries.
dataset_airbnb = vectorizer.get_dataset()

In [None]:
# Split features and target 80/20 into train and test sets. The target is
# reshaped to a single column (-1, 1) before splitting. The split is shuffled;
# a fixed `random_state` makes the partition identical on every
# Restart & Run All (42 is an arbitrary but fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    dataset_airbnb["X"],
    dataset_airbnb["y"].reshape(-1, 1),
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Move every split to the vectorizer's device.
X_train, X_test, y_train, y_test = (
    split.to(vectorizer.device) for split in (X_train, X_test, y_train, y_test)
)

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 32

# Each loader wraps a list of (features, target) pairs. Only the training
# loader asks for shuffling; the test loader keeps the DataLoader default.
train_data = DataLoader(
    dataset=list(zip(X_train, y_train)),
    batch_size=batch_size,
    shuffle=True,
)
test_data = DataLoader(
    dataset=list(zip(X_test, y_test)),
    batch_size=batch_size,
)

In [None]:
# Sanity check: look at the first training batch only and print the shape of
# its features, then the shape of its targets (one per line).
for x, y in train_data:
    print(x.shape, y.shape, sep="\n")
    break

In [None]:
# Instantiate the baseline regression model, passing the size of X_train's
# second dimension as its only argument.
# NOTE(review): presumably this is the number of input features — confirm
# against RegressionBaseModel's signature.
regression_model = RegressionBaseModel(X_train.shape[1])
# Bare expression so the notebook displays the model's repr.
regression_model

In [None]:
# TODO: Implement the trainer part (training and evaluation of `regression_model`).