In [None]:
import catboost as cb
import polars as pl
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Data

In [None]:
# Column roles used by both models below.
# Continuous audio descriptors that are fed to the models unchanged
# (list order determines feature-column order downstream).
NUMERIC_FEATURES = [
    "acousticness", "energy", "loudness", "speechiness", "danceability",
    "instrumentalness", "liveness", "mode", "tempo", "valence",
]
# Discrete columns that get one-hot encoded for the PyTorch model
OHE_FEATURES = ["key", "time_signature"]
# Label column to predict
TARGET = "playlist_name"

In [None]:
# Read the audio-feature table and the playlist table from parquet
features = pl.read_parquet("features.parquet")
playlists = pl.read_parquet("playlists.parquet")

# Attach features to every playlist row, matching (track_id, track_name)
# against (id, track). It is a left join, so playlist rows without a
# matching feature row are kept.
raw = playlists.join(
    features,
    how="left",
    left_on=["track_id", "track_name"],
    right_on=["id", "track"],
)

In [None]:
def train_test_split(df: pl.DataFrame, train_fraction: float = 0.8) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Shuffle ``df`` with a fixed seed and split it into train and test parts.

    Args:
        df: Frame to split.
        train_fraction: Share of rows (0.0-1.0) assigned to the training part.

    Returns:
        ``(df_train, df_test)``; together they contain every row of ``df``
        exactly once, and the helper ``_row`` index column is removed.
    """
    # Note: also works with pl.LazyFrame
    # Every column is shuffled with the same seed; this relies on the identical
    # seed yielding the same permutation per column so rows stay intact.
    shuffled = df.with_columns(pl.all().shuffle(seed=1)).with_row_index("_row")

    # Compare against the row count, not the largest index (count - 1):
    # with the latter, train_fraction=1.0 would still push the last row
    # into the test part.
    in_train = pl.col("_row") < pl.len() * train_fraction

    df_train = shuffled.filter(in_train)
    df_test = shuffled.filter(~in_train)
    return df_train.drop("_row"), df_test.drop("_row")

# PyTorch

## Data

In [None]:
# One-hot encode the categorical features together with the target column
df = raw.to_dummies(columns=OHE_FEATURES + [TARGET])

In [None]:
# Get column info
# `to_dummies` names each indicator column "<source>_<value>" (default
# separator "_"). Matching on the source name *plus* the separator avoids
# picking up unrelated columns that merely share the prefix (e.g. a column
# called "keywords" when looking for "key").
feature_cols = NUMERIC_FEATURES + [
    c for ohe_col in OHE_FEATURES for c in df.columns if c.startswith(f"{ohe_col}_")
]
num_features = len(feature_cols)
class_cols = [c for c in df.columns if c.startswith(f"{TARGET}_")]
num_classes = len(class_cols)

num_features, num_classes

In [None]:
# Keep only the modelling columns, then shuffle and split 80/20
df_train, df_test = train_test_split(df.select(feature_cols + class_cols), 0.8)

# Feature matrix (X) and one-hot label matrix (y) for each part
X_train, y_train = df_train.select(feature_cols), df_train.select(class_cols)
X_test, y_test = df_test.select(feature_cols), df_test.select(class_cols)

# Torch datasets: the one-hot class columns are the label, every other column is a feature
ds_train = df_train.to_torch("dataset", dtype=pl.Float32, label=class_cols)
ds_test = df_test.to_torch("dataset", dtype=pl.Float32, label=class_cols)

# Sanity-check the shapes and preview the training features
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
X_train.head()

## Model

In [None]:
# Get device
# TODO: enable CUDA
# Preference order: CUDA GPU, then Apple MPS, then CPU. Any tensor passed to
# the model must be moved to this same device first.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Seed PyTorch's global RNG so weight initialisation (and anything else drawn
# from it afterwards, such as DataLoader shuffling) is repeatable across runs.
# The value matches the seed used by `train_test_split`.
torch.manual_seed(1)

# Define model
# Small fully connected classifier: num_features -> 16 -> 32 -> 16 -> num_classes.
# The last layer outputs raw logits (no softmax), which is what
# nn.CrossEntropyLoss expects.
model = nn.Sequential(
    nn.Linear(num_features, 16),
    nn.ReLU(),
    nn.Linear(16, 32),
    nn.ReLU(),
    # nn.Linear(32, 32),
    # nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, num_classes),
).to(device)

# Define criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model

In [None]:
# Specify training parameters
num_epochs = 16
batch_size = 16
batches_per_epoch = len(ds_train) // batch_size

# Initialize dataloaders
# Only the training loader is shuffled; evaluation order does not matter,
# so the test loader is kept deterministic.
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False)

# Train model
model.train()  # set to training mode
for epoch in range(num_epochs):
    with tqdm(total=len(dl_train), desc=f"Epoch {epoch + 1}", unit="batch") as bar:
        for X, y in dl_train:
            # The model was moved to `device` when it was built, so each
            # batch has to live on the same device before the forward pass.
            X, y = X.to(device), y.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            y_pred = model(X)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # Calculate accuracy (labels are one-hot, so compare argmax positions)
            acc = (torch.argmax(y_pred, 1) == torch.argmax(y, 1)).float().mean()

            # Update bar
            # `update` takes an increment, not an absolute position: advance
            # by exactly one batch per iteration.
            bar.update(1)
            bar.set_postfix({"CrossEntropyLoss": f"{loss.item():.3f}", "Accuracy": f"{acc.item():.3f}"})

    # TODO: save best weights based on test set: https://machinelearningmastery.com/building-a-multiclass-classification-model-in-pytorch/

In [None]:
# Prepare to score
model.eval()  # set to evaluation mode

# Targets go to the model's device so loss and accuracy are computed
# against predictions on the same device.
true_train = y_train.to_torch(dtype=pl.Float32).to(device)
true_test = y_test.to_torch(dtype=pl.Float32).to(device)

# Inference only: disable autograd so no graph is built for the
# full-dataset forward passes.
with torch.no_grad():
    pred_train = model(X_train.to_torch(dtype=pl.Float32).to(device))
    pred_test = model(X_test.to_torch(dtype=pl.Float32).to(device))

    # Score
    ce_train = criterion(pred_train, true_train)
    ce_test = criterion(pred_test, true_test)

# Accuracy: share of rows whose highest logit matches the one-hot label position
acc_train = (torch.argmax(pred_train, 1) == torch.argmax(true_train, 1)).float().mean()
acc_test = (torch.argmax(pred_test, 1) == torch.argmax(true_test, 1)).float().mean()

print(f"Train - CrossEntropyLoss: {ce_train:.3f}, Accuracy: {acc_train:.3f}")
print(f"Test - CrossEntropyLoss: {ce_test:.3f}, Accuracy: {acc_test:.3f}")

# CatBoost

## Data

In [None]:
# Get data: start again from a copy of the joined `raw` frame.
# Note that this rebinds `df`, which until here held the one-hot encoded
# frame used by the PyTorch section.
df = raw.clone()

In [None]:
# Column bookkeeping for CatBoost: features are used as-is (no one-hot
# encoding) and the target stays a single label column.
class_col = TARGET
feature_cols = [*NUMERIC_FEATURES, *OHE_FEATURES]
num_features = len(feature_cols)
num_classes = df.get_column(class_col).unique().len()

num_features, num_classes

In [None]:
# Keep only the modelling columns, then shuffle and split 80/20
df_train, df_test = train_test_split(df.select([*feature_cols, class_col]), 0.8)

# Convert each part to NumPy: a 2-D feature matrix and a 1-D label vector
X_train = df_train.select(feature_cols).to_numpy()
y_train = df_train.get_column(class_col).to_numpy()
X_test = df_test.select(feature_cols).to_numpy()
y_test = df_test.get_column(class_col).to_numpy()

# Sanity-check the shapes
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

## Model

In [None]:
# Build the CatBoost classifier: at most 1024 boosting iterations,
# early-stopping patience of 16 rounds, and no training log output.
# Note that this rebinds `model`, previously the PyTorch network.
model = cb.CatBoostClassifier(verbose=False, early_stopping_rounds=16, iterations=1024)

In [None]:
# Train model
# NOTE(review): the test split is also passed as the eval set, and the model
# is configured with early stopping, so the stopping point is chosen using
# the same data that is later reported as "Test" accuracy. That number is
# likely optimistic; consider a separate validation split.
model.fit(X_train, y_train, eval_set=(X_test, y_test))

In [None]:
# Prepare to score
# Flatten the predictions to 1-D so they align element-wise with the label
# vectors. `ravel()` works both when `predict` returns an (n, 1) column and
# when it returns a flat (n,) array, whereas `[:, 0]` fails on 1-D output.
pred_train = model.predict(X_train).ravel()
pred_test = model.predict(X_test).ravel()
true_train = y_train
true_test = y_test

# Score: fraction of rows whose predicted label equals the true label
acc_train = (pred_train == true_train).mean()
acc_test = (pred_test == true_test).mean()

print(f"Train - Accuracy: {acc_train:.3f}")
print(f"Test - Accuracy: {acc_test:.3f}")