In [None]:
# Standard Library
from itertools import chain

# Data Science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score, 
                             roc_auc_score, 
                             roc_curve, 
                             auc, 
                             confusion_matrix)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Deep Learning
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Single seed shared by the data split and every stochastic step below.
random_state = 42

# train_test_split receives the seed explicitly; NumPy and PyTorch have global
# generators that must be seeded too, otherwise weight initialisation, dropout
# and DataLoader shuffling differ on every "Restart & Run All".
np.random.seed(random_state)
torch.manual_seed(random_state)

In [None]:
# Load the raw dataset from a JSON file located in the working directory.
data = pd.read_json('dataset.json')

In [None]:
# Keep only the columns used below. The order matters: once "label" is dropped,
# later cells address the features by position (0 and 1 are scaled, 2 holds the
# per-basket item list).
data = data[["number_of_items", "price_of_basket", "items_ix", "label"]]

In [None]:
# We can split our dataset into a train, validation and test set and transform it to numpy arrays
# First split: 70% train / 30% hold-out, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(data.drop("label", axis=1).values, 
                                                    data["label"].values,
                                                    stratify=data["label"].values,
                                                    test_size=0.3, 
                                                    random_state=random_state)
# Second split: halve the hold-out into validation and test (15% each of the
# full dataset), again stratified on the label.
x_val, x_test, y_val, y_test = train_test_split(x_test, 
                                                y_test,
                                                stratify=y_test,
                                                test_size=0.5, 
                                                random_state=random_state)

# Report the resulting split sizes.
print("train size:", len(x_train))
print("validation size:", len(x_val))
print("test size:", len(x_test))

In [None]:
# Column positions of "number_of_items" and "price_of_basket" in the feature arrays.
subset_indices = [0, 1]

In [None]:
# Fit the scaler on the training split only and reuse the fitted transform for
# validation and test, so no statistics from held-out data reach the model.
scaler = MinMaxScaler()
x_train[:, subset_indices] = scaler.fit_transform(x_train[:, subset_indices])
x_val[:, subset_indices] = scaler.transform(x_val[:, subset_indices])
x_test[:, subset_indices] = scaler.transform(x_test[:, subset_indices])

In [None]:
# Every distinct value found in any "items_ix" list of the full dataset (all splits).
items_set = set(chain.from_iterable(data["items_ix"].values))

In [None]:
# Largest item index; the embedding tables below are sized as this value + 1.
np.max(list(items_set))

### Exploding our dataset

In [None]:
# Train
# `explode` turns every list in "basket_items" into one row per element and
# repeats the basket's label on each of those rows.
items_train = pd.DataFrame(x_train[:, 2], columns=["basket_items"])
items_train["label"] = y_train
items_train = items_train.explode("basket_items")

# Validation
# Same item-level expansion for the validation split.
items_val = pd.DataFrame(x_val[:, 2], columns=["basket_items"])
items_val["label"] = y_val
items_val = items_val.explode("basket_items")

In [None]:
class EmbeddingsClassifier(torch.nn.Module):
    """Embedding lookup followed by a one-unit linear layer and a sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.embeddings = torch.nn.Embedding(num_embeddings=vocab_size,
                                             embedding_dim=embedding_dim)
        self.linear = torch.nn.Linear(in_features=embedding_dim, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(linear(embeddings(x)))`` for a tensor of indices ``x``."""
        return self.sigmoid(self.linear(self.embeddings(x)))

In [None]:
def BCELoss_class_weighted(weights):
    """Build a binary cross-entropy loss with one weight per class.

    Args:
        weights: indexable pair; ``weights[0]`` scales the negative-class term
            and ``weights[1]`` the positive-class term.

    Returns:
        A callable ``loss(input, target)`` that returns the mean weighted BCE.
    """

    def loss(input, target):
        # Keep probabilities strictly inside (0, 1) so neither log diverges.
        probs = torch.clamp(input, min=1e-7, max=1-1e-7)
        positive_term = weights[1] * target * torch.log(probs)
        negative_term = (1 - target) * weights[0] * torch.log(1 - probs)
        return torch.mean(-(positive_term + negative_term))

    return loss

In [None]:
# Define our weights
# Index 0 weights the negative-class term of the loss, index 1 the positive-class term.
w = 0.05
weights = torch.tensor([w, 1-w], dtype=torch.float)

# Define our model, criterion and optimizer
# The embedding table needs one row per index from 0 up to the largest item index.
model = EmbeddingsClassifier(vocab_size=np.max(list(items_set)) + 1,
                             embedding_dim=5,)
criterion = BCELoss_class_weighted(weights=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [None]:
x_train = torch.from_numpy(pd.get_dummies(items_train, columns=["basket_items"], dtype=np.int16).values)
y_train = torch.from_numpy(items_train["label"].values.astype(np.int16)).long()
x_val = torch.from_numpy(pd.get_dummies(items_val, columns=["basket_items"], dtype=np.int16).values)
y_val = torch.from_numpy(items_val["label"].values.astype(np.int16)).long()

In [None]:
from helper import training_loop

train_dataloader = DataLoader(TensorDataset(x_train, y_train), batch_size=256, shuffle=True, num_workers=8, pin_memory=True)
val_dataloader = DataLoader(TensorDataset(x_val, y_val), batch_size=256, shuffle=True, num_workers=8, pin_memory=True)

model, training_loss, validate_loss, embeddings_hist = training_loop(
            model=model, 
            criterion=criterion, 
            optimizer=optimizer, 
            train_dataloader=train_dataloader, 
            val_dataloader=val_dataloader, 
            num_epochs=3,
            device=torch.device("cuda"))

### Using our trained embeddings

In [None]:
def return_one_hot_encoding(item_list: list, num_items: "int | None" = None) -> torch.Tensor:
    """Encode one basket as a 0/1 vector over the item vocabulary.

    Args:
        item_list: item indices contained in the basket. May be empty; a
            repeated index still produces a single 1.
        num_items: length of the returned vector. When omitted it defaults to
            the notebook-wide vocabulary size, i.e. the largest value in
            ``items_set`` plus one.

    Returns:
        A float tensor of shape ``(num_items,)`` with 1 at every index listed
        in ``item_list`` and 0 everywhere else.

    Raises:
        IndexError: if an index does not fit into ``num_items`` positions.
    """
    if num_items is None:
        num_items = int(max(items_set)) + 1
    one_hot_encoding = torch.zeros(num_items)
    positions = torch.as_tensor(list(item_list), dtype=torch.long)
    # One vectorised write instead of a Python-level loop over the items.
    one_hot_encoding[positions] = 1
    return one_hot_encoding

In [None]:
def return_torch_one_hot_encoding(x: np.ndarray) -> torch.Tensor:
    """Encode column 2 of ``x`` row by row and stack the results.

    Args:
        x: 2-D array whose column 2 holds one item list per row.

    Returns:
        The per-row outputs of ``return_one_hot_encoding`` stacked along a new
        leading dimension.
    """
    basket_column = x[:, 2]
    encoded_rows = []
    for basket in basket_column:
        encoded_rows.append(return_one_hot_encoding(basket))
    return torch.stack(encoded_rows)

In [None]:
# Define our Binary Classifier
class BinaryClassifier(torch.nn.Module):
    """Basket-level classifier combining item embeddings with regular features.

    The embeddings looked up for ``x_items`` are averaged per sample,
    concatenated with ``x_regular`` and passed through a one-hidden-layer MLP
    that ends in a sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        regular_dim: number of columns in ``x_regular``.
        hidden_dim: width of the hidden layer.
        dropout_prob: dropout probability applied to the hidden activations.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_dim: int,
                 regular_dim: int,
                 hidden_dim: int,
                 dropout_prob: float):
        # `super(BinaryClassifier).__init__()` never ran Module.__init__, so the
        # first sub-module assignment below raised AttributeError. The
        # zero-argument form binds to this instance correctly.
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(embedding_dim + regular_dim, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self,
                x_items: torch.Tensor,
                x_regular: torch.Tensor) -> torch.Tensor:
        """Return one probability per sample, shape ``(batch, 1)``.

        Args:
            x_items: integer tensor used as indices into the embedding table;
                the lookups are averaged over dimension 1.
            x_regular: float tensor concatenated to the averaged embeddings
                along dimension 1.

        NOTE(review): the notebook builds ``x_items`` from 0/1 multi-hot
        vectors cast to long, so only rows 0 and 1 of the table are ever looked
        up. Confirm whether item indices (or a mask-weighted average of the
        table rows) were intended.
        """
        embeddings = self.embedding(x_items)
        embeddings = torch.mean(embeddings, dim=1)
        combined = torch.cat([embeddings, x_regular], dim=1)
        x = F.relu(self.fc1(combined))
        # Regularise the hidden activations. Dropping the single output logit
        # (as before) just forced random predictions to exactly 0.5 in training.
        x = self.dropout(x)
        x = self.fc2(x)
        return self.sigmoid(x)

In [None]:
# Rule of thumb: embedding_dim = 4th root of vocab_size
# vocab_size is taken as the largest item index + 1, as for the models in this notebook.
print(round((np.max(list(items_set)) + 1)**(1/4)))

In [None]:
# Mean of the training labels, i.e. the positive-class share assuming 0/1 labels.
y_train.sum() / len(y_train)

In [None]:
def BCELoss_class_weighted(weights):
    """Return a class-weighted binary cross-entropy criterion.

    Args:
        weights: indexable pair ``(negative_weight, positive_weight)``.

    Returns:
        ``loss(input, target)``, which averages the weighted BCE over all
        elements of its arguments.
    """

    def loss(input, target):
        # Clamp away from exactly 0 and 1 so both logarithms stay finite.
        clipped = input.clamp(min=1e-7, max=1-1e-7)
        weighted_pos = weights[1] * target * clipped.log()
        weighted_neg = (1 - target) * weights[0] * (1 - clipped).log()
        return (-(weighted_pos + weighted_neg)).mean()

    return loss

In [None]:
# Define our weights
# Index 0 weights the negative-class term of the loss, index 1 the positive-class term.
w = 0.05
weights = torch.tensor([w, 1-w], dtype=torch.float)

# Define our model, criterion and optimizer
# regular_dim=2 matches the two scaled columns listed in subset_indices.
model = BinaryClassifier(vocab_size=np.max(list(items_set)) + 1,
                         embedding_dim=5,
                         regular_dim=2,
                         hidden_dim=100,
                         dropout_prob=0.5)
# criterion = torch.nn.BCELoss(weight=weights)
criterion = BCELoss_class_weighted(weights=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [None]:
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

In [None]:
# Each split becomes three tensors: the encoded item lists (long), the two
# scaled columns (float32) and the labels as a float32 column vector.
# This cell expects x_* / y_* to be NumPy arrays and rebinds y_* to tensors,
# so running it a second time fails on `.astype`.

# Train data
x_train_items = return_torch_one_hot_encoding(x_train).long()
x_train_regular = torch.from_numpy(x_train[:, subset_indices].astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)

# Validation data
x_val_items = return_torch_one_hot_encoding(x_val).long()
x_val_regular = torch.from_numpy(x_val[:, subset_indices].astype(np.float32))
y_val = torch.from_numpy(y_val.astype(np.float32)).view(-1, 1)

# Test data
x_test_items = return_torch_one_hot_encoding(x_test).long()
x_test_regular = torch.from_numpy(x_test[:, subset_indices].astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32)).view(-1, 1)

In [None]:
# Sanity-check the training tensor shapes before building the datasets.
print(x_train_items.shape)
print(x_train_regular.shape)
print(y_train.shape)

In [None]:
# Create our train and validation datasets
# Each sample is the triple (item encoding, regular features, label).
train_dataset = TensorDataset(x_train_items, x_train_regular, y_train)
val_dataset = TensorDataset(x_val_items, x_val_regular, y_val)

In [None]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
batch_size = 4096
shuffle = True
num_workers = 8
# Page-locked host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only machine merely triggers a warning.
pin_memory = torch.cuda.is_available()

train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=num_workers,
                              pin_memory=pin_memory)
# Validation keeps a fixed order. The reported loss is a mean of per-batch
# means, so with a smaller last batch a shuffled loader gives a slightly
# different value on every pass, which adds noise to early stopping.
val_dataloader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=num_workers,
                            pin_memory=pin_memory)

In [None]:
# Define our train step
def train() -> float:
    """Run one optimisation pass over ``train_dataloader``.

    Works on the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch in train_dataloader:
        x_items, x_regular, y = (tensor.to(device) for tensor in batch)

        # Forward pass
        optimizer.zero_grad()
        loss = criterion(model(x_items, x_regular), y)

        # Backward pass
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_dataloader)

# Define our validation step
def validate() -> float:
    """Compute the mean per-batch loss over ``val_dataloader``.

    Works on the notebook-level ``model``, ``criterion`` and ``device``. The
    model is put in evaluation mode and no parameters are updated.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    val_loss = 0

    # Nothing is back-propagated here, so skip autograd bookkeeping instead of
    # building (and holding in memory) a graph for every validation batch.
    with torch.no_grad():
        for x_items, x_regular, y in val_dataloader:
            x_items = x_items.to(device)
            x_regular = x_regular.to(device)
            y = y.to(device)

            # Forward pass
            y_pred = model(x_items, x_regular)
            loss = criterion(y_pred, y)
            val_loss += loss.item()

    return val_loss / len(val_dataloader)

In [None]:
# Define necessary variables
num_epochs = 25
best_loss = float('inf')
epochs_since_improvement = 0
patience = 8  # Number of epochs to wait for improvement

# Track metrics
train_losses = []
val_losses = []

model = model.to(device)
# `criterion` is a plain Python closure, so there is nothing to move to the device.

# One independent snapshot of the embedding table per epoch that did not
# trigger early stopping.
embedding_hist = []

for params in model.parameters():
    params.requires_grad = True

# Training loop
for epoch in range(num_epochs):
    train_loss = train()  # Perform training steps
    train_losses.append(train_loss)  # Track training loss

    # Calculate validation loss
    val_loss = validate()  # Perform validation steps
    val_losses.append(val_loss)  # Track validation loss

    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f}")

    # Check for improvement
    if val_loss < best_loss:
        best_loss = val_loss
        epochs_since_improvement = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        epochs_since_improvement += 1

    # Check if early stopping criteria met
    # `>=` stops after exactly `patience` epochs without improvement, which is
    # what the message reports; `>` waited one epoch longer.
    if epochs_since_improvement >= patience:
        print(f"Early stopping triggered. No improvement for {patience} epochs.")
        break

    # Store a copy: `weight.data` is the live parameter tensor, so appending it
    # directly made every history entry an alias of the current weights.
    embedding_hist.append(model.embedding.weight.detach().cpu().clone())

# Load the best model checkpoint
# map_location keeps the load working when the checkpoint was written on a
# different device than the one in use now.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

In [None]:
# Learning curves: per-epoch training loss against validation loss.
plt.plot(train_losses, label="train loss")
plt.plot(val_losses, label="validation loss")
plt.legend()
plt.show()

In [None]:
# Did the embedding table change between consecutive snapshots? Print a single
# boolean per pair (True = identical) instead of dumping a full element-wise
# comparison tensor for every epoch.
for i in range(len(embedding_hist) - 1):
    print(torch.equal(embedding_hist[i], embedding_hist[i + 1]))

In [None]:
# Score the test set with dropout explicitly disabled and without recording an
# autograd graph; this no longer relies on the mode left behind by validate().
model.eval()
with torch.no_grad():
    y_pred_proba = model(x_test_items.to(device), x_test_regular.to(device)).cpu().numpy()
# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
y_pred = np.where(y_pred_proba > 0.5, 1, 0).squeeze()
# y_test is left as the (n, 1) float tensor built earlier; it is not converted here.

In [None]:
# Classification metrics for the thresholded test-set predictions.
print("accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))
print("f1:", f1_score(y_test, y_pred))

In [None]:
# Confusion matrix for the same predictions (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# Shape of the classifier's embedding table as a NumPy array.
model.embedding.weight.detach().cpu().numpy().shape

In [None]:
def plot_pca(x: np.ndarray, labels: np.ndarray):
    """Scatter the first three principal components of ``x`` in 3-D.

    Prints the explained-variance ratio of the three components. All rows are
    drawn semi-transparent and coloured by ``labels``; rows with a non-zero
    label are drawn again, fully opaque, so they stand out.

    Args:
        x: 2-D array with one row per point and at least three columns.
        labels: one value per row of ``x``; non-zero marks a highlighted row.
    """
    # Instantiate our PCA object
    pca = PCA(n_components=3)

    # Fit our PCA object to the data
    pca_results = pca.fit_transform(x)
    print(pca.explained_variance_ratio_)

    # Create a dataframe for the reduced data
    pca_df = pd.DataFrame(data=pca_results, columns=["PC1", "PC2", "PC3"])

    # Collect the data
    xs = pca_df["PC1"]
    ys = pca_df["PC2"]
    zs = pca_df["PC3"]

    # Boolean mask of the highlighted rows. Indexing the Series with the raw
    # 0/1 values performed a label lookup of rows 0 and 1 over and over instead
    # of selecting the flagged subset.
    highlighted = np.asarray(labels).astype(bool)

    # Plot the data
    ax = plt.figure().add_subplot(projection='3d')
    ax.scatter(xs, ys, zs, c=labels, cmap="viridis", alpha=0.5)

    # Redraw only the highlighted subset, fully opaque and in a fixed colour.
    ax.scatter(xs[highlighted], ys[highlighted], zs[highlighted], color="red", alpha=1.0)

    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    plt.show()

In [None]:
# NOTE(review): `Basket` is neither defined nor imported anywhere in the visible
# notebook — add the missing import to the imports cell so a fresh kernel can run this.
basket = Basket()

In [None]:
# Flag every "Meat/Seafood" item with 1; all other entries stay 0.
# The vector is sized from the embedding table it is plotted against rather
# than a hard-coded item count, so the two cannot drift apart.
# Assumes an item's position in basket.items_list is its embedding index — TODO confirm.
labels = np.zeros(model.embedding.num_embeddings)
for item in basket.departments_to_items["Meat/Seafood"]:
    labels[basket.items_list.index(item)] = 1

In [None]:
# PCA view of the classifier's embedding table, coloured by `labels`.
plot_pca(model.embedding.weight.detach().cpu().numpy(), labels)

In [None]:
def plot_tsne(x: np.ndarray, labels: np.ndarray):
    """Scatter a 3-D t-SNE projection of ``x``, coloured by ``labels``.

    The projection is seeded with the notebook-level ``random_state`` so the
    figure is the same on every run.

    Args:
        x: 2-D array with one row per point.
        labels: one value per row of ``x``, used as the colour of each point.
    """
    # Instantiate our T-SNE object
    # t-SNE is stochastic; without a seed every run lays the points out differently.
    tsne = TSNE(n_components=3, learning_rate='auto', random_state=random_state)

    # Fit our T-SNE object to the data
    tsne_results = tsne.fit_transform(x)

    # Create a dataframe for the reduced data
    tsne_df = pd.DataFrame(data=tsne_results, columns=["dim1", "dim2", "dim3"])

    # Collect the data
    xs = tsne_df["dim1"]
    ys = tsne_df["dim2"]
    zs = tsne_df["dim3"]
    c = labels

    # Plot the data
    ax = plt.figure().add_subplot(projection='3d')
    ax.scatter(xs, ys, zs, c=c, cmap="viridis")

    ax.set_xlabel("dim1")
    ax.set_ylabel("dim2")
    ax.set_zlabel("dim3")
    plt.show()

In [None]:
# t-SNE view of the same embedding table, coloured by `labels`.
plot_tsne(model.embedding.weight.detach().cpu().numpy(), labels)