## Overview
### Two Tower Architecture
This model is based on the two-tower architecture
The two towers are used to learn representations of both the user and the item. The two-tower model is based on queries and the candidate they both shared a low-dimensional vector space. In our case, a query is customer and its transactions features and the candidate is the articles.

I will try to use mlflow to keep track of experiments (and improve myself with this tool).



![Alt text](https://miro.medium.com/v2/resize:fit:420/1*JbK2gjfLC4IFoM6AVWLaUQ.png)

![Alt text](https://miro.medium.com/v2/resize:fit:420/format:webp/0*aJHT3_bGIvERfhaY)

[image source](https://medium.com/smartnews-inc/user-behavior-sequence-for-items-recommendation-in-smartnews-ads-2376622f6192)



Source: 
- https://medium.com/smartnews-inc/user-behavior-sequence-for-items-recommendation-in-smartnews-ads-2376622f6192
- https://cloud.google.com/blog/products/ai-machine-learning/scaling-deep-retrieval-tensorflow-two-towers-architecture?hl=en

In [2]:
import pandas as pd
import numpy as np
import preprocessing
import torch

## Reading data and feature selections

In [3]:
articles_df = pd.read_csv("data/articles.csv", encoding="utf-8")
print("Shape before: ", articles_df.shape)
articles_df = preprocessing.preprocess_articles(articles_df)
print("Shape after: ", articles_df.shape)

Shape before:  (105542, 25)
Shape after:  (105542, 24)


In [4]:
customers_df = pd.read_csv("data/customers.csv", encoding="utf-8")
print("Shape before: ", customers_df.shape)
customers_df = preprocessing.preprocess_customers(customers_df)
print("Shape after: ", customers_df.shape)

Shape before:  (1371980, 7)
Shape after:  (1356119, 5)


In [5]:
transaction_df = pd.read_csv("data/transactions_train.csv", encoding="utf-8")
print("Shape before: ", transaction_df.shape)
transaction_df = preprocessing.preprocess_transactions(transaction_df)
print("Shape after: ", transaction_df.shape)

Shape before:  (31788324, 5)
Shape after:  (31788324, 11)


In [6]:
# join the three data sources to make the data compatible with out retrieval model.
df = pd.merge(
    pd.merge(
        transaction_df[["article_id", "customer_id", "t_dat", "price", "month_sin", "month_cos"]], articles_df[["article_id", "garment_group_name", "index_group_name"]], on="article_id", how="left"
    ),
    customers_df[["customer_id", "age", "club_member_status", "age_group"]],
    on="customer_id",
    how="left",
)

In [7]:
df.head(2)

Unnamed: 0,article_id,customer_id,t_dat,price,month_sin,month_cos,garment_group_name,index_group_name,age,club_member_status,age_group
0,663713001,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,2018-09-20,0.050831,-0.866025,-0.5,"Under-, Nightwear",Ladieswear,24.0,ACTIVE,19-25
1,541518023,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,2018-09-20,0.030492,-0.866025,-0.5,"Under-, Nightwear",Ladieswear,24.0,ACTIVE,19-25


In [8]:
df.shape

(31788324, 11)

## Train/Validate/Test split and Tensors

In [9]:
from sklearn.model_selection import train_test_split

seed = 2024
torch.manual_seed(seed)

<torch._C.Generator at 0x2269ec532f0>

In [10]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=seed)

In [11]:
print("Training shape: ", train_df.shape)
print("Validation shape: ", val_df.shape)

Training shape:  (28609491, 11)
Validation shape:  (3178833, 11)


In [12]:
# from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import LabelEncoder


# class CachedDataset(Dataset):
#     def __init__(self, df):
#         self.encoders = {}
#         self.data = {}

#         # Process each column based on its type
#         for col in df.columns:
#             # Get column data
#             col_data = df[col].values

#             # Handle different data types
#             if pd.api.types.is_numeric_dtype(df[col]) or col in ["article_id"]:
#                 # For numeric data, convert to float32
#                 self.data[col] = torch.tensor(col_data.astype(np.float32))

#             else:
#                 print("Col: ", col)
#                 # For categorical/string data, use label encoding
#                 self.encoders[col] = LabelEncoder()
#                 encoded_data = self.encoders[col].fit_transform(col_data)
#                 self.data[col] = torch.tensor(encoded_data, dtype=torch.long)

#     def __len__(self):
#         return len(next(iter(self.data.values())))

#     def __getitem__(self, idx):
#         return {key: value[idx] for key, value in self.data.items()}

#     def get_feature_dims(self):
#         """Return the number of unique values for each categorical feature"""
#         feature_dims = {}
#         for col, encoder in self.encoders.items():
#             feature_dims[col] = len(encoder.classes_)
#         return feature_dims

In [13]:
from torch.utils.data import Dataset, DataLoader


class CachedDataset(Dataset):
    def __init__(self, df):
        self.data = {col: torch.tensor(df[col].values) if pd.api.types.is_numeric_dtype(df[col]) else df[col].values for col in df.columns}

    def __len__(self):
        return len(next(iter(self.data.values())))

    def __getitem__(self, idx):
        return {key: value[idx] for key, value in self.data.items()}

In [14]:
BATCH_SIZE = 64  # 2048

# Create datasets
train_dataset = CachedDataset(train_df)
val_dataset = CachedDataset(val_df)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

In [15]:
train_dataset[0:2]

{'article_id': array(['898983001', '826760003'], dtype=object),
 'customer_id': array(['1a7980d01d2858df0d407e65140b0a50ef875c970cd65781a2b6251fca5e8861',
        '1f014e0075e19fa35dbd9f731f72b2b649571890b7ab3bdb164cb385d5cb1b72'],
       dtype=object),
 't_dat': array(['2020-05-18T00:00:00.000000000', '2020-01-17T00:00:00.000000000'],
       dtype='datetime64[ns]'),
 'price': tensor([0.0254, 0.0137], dtype=torch.float64),
 'month_sin': tensor([0.8660, 0.0000], dtype=torch.float64),
 'month_cos': tensor([-0.5000,  1.0000], dtype=torch.float64),
 'garment_group_name': array(['Shoes', 'Unknown'], dtype=object),
 'index_group_name': array(['Ladieswear', 'Ladieswear'], dtype=object),
 'age': tensor([32., 26.], dtype=torch.float64),
 'club_member_status': array(['ACTIVE', 'ACTIVE'], dtype=object),
 'age_group': ['26-35', '26-35']
 Categories (8, object): ['0-18' < '19-25' < '26-35' < '36-45' < '46-55' < '56-65' < '66-80' < '80+']}

In [16]:
user_id_list = train_df["customer_id"].unique().tolist()
item_id_list = train_df["article_id"].unique().tolist()

In [17]:
garment_group_list = train_df["garment_group_name"].unique().tolist()
index_group_list = train_df["index_group_name"].unique().tolist()

In [18]:
print(f"Number of transactions: {len(train_df):,}")
print(f"Number of users: {len(user_id_list):,}")
print(f"Number of items: {len(item_id_list):,}")
print(garment_group_list)

Number of transactions: 28,609,491
Number of users: 1,347,924
Number of items: 104,060
['Shoes', 'Unknown', 'Jersey Basic', 'Trousers', 'Jersey Fancy', 'Dressed', 'Under-, Nightwear', 'Knitwear', 'Skirts', 'Shirts', 'Trousers Denim', 'Outdoor', 'Swimwear', 'Dresses Ladies', 'Blouses', 'Accessories', 'Shorts', 'Socks and Tights', 'Special Offers', 'Dresses/Skirts girls', 'Woven/Jersey/Knitted mix Baby']


## Two Tower Model

- Query model: Generates a query representation given user and transaction features.
- Candidate model: Generates an item representation given item features.

**Both models produce embeddings that live in the same embedding space.**


For the query embedding I will use:
- `customer_id`: ID of the customer.
- `age`: age of the customer at the time of purchase.
- `month_sin`, `month_cos`: time of year the purchase was made.

For the candidate embedding I will use:
- `article_id`: ID of the item.
- `garment_group_name`: type of garment.
- `index_group_name`: menswear/ladieswear etc.





In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [20]:
from torch import nn
from typing import List

In [21]:
class StringLookup:

    def __init__(self, vocabulary: List[str], mask_token=None):
        self.vocab = {word: idx for idx, word in enumerate(vocabulary)}
        self.vocab["<UNK>"] = len(self.vocab)
        self.mask_token = mask_token

    def __call__(self, inputs: np.ndarray) -> torch.Tensor:
        # Convert string inputs to indices
        indices = [self.vocab.get(x, self.vocab["<UNK>"]) for x in inputs]
        return torch.tensor(indices, dtype=torch.long)

    def __len__(self):
        return len(self.vocab)


class StringEmbedding(nn.Module):
    def __init__(self, user_id_list: List[str], embedding_dimension: int):
        super().__init__()

        # Create the vocabulary space
        self.string_lookup = StringLookup(user_id_list)

        # The real embeddings
        self.embedding = nn.Embedding(num_embeddings=len(self.string_lookup), embedding_dim=embedding_dimension)

    def forward(self, user_ids: torch.Tensor) -> torch.Tensor:
        # Convert user IDs to indices
        return self.embedding(self.string_lookup(user_ids))

    # def forward(self, user_ids):
    #     user_indices = torch.tensor([self.user_to_index.get(user_id, self.user_to_index["<UNK>"]) for user_id in user_ids], device=self.embedding.weight.device)
    #     return self.embedding(user_indices)

In [22]:
from typing import Dict


class QueryTower(nn.Module):

    def __init__(self, user_id_list: List[str], embedding_dimension: int):
        super().__init__()

        self.user_embedding = StringEmbedding(user_id_list, embedding_dimension)
        self.age_normalization = nn.BatchNorm1d(1)

        self.dense_nn = nn.Sequential(nn.Linear(in_features=embedding_dimension + 3, out_features=embedding_dimension), nn.ReLU(), nn.Linear(embedding_dimension, embedding_dimension))

    def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
        user_embedding = self.user_embedding(inputs["customer_id"])
        normalized_age = self.age_normalization(inputs["age"].float().unsqueeze(1))
        month_sin = inputs["month_sin"].float().unsqueeze(1)
        month_cos = inputs["month_cos"].float().unsqueeze(1)

        concatenated_inputs = torch.cat([user_embedding, normalized_age, month_sin, month_cos], dim=1)

        outputs = self.dense_nn(concatenated_inputs)
        return outputs

In [23]:
QueryTower(user_id_list, 128)(train_dataset[0:2]).shape

torch.Size([2, 128])

The candidate model is very similar to the query model.

it has two categorical features which will be one-hot encode.

In [24]:
class ItemTower(nn.Module):

    def __init__(self, item_id_list: List[str], garment_group_list: List[str], index_group_list: List[str], embedding_dimension: int):
        super().__init__()
        self.item_embedding = StringEmbedding(item_id_list, embedding_dimension)

        # Garment group setup
        self.garment_group_lookup = StringLookup(vocabulary=garment_group_list)
        self.garment_group_size = len(garment_group_list)

        # Index group setup
        self.index_group_lookup = StringLookup(vocabulary=index_group_list)
        self.index_group_size = len(index_group_list)

        input_dim = embedding_dimension + self.garment_group_size + self.index_group_size
        self.dense_nn = nn.Sequential(nn.Linear(input_dim, embedding_dimension), nn.ReLU(), nn.Linear(embedding_dimension, embedding_dimension))

    def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:

        # Convert article_id strings to embeddings
        item_embedding = self.item_embedding(inputs["article_id"])

        garment_indices = self.garment_group_lookup(inputs["garment_group_name"])
        garment_one_hot = torch.zeros((garment_indices.size(0), self.garment_group_size), dtype=torch.float32)
        garment_one_hot.scatter_(1, garment_indices.unsqueeze(1), 1.0)

        # Convert index group strings to one-hot encodings
        index_indices = self.index_group_lookup(inputs["index_group_name"])
        index_one_hot = torch.zeros((index_indices.size(0), self.index_group_size), dtype=torch.float32)
        index_one_hot.scatter_(1, index_indices.unsqueeze(1), 1.0)

        # Concatenate all features
        concatenated = torch.cat([item_embedding, garment_one_hot, index_one_hot], dim=1)

        outputs = self.dense_nn(concatenated)

        return outputs

In [25]:
ItemTower(item_id_list, garment_group_list, index_group_list, 128)(train_dataset[0:2]).shape

torch.Size([2, 128])

In [26]:
import torch.nn.functional as F
from typing import Dict, Tuple
from tqdm import tqdm


class TwoTowerModel(nn.Module):
    def __init__(self, query_tower: nn.Module, item_tower: nn.Module):
        super().__init__()
        self.query_tower = query_tower
        self.item_tower = item_tower

    def forward(self, batch: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
        query_embeddings = self.query_tower(batch["query"])
        item_embeddings = self.item_tower(batch["candidate"])

        # Normalize embeddings
        query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
        item_embeddings = F.normalize(item_embeddings, p=2, dim=1)

        return query_embeddings, item_embeddings

    def compute_loss(self, query_embeddings: torch.Tensor, item_embeddings: torch.Tensor) -> torch.Tensor:
        # Compute similarity matrix
        similarities = torch.matmul(query_embeddings, item_embeddings.t())

        # Create labels for the batch (diagonal is positive pairs)
        labels = torch.arange(query_embeddings.size(0), device=query_embeddings.device)

        # Compute cross entropy loss
        loss = F.cross_entropy(similarities, labels)

        return loss

In [27]:
def create_item_embeddings_index(model: TwoTowerModel, unique_items_loader: DataLoader, device: torch.device) -> Tuple[torch.Tensor, np.ndarray]:
    """Create an index of all unique item embeddings."""
    model.eval()
    all_embeddings = []
    all_item_ids = []

    with torch.no_grad():
        for batch in tqdm(unique_items_loader, desc="Creating item embeddings index"):
            item_embeddings = model.item_tower(batch)
            item_embeddings = F.normalize(item_embeddings, p=2, dim=1)
            all_embeddings.append(item_embeddings.cpu())
            all_item_ids.extend(batch["article_id"])

    return torch.cat(all_embeddings, dim=0), np.array(all_item_ids)

In [28]:
def evaluate_top_k(model: TwoTowerModel, val_loader: DataLoader, item_embeddings: torch.Tensor, item_ids: np.ndarray, k: int = 100, device: torch.device = None) -> float:
    """Evaluate the model using top-k accuracy."""
    model.eval()
    total_hits = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            # Get query embeddings
            query_embeddings = model.query_tower(batch["query"])
            query_embeddings = F.normalize(query_embeddings, p=2, dim=1)

            # Compute similarities with all items
            similarities = torch.matmul(query_embeddings, item_embeddings.t())

            # Get top-k items
            _, top_k_indices = similarities.topk(k)

            # Get actual item IDs that were purchased
            actual_items = batch["candidate"]["article_id"]

            # Check if actual items are in top-k predictions
            for actual_item, top_k_idx in zip(actual_items, top_k_indices):
                predicted_items = item_ids[top_k_idx.cpu()]
                if actual_item in predicted_items:
                    total_hits += 1
                total_samples += 1

    return total_hits / total_samples

In [29]:
def train_epoch(model: TwoTowerModel, train_loader: DataLoader, optimizer: torch.optim.Optimizer, device: torch.device) -> float:
    """Train for one epoch."""
    model.train()
    total_loss = 0
    num_batches = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        # Move batch to device
        batch = {
            "query": {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch["query"].items()},
            "candidate": {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch["candidate"].items()},
        }

        # Forward pass
        query_embeddings, item_embeddings = model(batch)

        # Compute loss
        loss = model.compute_loss(query_embeddings, item_embeddings)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches

In [30]:
# Example usage
def train_and_evaluate(model: TwoTowerModel, train_loader: DataLoader, val_loader: DataLoader, unique_items_loader: DataLoader, num_epochs: int = 10, device: torch.device = None):
    """Train and evaluate the model."""
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(num_epochs):
        # Train
        train_loss = train_epoch(model, train_loader, optimizer, device)

        # Create item embeddings index
        item_embeddings, item_ids = create_item_embeddings_index(model, unique_items_loader, device)
        item_embeddings = item_embeddings.to(device)

        # Evaluate
        top_k_accuracy = evaluate_top_k(model, val_loader, item_embeddings, item_ids, k=100, device=device)

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Top-100 Accuracy: {top_k_accuracy:.4f}")

In [None]:
# Create dataset of unique items
unique_items_df = train_df.drop_duplicates("article_id")[["article_id", "garment_group_name", "index_group_name"]]
unique_items_dataset = CachedDataset(unique_items_df)
unique_items_loader = DataLoader(unique_items_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=11, pin_memory=True)

# Initialize model
model = TwoTowerModel(query_tower=QueryTower(user_id_list, embedding_dimension=32), item_tower=ItemTower(item_id_list, garment_group_list, index_group_list, embedding_dimension=32))

# Train and evaluate
train_and_evaluate(model=model, train_loader=train_loader, val_loader=val_loader, unique_items_loader=unique_items_loader, num_epochs=10)

Training:   0%|          | 0/447024 [00:00<?, ?it/s]