In [2]:
import torch
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import sys
import torch
from transformers import BertModel, BertTokenizer
sys.path.append("/home/glaurung/ai-ads")
sys.path.append("/home/glaurung/ai-ads/dlrm")
from dlrm import data_utils
import dlrm
from dlrm_s_pytorch import DLRM_Net
from dlrm_data_pytorch import RandomDataset
import numpy as np
import os
import pickle
import ad_copy_util
from torch.utils.data import DataLoader, TensorDataset
import common
import torch.nn as nn
import torch.nn.functional as F

In [4]:
df = pd.read_csv('../data/train/train_data.csv')


## Label Encode Categorical Data

In [6]:
# label_encoders = {
#     'ad_id': LabelEncoder(),
#     'device_type': LabelEncoder(),
#     'location': LabelEncoder(),
#     'browser': LabelEncoder(),  
#     'content_category': LabelEncoder(),
#     'ad_copy': LabelEncoder(),
#     'product_type': LabelEncoder(),
#     'ad_type': LabelEncoder(),
#     'time_of_day': LabelEncoder(),
#     'day_of_week': LabelEncoder(),
#     'interaction_type': LabelEncoder(),
#     'historical_ad_category': LabelEncoder()
# }
label_encoders = {
    'ad_id': LabelEncoder(),
    'location': LabelEncoder(),
    'product_type': LabelEncoder(),
    'ad_type': LabelEncoder(),
}

common.fit_label_encoders(label_encoders, df)

df_categorical = common.transform_with_label_encoders(label_encoders, df)


## Scale Continuous Data

In [9]:

# continuous_fields = ['age', 'site_visit_duration', 'time_spent_on_ad', 'pages_visited_this_session','ads_viewed_last_month', 'avg_time_spent_on_clicked_ads', 'site_visit_frequency']
continuous_fields = ['age']

common.fit_and_save_scaler(continuous_fields, df)

df_continuous = common.load_and_transform_scaler(continuous_fields, df)



In [10]:
# Format training features
categorical_features = [tuple(values) for values in df_categorical.to_numpy()]
continuous_features = [tuple(values) for values in df_continuous.to_numpy()]
target_feature = df['ad_clicked'].values


In [11]:
print("Categorical Features:")
print(categorical_features)

print("Continuous Features:")
print(continuous_features)

print("Target Feature")
print(target_feature)

print("Continuous features shape: ", np.array(continuous_features[0]).shape[0])
print("Categorical features shape: ", np.array(categorical_features[0]).shape[0])
print("Target feature: ", np.array(target_feature))
print(type(continuous_features[0]))

Categorical Features:
[(13, 0, 2, 2), (18, 3, 0, 0), (36, 4, 1, 2), (6, 3, 4, 0), (43, 3, 4, 0), (49, 4, 2, 1), (43, 4, 4, 1), (11, 2, 2, 1), (28, 1, 1, 1), (66, 4, 1, 2), (20, 4, 3, 2), (10, 3, 1, 2), (55, 4, 2, 0), (62, 2, 2, 0), (8, 3, 5, 1), (5, 1, 3, 3), (8, 1, 5, 1), (16, 1, 5, 1), (44, 2, 4, 1), (16, 4, 5, 2), (50, 3, 3, 3), (13, 4, 2, 3), (5, 2, 3, 2), (55, 3, 2, 3), (15, 3, 2, 2), (55, 0, 2, 3), (2, 2, 2, 0), (23, 2, 5, 1), (4, 4, 0, 2), (8, 2, 5, 2), (1, 2, 3, 3), (21, 4, 2, 1), (29, 3, 0, 3), (33, 1, 3, 2), (7, 1, 1, 0), (33, 2, 3, 0), (13, 1, 2, 0), (15, 1, 2, 3), (61, 0, 4, 1), (32, 2, 3, 1), (27, 1, 1, 3), (26, 3, 2, 3), (10, 2, 1, 3), (35, 2, 2, 2), (42, 4, 4, 0), (14, 0, 4, 0), (7, 0, 1, 1), (32, 3, 3, 3), (0, 1, 4, 0), (65, 3, 3, 2), (44, 4, 4, 3), (10, 1, 1, 2), (65, 4, 3, 1), (58, 0, 2, 3), (62, 0, 2, 2), (17, 4, 5, 3), (5, 2, 3, 3), (24, 3, 5, 3), (55, 4, 2, 3), (47, 0, 2, 2), (63, 0, 0, 1), (45, 0, 4, 3), (51, 3, 1, 2), (31, 4, 5, 0), (9, 1, 0, 3), (66, 4, 1, 2), (

### Concatenate Continuous features and ad copy embeddings

In [13]:
ad_copy_file = '../preprocessing/data/ad_copy.json'
embeddings_file = 'ad_copy_embeddings.pkl'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'rb') as file:
        ad_copy_embeddings_dict = pickle.load(file)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cpu")#"cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    ad_copy_embeddings_dict = common.generate_all_embeddings(ad_copy_file, model, tokenizer, device)

# Prepare continuous features with embeddings
continuous_features_flat = common.prepare_continuous_features_with_embeddings(df, df_continuous, ad_copy_embeddings_dict,'ad_copy')


In [14]:
category_cardinalities = []

# Loop through each category feature and calculate the cardinality
for column in label_encoders.keys():
    cardinality = len(df[column].unique())
    category_cardinalities.append(cardinality)

category_cardinalities_array = np.array(category_cardinalities)

print(category_cardinalities_array)

[68  5  6  4]


### Initialize DLRM

In [None]:
# # embedding_sizes,: Sizes of the embedding tables based on the cardinalities of the categorical features
# ln_emb = category_cardinalities_array

# original_m_spa = np.array(continuous_features[0]).shape[0]  # original number of continuous features
# ad_copy_embedding_size = 768  # size of each ad copy embedding

# # m_spa is the size of each embedding
# m_spa = original_m_spa #+ ad_copy_embedding_size

# ln_bot = np.array([m_spa])

# # ln_top = np.array([m_spa + embedding_size * len(categorical_features[0]), 16, 1])
# # ln_top = np.array([775, 600, 512, 400, 300, 200, 128, 64, 32, 16, 8, 1])
# # ln_top = np.array([m_spa, 16, 8, 1])
# ln_top = np.array([1])


# # Create DLRM model
# device = "cpu"
# model = DLRM_Net(
#     m_spa,
#     ln_emb,
#     ln_bot,
#     ln_top,
#     weighted_pooling="learned",
#     loss_function="bce",
#     arch_interaction_op="dot",
#     sigmoid_bot=-1,
#     sigmoid_top=len(ln_top) - 2,
# ).to(device)

In [None]:
# print(m_spa)
# print(ln_emb)
# print(ln_bot)
# print(ln_top)

#### Convert data to tensors

In [17]:
X_cat = torch.tensor(categorical_features, dtype=torch.long)
X_cont = torch.tensor(continuous_features, dtype=torch.float32)
Y = torch.tensor(target_feature, dtype=torch.float32).view(-1, 1)
dataset = TensorDataset(X_cont, X_cat, Y)

# Create dataset and data loader
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [18]:
# Fetch a single batch of data from DataLoader
x_cont, x_cat, y = next(iter(data_loader))
lS_o, lS_i = common.generate_offsets_and_indices_per_feature(x_cat)

# Print shapes of the batch data
print("Continuous features shape:", x_cont.shape)
print("Categorical features shape:", x_cat.shape)
print("Target features shape:", y.shape)

# You can also print the first few values to see the data
print("Sample continuous features:", x_cont[:2])
print("Sample categorical features:", x_cat[:2])
print("Sample target features:", y[:2])


Continuous features shape: torch.Size([8, 1])
Categorical features shape: torch.Size([8, 4])
Target features shape: torch.Size([8, 1])
Sample continuous features: tensor([[0.4510],
        [0.7059]])
Sample categorical features: tensor([[31,  4,  5,  2],
        [55,  3,  2,  1]])
Sample target features: tensor([[0.],
        [0.]])


In [None]:
import torch
from torch.utils.data import DataLoader

# Assuming you have the RandomDataset class defined as in your script

# Parameters for the dataset
m_den = 4  # number of dense features
ln_emb = [4,3,2]  # list containing the size of each embedding table
data_size = 100000  # total number of data points
mini_batch_size = 128  # size of each mini-batch
num_batches = data_size // mini_batch_size  # calculate the number of batches
num_indices_per_lookup = 5
num_indices_per_lookup_fixed = True

# Create instances of RandomDataset
train_data = RandomDataset(
    m_den=m_den,
    ln_emb=ln_emb,
    data_size=data_size,
    num_batches=num_batches,
    mini_batch_size=mini_batch_size,
    num_indices_per_lookup=num_indices_per_lookup,
    num_indices_per_lookup_fixed=num_indices_per_lookup_fixed
)

test_data = RandomDataset(
    m_den=m_den,
    ln_emb=ln_emb,
    data_size=data_size // 10,  # smaller size for the test set
    num_batches=num_batches // 10,
    mini_batch_size=mini_batch_size,
    num_indices_per_lookup=num_indices_per_lookup,
    num_indices_per_lookup_fixed=num_indices_per_lookup_fixed
)

# Define a collate function if required
def collate_fn(batch):
    # Implement this based on how you want to preprocess the batch data
    return batch

# Create Data Loaders
train_loader = DataLoader(
    train_data,
    batch_size=1,  # Because RandomDataset generates a batch at a time
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_data,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn
)

# Now you can iterate over train_loader and test_loader for training and evaluation


In [None]:
print(test_loader)

In [None]:
# Create DLRM model
device = "cpu"
m_spa = max(ln_emb)  # Embedding size (maximum of ln_emb)
ln_bot = np.array([m_den, 128, 64])  # Bottom MLP architecture
ln_top = np.array([512,256,1])  # Top MLP architecture
ln_emb = np.array(ln_emb)
model = DLRM_Net(
    m_spa=m_spa,
    ln_emb=ln_emb,
    ln_bot=ln_bot,
    ln_top=ln_top,
    weighted_pooling="learned",
    loss_function="bce",
    arch_interaction_op="dot",
    sigmoid_bot=-1,  # Sigmoid activation for the bottom MLP
    sigmoid_top=len(ln_top) - 2,  # Sigmoid activation for the last layer of top MLP
).to(device)


In [None]:
learning_rate = .08
criterion = torch.nn.BCEWithLogitsLoss(reduction="mean")
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 200
model_save_path = 'trained_model.pt'

## Training 

In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    num_samples = 0

    for batch in train_loader:  # Adjusted for RandomDataset output
        x_cont, lS_o, lS_i, y = batch[0]

        # Debugging: print the shapes of the inputs
        print("x_cont shape:", x_cont.shape)
        print("lS_o shapes:", [s_o.shape for s_o in lS_o])
        print("lS_i shapes:", [s_i.shape for s_i in lS_i])
        print("y shape:", y.shape)

        y_pred = model(x_cont, lS_o, lS_i)
        loss = criterion(y_pred, y.view(-1, 1))
        total_loss += loss.item()

        # Calculate accuracy
        predicted = torch.sigmoid(y_pred).round()
        total_correct += (predicted == y.view(-1, 1)).sum().item()
        num_samples += y.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / num_samples
    print(f"Epoch [{epoch+1}/{epochs}] - Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Validation Loop
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_samples = 0
    with torch.no_grad():
        for batch in test_loader:  # Adjusted for RandomDataset output
            x_cont, lS_o, lS_i, y = batch[0]
            y_pred_val = model(x_cont, lS_o, lS_i)
            loss_val = criterion(y_pred_val, y.view(-1, 1))
            val_loss += loss_val.item()

            # Calculate accuracy
            predicted_val = torch.sigmoid(y_pred_val).round()
            val_correct += (predicted_val == y.view(-1, 1)).sum().item()
            val_samples += y.size(0)

    avg_val_loss = val_loss / len(test_loader)
    val_accuracy = val_correct / val_samples
    print(f"Epoch [{epoch+1}/{epochs}] - Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


In [None]:
import pandas as pd

# Load the test data

# Calculate the percentage of 'ad_clicked' equals 1
percentage_clicked = (df['ad_clicked'].sum() / len(df)) * 100

print(f"Percentage of ads clicked (ad_clicked = 1): {percentage_clicked:.2f}%")


In [None]:
# After processing
print(df_continuous.describe())
for col in df_categorical.columns:
    print(df_categorical[col].value_counts())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Prepare data
X_train = np.hstack([df_continuous.to_numpy(), df_categorical.to_numpy()])
y_train = df['ad_clicked'].values
X_val = np.hstack([df_continuous_validation.to_numpy(), df_categorical_validation.to_numpy()])
y_val = df_validation['ad_clicked'].values

# Train a simple logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred_train = lr_model.predict(X_train)
y_pred_val = lr_model.predict(X_val)
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))


In [None]:
# Check the first few batches
for i, (x_cont, x_cat, y) in enumerate(data_loader):
    if i < 5:
        print("Batch", i, "Targets:", y.squeeze().numpy())
