In [22]:
import torch
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import sys
import torch
from transformers import BertModel, BertTokenizer
sys.path.append("/home/glaurung/ai-ads")
sys.path.append("/home/glaurung/ai-ads/dlrm")
from dlrm import data_utils
import dlrm
import pickle
from dlrm_s_pytorch import DLRM_Net
import numpy as np
import ad_copy_util
from torch.utils.data import DataLoader, TensorDataset
import common

In [23]:
df_test = pd.read_csv('../data/test/test_data.csv')

##### this section is for POC model deployment. It's not useful for testing because you still need ctr for the users

In [24]:
# prompt = "Create a list of 10 catchy phrases that could be used in an advertisement for a new sports drink flavor called Jungle Torrent targeting 20 year old athletes."
# ad_copy_options = ['1. "Thirst to Win with Jungle Torrent!"', '2. "Start your Winning Streak with Jungle Torrent!" ', '3. "Outperform with Jungle Torrent!"', '4. "Hydrate to Dominate with Jungle Torrent!"', '5. "Stay Energized and Go the Distance with Jungle Torrent!"', '6. "Recharge with Jungle Torrent!"', '7. "Beat Your Best with Jungle Torrent!"', '8. "Go Wild with Jungle Torrent!"', '9. "Outpace the Competition with Jungle Torrent!"', '10. "Unlock Your Potential with Jungle Torrent!"']
# #ad_copy_util.generate_ad_copy_options(prompt, max_items=10, max_tokens=300, temperature=1)
# print(ad_copy_options)

# # Use BertModel to generate embeddings instead of OpenAI API to save time and credits.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("using ", device)
# model = model.to(device)
# model.eval()
# ad_copy_embeddings = ad_copy_util.generate_text_embeddings(ad_copy_options, model, tokenizer, device)
# for i, embedding in enumerate(ad_copy_embeddings):
#     print(f"Embedding {i+1} shape: {embedding}")

#### Prepare categorical test data

In [25]:
label_encoders = {
    'ad_id': LabelEncoder(),
    'device_type': LabelEncoder(),
    'location': LabelEncoder(),
    'browser': LabelEncoder(),  
    'content_category': LabelEncoder(),
    'ad_copy': LabelEncoder(),
    'product_type': LabelEncoder(),
    'ad_type': LabelEncoder(),
    'time_of_day': LabelEncoder(),
    'day_of_week': LabelEncoder(),
    'interaction_type': LabelEncoder(),
    'historical_ad_category': LabelEncoder()
}

df_categorical_test = common.transform_with_label_encoders(label_encoders, df_test)

categorical_features = [tuple(values) for values in df_categorical_test.to_numpy()]

#### Prepare continuous test data

In [26]:

continuous_fields = ['age', 'site_visit_duration', 'time_spent_on_ad', 'pages_visited_this_session','ads_viewed_last_month', 'avg_time_spent_on_clicked_ads', 'site_visit_frequency']

df_continuous = common.load_and_transform_scaler(continuous_fields, df_test)

continuous_features = [tuple(values) for values in df_continuous.to_numpy()]

target_feature = df_test['ad_clicked'].values

#### Retrieve ad copy embeddings

In [27]:
embeddings_file = 'ad_copy_embeddings.pkl'
with open(embeddings_file, 'rb') as file:
    ad_copy_embeddings_dict = pickle.load(file)
continuous_features_flat = common.prepare_continuous_features_with_embeddings(df_test, df_continuous, ad_copy_embeddings_dict,'ad_copy')


#### Initialize the DLRM

In [29]:
category_cardinalities = []

# Loop through each category feature and calculate the cardinality
for column in label_encoders.keys():
    cardinality = len(df_test[column].unique())
    category_cardinalities.append(cardinality)

category_cardinalities_array = np.array(category_cardinalities)

# embedding_sizes: the sizes of the embedding tables based on the cardinalities of the categorical features
ln_emb = category_cardinalities_array

# original number of continuous features
original_m_spa = np.array(continuous_features[0]).shape[0]

# size of each ad copy embedding
ad_copy_embedding_size = 768  

# m_spa is the size of each embedding
m_spa = original_m_spa + ad_copy_embedding_size

ln_bot = np.array([m_spa])

# ln_top = np.array([m_spa + embedding_size * len(categorical_features[0]), 16, 1])
ln_top = np.array([775, 16, 1])

device = "cpu"
model = DLRM_Net(
    m_spa,
    ln_emb,
    ln_bot,
    ln_top,
    arch_interaction_op="dot",
    sigmoid_bot=-1,
    sigmoid_top=len(ln_top) - 2,
).to(device)
model.load_state_dict(torch.load("/home/glaurung/ai-ads/trained_model.pt"))
model.to(device)
model.eval()

DLRM_Net(
  (emb_l): ModuleList(
    (0): EmbeddingBag(84, 775, mode='sum')
    (1): EmbeddingBag(3, 775, mode='sum')
    (2): EmbeddingBag(5, 775, mode='sum')
    (3): EmbeddingBag(4, 775, mode='sum')
    (4): EmbeddingBag(6, 775, mode='sum')
    (5): EmbeddingBag(84, 775, mode='sum')
    (6): EmbeddingBag(6, 775, mode='sum')
    (7-8): 2 x EmbeddingBag(4, 775, mode='sum')
    (9): EmbeddingBag(7, 775, mode='sum')
    (10): EmbeddingBag(3, 775, mode='sum')
    (11): EmbeddingBag(6, 775, mode='sum')
  )
  (bot_l): Sequential()
  (top_l): Sequential(
    (0): Linear(in_features=775, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=1, bias=True)
    (3): Sigmoid()
  )
  (loss_fn): BCELoss()
)

In [32]:
X_cat = torch.tensor(categorical_features, dtype=torch.long)
X_cont = torch.tensor(continuous_features_flat, dtype=torch.float32)
Y = torch.tensor(target_feature, dtype=torch.float32).view(-1, 1)
dataset = TensorDataset(X_cont, X_cat, Y)

# Create dataset and data loader
test_loader = DataLoader(dataset, batch_size=64, shuffle=True)

### Test Trained Model

In [34]:
# Test the Model
test_loss = 0.0
criterion = torch.nn.BCEWithLogitsLoss(reduction="mean")
with torch.no_grad():
    for x_cont, x_cat, y in test_loader:
        lS_o, lS_i = common.generate_offsets_and_indices_per_feature(x_cat)
        y_pred = model(x_cont, lS_o, lS_i)
        loss = criterion(y_pred, y)
        test_loss += loss.item()

avg_test_loss = test_loss / len(test_loader)
print(f"Average Test Loss: {avg_test_loss:.4f}")

Average Test Loss: 0.6951
