In [25]:
import torch
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import sys
import torch
from transformers import BertModel, BertTokenizer
sys.path.append("/home/glaurung/ai-ads")
sys.path.append("/home/glaurung/ai-ads/dlrm")
from dlrm import data_utils
import dlrm
from dlrm_s_pytorch import DLRM_Net
import numpy as np
import os
import pickle
import ad_copy_util
from torch.utils.data import DataLoader, TensorDataset
import common

In [26]:
df = pd.read_csv('../data/train/train_data.csv')
df_validation = pd.read_csv('../data/validation/validation_data.csv')

## Label Encode Categorical Data

In [27]:
label_encoders = {
    'ad_id': LabelEncoder(),
    'device_type': LabelEncoder(),
    'location': LabelEncoder(),
    'browser': LabelEncoder(),  
    'content_category': LabelEncoder(),
    'ad_copy': LabelEncoder(),
    'product_type': LabelEncoder(),
    'ad_type': LabelEncoder(),
    'time_of_day': LabelEncoder(),
    'day_of_week': LabelEncoder(),
    'interaction_type': LabelEncoder(),
    'historical_ad_category': LabelEncoder()
}

common.fit_label_encoders(label_encoders, df)

df_categorical = common.transform_with_label_encoders(label_encoders, df)
df_categorical_validation = common.transform_with_label_encoders(label_encoders, df_validation)

In [28]:
print(df_categorical_validation)

     ad_id_encoded  device_type_encoded  location_encoded  browser_encoded  \
0               49                    0                 3                0   
1               52                    1                 4                1   
2               60                    0                 3                3   
3               51                    1                 0                0   
4               41                    0                 3                2   
..             ...                  ...               ...              ...   
995             23                    2                 1                1   
996             38                    2                 3                3   
997             71                    0                 2                0   
998             83                    0                 1                2   
999             71                    1                 4                2   

     content_category_encoded  ad_copy_encoded  product_type_en

## Scale Continuous Data

In [29]:

continuous_fields = ['age', 'site_visit_duration', 'time_spent_on_ad', 'pages_visited_this_session','ads_viewed_last_month', 'avg_time_spent_on_clicked_ads', 'site_visit_frequency']

common.fit_and_save_scaler(continuous_fields, df)

df_continuous = common.load_and_transform_scaler(continuous_fields, df)
df_continuous_validation = common.load_and_transform_scaler(continuous_fields, df_validation)

In [30]:
# Format training features
categorical_features = [tuple(values) for values in df_categorical.to_numpy()]
continuous_features = [tuple(values) for values in df_continuous.to_numpy()]
target_feature = df['ad_clicked'].values

# Format validation features
categorical_features_validation = [tuple(values) for values in df_categorical_validation.to_numpy()]
continuous_features_validation = [tuple(values) for values in df_continuous_validation.to_numpy()]
target_feature_validation = df_validation['ad_clicked'].values

In [31]:
print("Categorical Features:")
print(categorical_features)

print("Continuous Features:")
print(continuous_features)

print("Target Feature")
print(target_feature)

print("Continuous features shape: ", np.array(continuous_features[0]).shape[0])
print("Categorical features shape: ", np.array(categorical_features[0]).shape[0])
print("Target feature: ", np.array(target_feature))
print(type(continuous_features[0]))

Categorical Features:
[(27, 1, 4, 3, 1, 4, 4, 1, 0, 4, 2, 3), (78, 0, 0, 3, 2, 10, 5, 3, 2, 4, 0, 4), (78, 1, 3, 1, 2, 10, 5, 1, 2, 5, 2, 5), (31, 1, 4, 2, 4, 54, 5, 3, 2, 3, 0, 4), (8, 2, 0, 1, 0, 55, 4, 0, 2, 1, 0, 5), (38, 0, 4, 1, 3, 39, 0, 1, 1, 6, 0, 1), (79, 2, 1, 0, 5, 9, 3, 3, 0, 2, 0, 4), (69, 0, 0, 1, 1, 77, 0, 1, 0, 2, 1, 1), (27, 2, 4, 2, 1, 4, 4, 3, 2, 6, 2, 1), (27, 2, 4, 0, 1, 4, 4, 0, 3, 5, 1, 3), (7, 0, 1, 3, 0, 17, 1, 2, 0, 4, 1, 0), (27, 0, 1, 0, 1, 4, 4, 2, 3, 4, 0, 3), (72, 0, 2, 2, 4, 47, 4, 0, 3, 4, 2, 1), (46, 2, 2, 3, 2, 79, 0, 3, 3, 2, 0, 4), (44, 0, 1, 3, 5, 53, 2, 0, 0, 2, 1, 4), (36, 0, 2, 0, 5, 76, 0, 3, 0, 3, 2, 5), (48, 1, 0, 2, 0, 57, 3, 2, 3, 5, 2, 2), (8, 1, 0, 0, 0, 55, 4, 0, 0, 4, 0, 4), (42, 2, 3, 3, 2, 33, 5, 3, 2, 4, 1, 3), (9, 1, 1, 2, 3, 41, 1, 2, 0, 5, 2, 0), (19, 0, 3, 3, 5, 28, 4, 2, 2, 5, 1, 0), (1, 0, 4, 2, 3, 83, 5, 0, 3, 1, 1, 3), (81, 0, 2, 0, 2, 18, 0, 2, 3, 2, 0, 1), (61, 1, 3, 3, 1, 48, 1, 2, 0, 1, 0, 4), (4, 2, 3, 0, 4, 63, 5, 0, 3

### Concatenate Continuous features and ad copy embeddings

In [32]:
ad_copy_file = '../preprocessing/data/ad_copy.json'
embeddings_file = 'ad_copy_embeddings.pkl'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'rb') as file:
        ad_copy_embeddings_dict = pickle.load(file)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cpu")#"cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    ad_copy_embeddings_dict = common.generate_all_embeddings(ad_copy_file, model, tokenizer, device)

# Prepare continuous features with embeddings
continuous_features_flat = common.prepare_continuous_features_with_embeddings(df, df_continuous, ad_copy_embeddings_dict,'ad_copy')
continuous_features_flat_validation = common.prepare_continuous_features_with_embeddings(df_validation, df_continuous_validation,ad_copy_embeddings_dict,'ad_copy')

In [33]:
# # Use BertModel to generate embeddings instead of OpenAI API to save time and credits.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("using ", device)
# model = model.to(device)
# model.eval()

# # Generate embeddings for each unique ad copy
# unique_ad_copies = df['ad_copy'].unique()
# ad_copy_embeddings_dict = {}
# for ad_copy in unique_ad_copies:
#     ad_copy_embeddings_dict[ad_copy] = ad_copy_util.generate_text_embeddings([ad_copy], model, tokenizer, device)

# # Create a new DataFrame for the embeddings
# df['ad_copy_embedding'] = df['ad_copy'].apply(lambda x: ad_copy_embeddings_dict[x])
# embeddings_df = pd.DataFrame(df['ad_copy_embedding'].tolist(), index=df.index)

# # Concatenate the continuous features with the embeddings
# df_continuous_with_embeddings = pd.concat([df_continuous, embeddings_df], axis=1)
# continuous_features = [tuple(values) for values in df_continuous_with_embeddings.to_numpy()]
# # Flatten the embedding arrays and concatenate with other continuous features
# continuous_features_flat = [tuple(list(values[:-1]) + values[-1].tolist()) for values in continuous_features]


In [34]:
category_cardinalities = []

# Loop through each category feature and calculate the cardinality
for column in label_encoders.keys():
    cardinality = len(df[column].unique())
    category_cardinalities.append(cardinality)

category_cardinalities_array = np.array(category_cardinalities)

print(category_cardinalities_array)

[84  3  5  4  6 84  6  4  4  7  3  6]


### Initialize DLRM

In [35]:
# embedding_sizes,: Sizes of the embedding tables based on the cardinalities of the categorical features
ln_emb = category_cardinalities_array

original_m_spa = np.array(continuous_features[0]).shape[0]  # original number of continuous features
ad_copy_embedding_size = 768  # size of each ad copy embedding

# m_spa is the size of each embedding
m_spa = original_m_spa + ad_copy_embedding_size

ln_bot = np.array([m_spa])

# ln_top = np.array([m_spa + embedding_size * len(categorical_features[0]), 16, 1])
ln_top = np.array([775, 16, 1])

# Create DLRM model
device = "cpu"
model = DLRM_Net(
    m_spa,
    ln_emb,
    ln_bot,
    ln_top,
    arch_interaction_op="dot",
    sigmoid_bot=-1,
    sigmoid_top=len(ln_top) - 2,
).to(device)

In [36]:
print(m_spa)
print(ln_emb)
print(ln_bot)
print(ln_top)

775
[84  3  5  4  6 84  6  4  4  7  3  6]
[775]
[775  16   1]


#### Convert data to tensors

In [37]:
X_cat = torch.tensor(categorical_features, dtype=torch.long)
X_cont = torch.tensor(continuous_features_flat, dtype=torch.float32)
Y = torch.tensor(target_feature, dtype=torch.float32).view(-1, 1)
dataset = TensorDataset(X_cont, X_cat, Y)

# Create dataset and data loader
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

X_cat_val = torch.tensor(categorical_features_validation, dtype=torch.long)
X_cont_val = torch.tensor(continuous_features_flat_validation, dtype=torch.float32)
Y_val = torch.tensor(target_feature_validation, dtype=torch.float32).view(-1, 1)
dataset_val = TensorDataset(X_cont_val, X_cat_val, Y_val)

# Create dataset and data loader
val_loader = DataLoader(dataset_val, batch_size=64, shuffle=True)

In [38]:
learning_rate = .001
criterion = torch.nn.BCEWithLogitsLoss(reduction="mean")
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 10
model_save_path = 'trained_model.pt'

## Training 

In [39]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0
    
    for x_cont, x_cat, y in data_loader:
        lS_o, lS_i = common.generate_offsets_and_indices_per_feature(x_cat)
        y_pred = model(x_cont, lS_o, lS_i)
        loss = criterion(y_pred, y)
        total_loss += loss.item()
        num_batches += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / num_batches
    print(f"Epoch [{epoch+1}/{epochs}] - Training Loss: {avg_loss:.4f}")

    # Validation Loop
    model.eval() 
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        for x_cont_val, x_cat_val, y_val in val_loader:
            lS_o_val, lS_i_val = common.generate_offsets_and_indices_per_feature(x_cat_val)
            y_pred_val = model(x_cont_val, lS_o_val, lS_i_val)
            loss_val = criterion(y_pred_val, y_val)
            val_loss += loss_val.item()
            val_batches += 1
    
    avg_val_loss = val_loss / val_batches
    print(f"Epoch [{epoch+1}/{epochs}] - Validation Loss: {avg_val_loss:.4f}")

torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Epoch [1/10] - Training Loss: 0.6992
Epoch [1/10] - Validation Loss: 0.6990
Epoch [2/10] - Training Loss: 0.6988
Epoch [2/10] - Validation Loss: 0.6988
Epoch [3/10] - Training Loss: 0.6984
Epoch [3/10] - Validation Loss: 0.6983
Epoch [4/10] - Training Loss: 0.6980
Epoch [4/10] - Validation Loss: 0.6979
Epoch [5/10] - Training Loss: 0.6976
Epoch [5/10] - Validation Loss: 0.6979
Epoch [6/10] - Training Loss: 0.6973
Epoch [6/10] - Validation Loss: 0.6973
Epoch [7/10] - Training Loss: 0.6970
Epoch [7/10] - Validation Loss: 0.6962
Epoch [8/10] - Training Loss: 0.6967
Epoch [8/10] - Validation Loss: 0.6967
Epoch [9/10] - Training Loss: 0.6964
Epoch [9/10] - Validation Loss: 0.6962
Epoch [10/10] - Training Loss: 0.6962
Epoch [10/10] - Validation Loss: 0.6959
Model saved to trained_model.pt
