In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
import joblib
from model.DLRM_Net import DLRM_Net
from model.DLRM_Dataset import DLRM_Dataset
import torch.nn as nn
import common
import ad_copy_util
import numpy as np
import os
from sklearn.decomposition import PCA
import pickle
from transformers import BertModel, BertTokenizer

### Label Encode Categorical Features

In [20]:
df = pd.read_csv('../data/train/train_data.csv')

categorical_cols = ['location', 'product_type', 'ad_type']
label_encoders = {}
encoded_categorical_data = np.empty((df.shape[0], len(categorical_cols)))

for i, col in enumerate(categorical_cols):
    le = LabelEncoder()
    encoded_categorical_data[:, i] = le.fit_transform(df[col])
    label_encoders[col] = le

joblib.dump(label_encoders, 'model_artifacts/label_encoders.joblib')

['model_artifacts/label_encoders.joblib']

### Load and Generate Ad Copy Embeddings

In [21]:
ad_copy_file = '../preprocessing/data/ad_copy.json'
embeddings_file = 'model_artifacts/ad_copy_embeddings.pkl'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'rb') as file:
        ad_copy_embeddings_dict = pickle.load(file)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cpu")
    model.to(device)
    model.eval()
    ad_copy_embeddings_dict = common.generate_all_embeddings(ad_copy_file, model, tokenizer, device, 1024)
    
# this maps embeddings to ad_copy
embeddings_list = df['ad_copy'].map(ad_copy_embeddings_dict).tolist()
ad_copy_embeddings = np.vstack(embeddings_list)


In [22]:
print(ad_copy_embeddings)

[[ 0.11563431 -0.08019764 -0.14963417 ... -0.510401    0.13010667
   0.20229894]
 [-0.05821146  0.27160457  0.01331483 ... -0.5414637   0.13168468
   0.06821588]
 [ 0.07516024 -0.03162972  0.08582542 ... -0.4126556   0.19505903
   0.04414247]
 ...
 [-0.18419725 -0.173914   -0.14313711 ... -0.38819295 -0.29105833
   0.51544636]
 [-0.13988926 -0.13345717 -0.24335426 ... -0.48182994 -0.02182761
   0.3863272 ]
 [ 0.11424109 -0.03569167 -0.02830822 ... -0.33884987  0.06618364
   0.25962827]]


### Apply PCA to Ad Copy Embeddings

In [23]:
pca = PCA(n_components=30)
reduced_embeddings = pca.fit_transform(ad_copy_embeddings)

pca_model_path = 'model_artifacts/pca_model.pkl'
with open(pca_model_path, 'wb') as file:
    pickle.dump(pca, file)

In [24]:
print(reduced_embeddings)

[[-0.49451908 -1.2166103  -2.394447   ...  0.2286228   0.34895322
  -0.32854813]
 [ 3.160706   -2.648871   -1.7516261  ... -0.09235296  0.0289941
  -0.74444884]
 [ 2.2588067   2.4183207  -1.3575181  ... -0.50542367 -0.5700436
  -0.06037248]
 ...
 [ 0.37339655  3.7028925   1.5538087  ...  0.65753716  0.11671749
   0.44288847]
 [-2.6146228  -2.4884183   1.1762469  ...  0.41475865 -0.26969522
   0.49320617]
 [-0.9713625   2.0740376   1.8903064  ...  0.08653836 -0.4026144
  -0.32249522]]


### Scale Ad Copy Embeddings

In [25]:

embeddings_scaler = common.fit_and_save_scaler(reduced_embeddings, 'model_artifacts/embeddings_scaler.pkl')
scaled_ad_copy_embeddings = common.load_and_transform_scaler(reduced_embeddings, 'model_artifacts/embeddings_scaler.pkl')

scaled_embeddings_path = 'model_artifacts/scaled_ad_copy_embeddings.pkl'
with open(scaled_embeddings_path, 'wb') as file:
    pickle.dump(scaled_ad_copy_embeddings, file)

In [26]:
print(scaled_ad_copy_embeddings)

[[0.3006587  0.29868007 0.02697474 ... 0.5685825  0.64071965 0.32893556]
 [0.6406016  0.09449828 0.11362825 ... 0.4638026  0.49596602 0.13467014]
 [0.55672324 0.8168726  0.16675478 ... 0.3289591  0.22495353 0.45419925]
 ...
 [0.3813765  1.         0.5592073  ... 0.70859796 0.53565323 0.68927026]
 [0.10348502 0.11737227 0.5083113  ... 0.62934494 0.36083508 0.71277344]
 [0.25631136 0.7677919  0.6045679  ... 0.52220017 0.3007007  0.33176285]]


### Scale Continuous Fields

In [27]:
continuous_fields = ['age', 'site_visit_frequency']
df_continuous = df[continuous_fields]
continuous_scaler = common.fit_and_save_scaler(df_continuous, 'model_artifacts/continuous_scaler.pkl')
scaled_continuous_features = common.load_and_transform_scaler(df_continuous, 'model_artifacts/continuous_scaler.pkl')



In [28]:
print("Original shape:", ad_copy_embeddings.shape)
print("Reduced shape:", reduced_embeddings.shape)

Original shape: (8000, 768)
Reduced shape: (8000, 30)


### Combine Categorical, Continuous, and Embedding Data

In [29]:
combined_features = np.hstack((scaled_continuous_features, scaled_ad_copy_embeddings, encoded_categorical_data))
print(combined_features)

[[0.37254902 0.17820526 0.3006587  ... 0.         2.         2.        ]
 [0.62745098 0.08605793 0.64060158 ... 3.         0.         0.        ]
 [0.03921569 0.08068623 0.55672324 ... 4.         1.         2.        ]
 ...
 [0.41176471 0.20290479 0.3813765  ... 4.         0.         2.        ]
 [0.21568627 0.18243169 0.10348502 ... 2.         3.         3.        ]
 [0.33333333 0.05982505 0.25631136 ... 1.         1.         1.        ]]


In [30]:
print("Shape of df_continuous_array:", scaled_continuous_features.shape)
print("Shape of embeddings_array:", scaled_ad_copy_embeddings.shape)
print("Shape of combined_features:", combined_features.shape)

Shape of df_continuous_array: (8000, 2)
Shape of embeddings_array: (8000, 30)
Shape of combined_features: (8000, 35)


In [31]:
# Sample a row index to examine
row_index = 5

original_row = df.iloc[row_index][continuous_fields].values
print("Original row:", original_row)

# Get the corresponding row from the combined array
combined_row = combined_features[770, :len(continuous_fields)]
print("Combined array row:", combined_row)
print(combined_features[:5, :])

Original row: [59 4.226534379409992]
Combined array row: [0.11764706 0.37470094]
[[3.72549020e-01 1.78205264e-01 3.00658703e-01 2.98680067e-01
  2.69747376e-02 4.18405771e-01 0.00000000e+00 9.99999166e-01
  7.18948305e-01 4.18236554e-02 2.68972397e-01 2.67280817e-01
  2.67930508e-01 1.58325344e-01 5.46252251e-01 2.01159701e-01
  5.19606650e-01 5.95319510e-01 5.67930698e-01 2.35751793e-01
  4.50963259e-01 5.48844278e-01 2.91155517e-01 1.01641893e-01
  4.90868390e-01 2.11466908e-01 1.56110346e-01 2.52998233e-01
  6.54318511e-01 5.68582475e-01 6.40719652e-01 3.28935564e-01
  0.00000000e+00 2.00000000e+00 2.00000000e+00]
 [6.27450980e-01 8.60579296e-02 6.40601575e-01 9.44982767e-02
  1.13628253e-01 3.04740667e-01 1.87279791e-01 7.49659657e-01
  2.83815384e-01 0.00000000e+00 4.21317071e-01 3.66452634e-01
  9.71843004e-02 2.95820594e-01 4.41854030e-01 5.70750535e-02
  6.51248574e-01 7.06772327e-01 2.63707101e-01 3.18650544e-01
  2.80897141e-01 4.63493943e-01 1.21021986e-01 5.50440133e-01
  2

In [32]:
print(df.head())

                                user_id                                 ad_id  \
0  6461380c-f03c-432d-abf8-d4f9fb1f4e28  31516856-be90-4155-b681-87c31fa946b8   
1  8d58fb8a-5656-408d-9bf0-330fee82bf12  407fe646-bb2d-4f25-a630-606450b83434   
2  db166483-dd99-4420-8265-2981477a359d  7f400246-d613-4d91-89eb-fa3e301ea58b   
3  a50ad23e-90ed-4845-b62f-c849263eeb90  16feceb3-178f-41b5-b057-9b765543f843   
4  1d9d9266-5cd3-4b25-be2d-8fd74af7cd28  98736a79-cd33-490e-b18b-7f000b2796be   

   age       location                                            ad_copy  \
0   37         Africa  Captivate the Crowd with your Yeet Charisma Co...   
1   50  North America  Captivate the Crowd with Yeet Charisma Couture...   
2   20  South America  Experience Cinematic Brilliance at Home with F...   
3   33  North America  Efficiency Meets Elegance with Aurora Hybrid.B...   
4   21  North America  Experience Unmatched Elegance with Serenity Lu...   

  product_type  ad_clicked  ad_type  pages_visited_this_

### Verify Target Data Distribution

In [33]:
# Calculate the percentage of 'ad_clicked' equals 1
percentage_clicked = (df['ad_clicked'].sum() / len(df)) * 100

print(f"Percentage of ads clicked (ad_clicked = 1): {percentage_clicked:.2f}%")

Percentage of ads clicked (ad_clicked = 1): 41.04%


### Initialize Model

In [34]:
                      # Number of continuous features + ad_copy_embeddings length
num_dense_features = len(scaled_continuous_features[0]) + scaled_ad_copy_embeddings.shape[1]  
cat_embedding_sizes = [len(label_encoders[col].classes_) for col in categorical_cols]

model = DLRM_Net(num_dense_features=num_dense_features, cat_embedding_sizes=cat_embedding_sizes)
model_save_path = 'model_artifacts/trained_model.pt'
learning_rate = 0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
X = torch.tensor(combined_features, dtype=torch.float32)
y = torch.tensor(df['ad_clicked'].to_numpy(), dtype=torch.float32).unsqueeze(1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = DLRM_Dataset(X_train, y_train)
val_dataset = DLRM_Dataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

### Train Model

In [36]:
num_epochs = 500
num_continuous_features = 2
num_embedding_features = len(scaled_ad_copy_embeddings[0])
num_categorical_features = 3 

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for features, labels in train_loader:
        optimizer.zero_grad()
        x_dense = features[:, :num_continuous_features + num_embedding_features]
        x_cat = features[:, num_continuous_features + num_embedding_features:num_continuous_features + num_embedding_features + num_categorical_features]
        outputs = model(x_dense, x_cat)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for features, labels in val_loader:
            x_dense = features[:, :num_continuous_features + num_embedding_features]
            x_cat = features[:, num_continuous_features + num_embedding_features:num_continuous_features + num_embedding_features + num_categorical_features]
            outputs = model(x_dense, x_cat)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch [1/500], Training Loss: 0.6811, Validation Loss: 0.6870
Epoch [2/500], Training Loss: 0.6766, Validation Loss: 0.6848


Epoch [3/500], Training Loss: 0.6795, Validation Loss: 0.6831
Epoch [4/500], Training Loss: 0.6780, Validation Loss: 0.6817
Epoch [5/500], Training Loss: 0.6762, Validation Loss: 0.6808
Epoch [6/500], Training Loss: 0.6730, Validation Loss: 0.6801
Epoch [7/500], Training Loss: 0.6740, Validation Loss: 0.6796
Epoch [8/500], Training Loss: 0.6723, Validation Loss: 0.6782
Epoch [9/500], Training Loss: 0.6720, Validation Loss: 0.6753
Epoch [10/500], Training Loss: 0.6667, Validation Loss: 0.6694
Epoch [11/500], Training Loss: 0.6590, Validation Loss: 0.6592
Epoch [12/500], Training Loss: 0.6456, Validation Loss: 0.6450
Epoch [13/500], Training Loss: 0.6309, Validation Loss: 0.6272
Epoch [14/500], Training Loss: 0.6092, Validation Loss: 0.6018
Epoch [15/500], Training Loss: 0.5757, Validation Loss: 0.5722
Epoch [16/500], Training Loss: 0.5321, Validation Loss: 0.5413
Epoch [17/500], Training Loss: 0.5082, Validation Loss: 0.5093
Epoch [18/500], Training Loss: 0.4742, Validation Loss: 0.5009