In [None]:
import torch
from model.DLRM_Net import DLRM_Net
from model.DLRM_Dataset import DLRM_Dataset
from torch.utils.data import DataLoader
import joblib
import pandas as pd
import torch.nn as nn
import numpy as np
import common
import os
import pickle
from sklearn.decomposition import PCA

In [26]:
df_test = pd.read_csv('../data/test/test_data.csv')

### Generate or Load Ad Copy Embeddings for Test Data

In [27]:
# Load or generate ad copy embeddings
embeddings_file = 'model_artifacts/ad_copy_embeddings.pkl'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'rb') as file:
        ad_copy_embeddings_dict = pickle.load(file)
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cpu")
    model.to(device)
    model.eval()
    ad_copy_embeddings_dict = common.generate_all_embeddings(embeddings_file, model, tokenizer, device)

# Map embeddings to ad_copy
embeddings_list = df_test['ad_copy'].map(ad_copy_embeddings_dict).tolist()
ad_copy_embeddings = np.vstack(embeddings_list)

# Load PCA model and transform embeddings to reduce dimensions
pca_model_path = 'model_artifacts/pca_model.pkl'
with open(pca_model_path, 'rb') as file:
    pca = pickle.load(file)
reduced_embeddings = pca.transform(ad_copy_embeddings)

# Scale the reduced embeddings
scaled_ad_copy_embeddings = common.load_and_transform_scaler(reduced_embeddings, 'model_artifacts/embeddings_scaler.pkl')

### Load Encoders and Label Encode Categorical Columns

In [30]:
categorical_cols = ['location', 'product_type', 'ad_type']

# Load the saved label encoders
label_encoders = joblib.load('model_artifacts/label_encoders.joblib')

# Apply the label encoders to the test data
encoded_categorical_data = np.empty((df_test.shape[0], len(categorical_cols)))

for i, col in enumerate(categorical_cols):
    # Use the loaded label encoder to transform the data
    encoded_categorical_data[:, i] = label_encoders[col].transform(df_test[col])



### Load Continuous Scaler and Scale Continuous Columns

In [31]:
continuous_fields = ['age', 'site_visit_frequency']
df_continuous = df_test[continuous_fields]
scaled_continuous_features = common.load_and_transform_scaler(df_continuous, 'model_artifacts/continuous_scaler.pkl')

In [32]:
combined_features = np.hstack((scaled_continuous_features, scaled_ad_copy_embeddings, encoded_categorical_data))

### Apply Encoders and Scaler to Test Data

In [34]:
# Convert to tensors suitable for training
X = torch.tensor(combined_features, dtype=torch.float32)
y = torch.tensor(test_df['ad_clicked'].to_numpy(), dtype=torch.float32).unsqueeze(1)

test_dataset = DLRM_Dataset(X, y)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Verify Target Data Distribution

In [35]:
import pandas as pd

# Calculate the percentage of 'ad_clicked' equals 1
percentage_clicked = (df_test['ad_clicked'].sum() / len(df_test)) * 100

print(f"Percentage of ads clicked (ad_clicked = 1): {percentage_clicked:.2f}%")


Percentage of ads clicked (ad_clicked = 1): 40.50%


### Initialize Model

In [37]:
                      # Number of continuous features + ad_copy_embeddings length
num_dense_features = len(scaled_continuous_features[0]) + scaled_ad_copy_embeddings.shape[1]  
cat_embedding_sizes = [len(label_encoders[col].classes_) for col in categorical_cols]

model = DLRM_Net(num_dense_features=num_dense_features, cat_embedding_sizes=cat_embedding_sizes)
model.load_state_dict(torch.load('model_artifacts/trained_model.pt'))
model.eval()
criterion = nn.BCELoss()

### Test Model

In [38]:
correct_predictions = 0
incorrect_predictions = 0

num_continuous_features = 2  
num_embedding_features = len(scaled_ad_copy_embeddings[0])
num_categorical_features = 3

with torch.no_grad():
    for features, labels in test_loader:
        x_dense = features[:, :num_continuous_features + num_embedding_features]
        x_cat = features[:, num_continuous_features + num_embedding_features:num_continuous_features + num_embedding_features + num_categorical_features]

        outputs = model(x_dense, x_cat)
        predicted = (outputs > 0.5).float().view(-1)

        corrects = (predicted == labels.view(-1)).sum().item()
        incorrects = (predicted != labels.view(-1)).sum().item()

        correct_predictions += corrects
        incorrect_predictions += incorrects

    print(f"Total Correct Predictions: {correct_predictions}")
    print(f"Total Incorrect Predictions: {incorrect_predictions}")

Total Correct Predictions: 1640
Total Incorrect Predictions: 360


In [39]:
from sklearn.metrics import precision_score, f1_score, roc_auc_score

total_loss = 0.0
total_correct = 0
num_samples = 0
all_predictions = []
all_actuals = []
all_probabilities = []

num_continuous_features = 2
num_embedding_features = len(scaled_ad_copy_embeddings[0])
num_categorical_features = 3 

with torch.no_grad():
    for features, labels in test_loader:
        x_dense = features[:, :num_continuous_features + num_embedding_features]
        x_cat = features[:, num_continuous_features + num_embedding_features:num_continuous_features + num_embedding_features + num_categorical_features]

        # Forward pass
        outputs = model(x_dense, x_cat)

        predicted = (outputs > 0.5).float().view(-1)

        # Store probabilities for ROC AUC calculation
        all_probabilities.extend(outputs.view(-1).numpy())

        correct = (predicted == labels.view(-1)).sum().item()
        total_correct += correct
        num_samples += labels.size(0)

        all_predictions.extend(predicted.numpy())
        all_actuals.extend(labels.view(-1).numpy())

        loss = criterion(outputs, labels)
        total_loss += loss.item()

avg_loss = total_loss / len(test_loader)
test_accuracy = total_correct / num_samples

print(f"Loss: {avg_loss:.4f}, Accuracy: {test_accuracy:.4f}")

all_predictions = np.array(all_predictions)
all_actuals = np.array(all_actuals)
all_probabilities = np.array(all_probabilities)

precision = precision_score(all_actuals, all_predictions)
f1 = f1_score(all_actuals, all_predictions)
roc_auc = roc_auc_score(all_actuals, all_probabilities)

print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Loss: 6.6578, Accuracy: 0.8200
Precision: 0.7409
F1 Score: 0.7936
ROC AUC Score: 0.8714
