In [3]:
!pip install -q pandas scikit-learn

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import Dataset, DataLoader

In [5]:
df = pd.read_csv('ad_10000records.csv')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Timestamp,Clicked on Ad
0,62.26,32.0,69481.85,172.83,Decentralized real-time circuit,Lisafort,Male,Svalbard & Jan Mayen Islands,2016-06-09 21:43:05,0
1,41.73,31.0,61840.26,207.17,Optional full-range projection,West Angelabury,Male,Singapore,2016-01-16 17:56:05,0
2,44.4,30.0,57877.15,172.83,Total 5thgeneration standardization,Reyesfurt,Female,Guadeloupe,2016-06-29 10:50:45,0
3,59.88,28.0,56180.93,207.17,Balanced empowering success,New Michael,Female,Zambia,2016-06-21 14:32:32,0
4,49.21,30.0,54324.73,201.58,Total 5thgeneration standardization,West Richard,Female,Qatar,2016-07-21 10:54:35,1


**Preprocess Data (Use All Ads)**

In [6]:
# Encode Gender
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

# Normalize numerical features
scaler = MinMaxScaler()
df[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']] = scaler.fit_transform(
    df[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']]
)

# Encode Ad Topic Line (all ads)
le_ad = LabelEncoder()
df['Ad Topic Line'] = le_ad.fit_transform(df['Ad Topic Line'])

# Full ad pool
unique_ads = df['Ad Topic Line'].unique()
num_ads = len(unique_ads)
print(f"Number of unique ads: {num_ads}")

Number of unique ads: 559


In [17]:
import pickle

with open('le_ad.pkl', 'wb') as f:
    pickle.dump(le_ad, f)

with open('le_gender.pkl', 'wb') as f:
    pickle.dump(le_gender, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

**Generate User-Ad Interaction Dataset**

In [7]:
def generate_user_ad_pairs(df, negative_ratio=3):
    user_ad_pairs = []
    for idx, row in df.iterrows():
        user_features = row[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Gender']].values
        clicked_ad = row['Ad Topic Line']
        label = row['Clicked on Ad']

        # Positive sample
        user_ad_pairs.append((user_features, clicked_ad, 1))

        # Negative samples
        for _ in range(negative_ratio):
            negative_ad = np.random.choice(unique_ads[unique_ads != clicked_ad])
            user_ad_pairs.append((user_features, negative_ad, 0))

    return user_ad_pairs

pairs = generate_user_ad_pairs(df)
print(f"Total samples (pos + neg): {len(pairs)}")

Total samples (pos + neg): 40000


In [8]:
class AdRankingDataset(torch.utils.data.Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        user_feats, ad_id, label = self.pairs[idx]

        # Convert to numpy arrays with correct dtype if they are numpy object arrays
        if isinstance(user_feats, np.ndarray) and user_feats.dtype == np.object_:
            user_feats = user_feats.astype(np.float32)  # or float64 if you want

        # For ad_id, if it's an object array or a list, convert to int
        if isinstance(ad_id, (np.ndarray, list)) and np.array(ad_id).dtype == np.object_:
            ad_id = np.array(ad_id).astype(np.int64)
        elif not isinstance(ad_id, (int, np.integer)):
            ad_id = int(ad_id)

        return torch.tensor(user_feats, dtype=torch.float32), torch.tensor(ad_id, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

dataset = AdRankingDataset(pairs)
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64)

**Twin Tower Model Definition**

In [9]:
class TwinTowerModel(nn.Module):
    def __init__(self, user_input_dim, ad_vocab_size, embed_dim=32):
        super(TwinTowerModel, self).__init__()
        self.user_tower = nn.Sequential(
            nn.Linear(user_input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, embed_dim)
        )
        self.ad_embedding = nn.Embedding(ad_vocab_size, embed_dim)

    def forward(self, user_feats, ad_ids):
        user_emb = self.user_tower(user_feats)              # shape: (batch_size, embed_dim)
        ad_emb = self.ad_embedding(ad_ids)                   # shape: (batch_size, embed_dim)
        dot = (user_emb * ad_emb).sum(dim=1)                 # dot product per example
        prob = torch.sigmoid(dot)
        return prob

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = TwinTowerModel(user_input_dim=5, ad_vocab_size=num_ads, embed_dim=32).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for user_feats, ad_ids, labels in train_loader:
        user_feats, ad_ids, labels = user_feats.to(device), ad_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(user_feats, ad_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
torch.save(model.cpu(), 'twin_tower_model.pth')

Using device: cuda
Epoch 1/10, Loss: 0.6573
Epoch 2/10, Loss: 0.5166
Epoch 3/10, Loss: 0.4231
Epoch 4/10, Loss: 0.3909
Epoch 5/10, Loss: 0.3749
Epoch 6/10, Loss: 0.3639
Epoch 7/10, Loss: 0.3542
Epoch 8/10, Loss: 0.3465
Epoch 9/10, Loss: 0.3394
Epoch 10/10, Loss: 0.3332


**Model Evaluation (AUC Score)**

In [11]:
from sklearn.metrics import roc_auc_score

model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for user_feats, ad_ids, labels in test_loader:
        user_feats, ad_ids = user_feats.to(device), ad_ids.to(device)
        outputs = model(user_feats, ad_ids).cpu()
        all_preds.extend(outputs.numpy())
        all_labels.extend(labels.numpy())

auc = roc_auc_score(all_labels, all_preds)
print(f"AUC on test set: {auc:.4f}")

AUC on test set: 0.8634


**Candidate Retrieval + Ranking Function**

In [14]:
def recommend_topk_ads_for_user(model, user_row, top_k=5, candidate_k=100):
    model.eval()
    with torch.no_grad():
        # Extract features as floats (convert dtype)

        user_feats = torch.tensor(user_feats_np, dtype=torch.float32).to(device)
        user_feats = user_feats.unsqueeze(0).repeat(candidate_k, 1)  # repeat candidate_k times

        # Randomly sample candidate_k ads from full ad pool
        candidate_ads = np.random.choice(unique_ads, size=candidate_k, replace=False)
        ad_ids = torch.tensor(candidate_ads, dtype=torch.long).to(device)

        # Predict scores
        scores = model(user_feats, ad_ids)
        top_indices = torch.topk(scores, top_k).indices.cpu().numpy()
        top_ads = [le_ad.inverse_transform([candidate_ads[i]])[0] for i in top_indices]

        return top_ads, scores[top_indices].cpu().numpy()


**Demo - Recommend Ads for a Random User**

In [15]:
sample_user = df.sample(1).iloc[0]
print("User features:")
print(sample_user[['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Gender']])

top_ads, scores = recommend_topk_ads_for_user(model, sample_user)

print("\nTop recommended ads and scores:")
for i, (ad, score) in enumerate(zip(top_ads, scores), 1):
    print(f"{i}. {ad} (Score: {score:.4f})")

User features:
Daily Time Spent on Site     0.69385
Age                         0.634146
Area Income                 0.702372
Daily Internet Usage        0.422848
Gender                             0
Name: 9574, dtype: object

Top recommended ads and scores:
1. Intuitive exuding service-desk (Score: 0.8745)
2. Virtual bandwidth-monitored initiative (Score: 0.8637)
3. Digitized content-based circuit (Score: 0.7795)
4. Streamlined homogeneous analyzer (Score: 0.7135)
5. Robust uniform framework (Score: 0.6579)
