In [30]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
import clip
from sklearn.preprocessing import StandardScaler
import hdbscan
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from IPython.display import display

In [31]:
# ==== STEP 1: LOAD DATA ====
# Load clothing dataset
clothing_excel_path = "C:/Users/megdy/Desktop/stat 4830/STAT-4830-vllm-project/two_tower/clothing_data.xlsx"
df_items = pd.read_excel(clothing_excel_path)
df_items.drop(["image", "average"], axis=1, inplace=True, errors='ignore')  # Drop unused columns

# Load user ratings dataset
ratings_excel_path = "C:/Users/megdy/Desktop/stat 4830/STAT-4830-vllm-project/two_tower/rating_data.xlsx"
df_ratings = pd.read_excel(ratings_excel_path)
df_ratings.drop(["image", "average"], axis=1, inplace=True, errors='ignore')  # Drop unused columns
df_ratings = df_ratings.set_index(df_items.index)  # Align indices with clothing dataset

# Ensure all ratings are numeric
df_ratings = df_ratings.apply(pd.to_numeric, errors='coerce')

In [34]:
# ==== STEP 2: SET IMPLICIT 10 FOR USERS WHO LIKED THE ITEM ====
for idx, row in df_items.iterrows():
    liked_user = row["user"].strip().lower()  # Get user who originally liked the item
    for col in df_ratings.columns:
        if col.strip().lower() == liked_user:
            df_ratings.at[idx, col] = 10  # Assign rating of 10


In [35]:
# ==== STEP 3: THRESHOLD RATINGS ====
df_ratings = df_ratings.applymap(lambda x: 1 if x >= 7 else -1)
df_ratings = df_ratings.reindex(df_items.index)  # Ensure index alignment


  df_ratings_thresholded = df_ratings.applymap(lambda x: 1 if x >= 7 else -1)


In [36]:
# ==== STEP 4: CREATE FEATURE REPRESENTATIONS ====
# Normalize price
scaler = StandardScaler()
price_scaled = scaler.fit_transform(df_items[["price"]])

# Convert text features to numerical representations
df_items["text_feature"] = df_items["brand"] + " " + df_items["name"] + " " + df_items["description"]

# Convert text features to numerical embeddings using simple hashing (alternative to CLIP)
df_items["text_embedding"] = df_items["text_feature"].apply(lambda x: hash(x) % (10**8))  # Simple integer hash

# Combine all numerical features
text_embs = df_items["text_embedding"].to_numpy().reshape(-1, 1)  # (n, 1)
price_embs = price_scaled  # (n, 1)

# Final item embeddings: Concatenate text hash + price
features = np.hstack([text_embs, price_embs])
item_embeddings = torch.tensor(features, dtype=torch.float32)


In [37]:
# ==== STEP 5: CREATE USER EMBEDDINGS ====
user_embeddings = {}

for user in df_ratings_thresholded.columns:
    user_ratings = df_ratings_thresholded[user].dropna()

    liked_items = user_ratings[user_ratings == 1].index.tolist()
    disliked_items = user_ratings[user_ratings == -1].index.tolist()

    liked_items = [idx - 2 for idx in liked_items if 0 <= idx - 2 < len(item_embeddings)]
    disliked_items = [idx - 2 for idx in disliked_items if 0 <= idx - 2 < len(item_embeddings)]

    if liked_items or disliked_items:
        user_embs = []

        if liked_items:
            user_embs.append(item_embeddings[liked_items].mean(dim=0))

        if disliked_items:
            user_embs.append(-item_embeddings[disliked_items].mean(dim=0))

        user_embeddings[user] = torch.stack(user_embs).mean(dim=0)

In [38]:
# ==== STEP 6: PREPARE TRAINING DATA ====
train_pairs = []

for user in user_embeddings.keys():
    user_ratings = df_ratings_thresholded[user].dropna()

    liked_items = user_ratings[user_ratings == 1].index.tolist()
    disliked_items = user_ratings[user_ratings == -1].index.tolist()

    liked_items = [idx - 2 for idx in liked_items if 0 <= idx - 2 < len(item_embeddings)]
    disliked_items = [idx - 2 for idx in disliked_items if 0 <= idx - 2 < len(item_embeddings)]

    for item in liked_items:
        train_pairs.append((user, item, 1))

    for item in disliked_items:
        train_pairs.append((user, item, -1))


In [39]:
# ==== STEP 7: CREATE PYTORCH DATASET ====
class UserItemDataset(Dataset):
    def __init__(self, train_pairs, user_text_data, user_price_data, item_text_data, item_price_data):
        self.train_pairs = train_pairs
        self.user_text_data = user_text_data
        self.user_price_data = user_price_data
        self.item_text_data = item_text_data
        self.item_price_data = item_price_data

    def __len__(self):
        return len(self.train_pairs)

    def __getitem__(self, idx):
        user, item, label = self.train_pairs[idx]

        # Retrieve user and item features
        user_text_idx = self.user_text_data[user]  # Tokenized text index
        user_price = self.user_price_data[user]

        item_text_idx = self.item_text_data[item]  # Tokenized text index
        item_price = self.item_price_data[item]

        return user_text_idx, user_price, item_text_idx, item_price, torch.tensor(label, dtype=torch.float32)

# Create dataset & dataloader
dataset = UserItemDataset(train_pairs, user_text_embeddings, user_price_embeddings, item_text_embeddings, item_price_embeddings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


NameError: name 'user_text_embeddings' is not defined

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TwoTowerModel(nn.Module):
    def __init__(self, vocab_size=50000, text_dim=128, price_dim=1):
        super(TwoTowerModel, self).__init__()

        # Text Embedding Layer (Learned)
        self.text_embedding = nn.Embedding(vocab_size, text_dim)

        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(text_dim + price_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16)
        )

        # Item Tower
        self.item_tower = nn.Sequential(
            nn.Linear(text_dim + price_dim, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16)
        )

    def forward(self, user_text_idx, user_price, item_text_idx, item_price):
        # Embed text features
        user_text_emb = self.text_embedding(user_text_idx).mean(dim=1)  # Aggregate tokens
        item_text_emb = self.text_embedding(item_text_idx).mean(dim=1)

        # Concatenate price with text embeddings
        user_input = torch.cat([user_text_emb, user_price], dim=1)
        item_input = torch.cat([item_text_emb, item_price], dim=1)

        # Pass through towers
        user_repr = self.user_tower(user_input)
        item_repr = self.item_tower(item_input)

        # Compute similarity
        return F.cosine_similarity(user_repr, item_repr, dim=1)


In [None]:
# ==== STEP 9: TRAIN THE MODEL ====
model = TwoTowerModel()
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()

losses = []
for epoch in range(5):
    epoch_loss = 0
    for user_text_idx, user_price, item_text_idx, item_price, labels in dataloader:
        optimizer.zero_grad()

        # Forward pass: Pass all required inputs
        sim = model(user_text_idx, user_price, item_text_idx, item_price)

        # Convert labels from -1/1 to 0/1 for BCE loss
        labels = (labels + 1) / 2  

        # Compute loss
        loss = loss_fn(sim, labels)

        # Backpropagation
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    losses.append(epoch_loss / len(dataloader))
    print(f"Epoch {epoch+1}, Loss: {losses[-1]:.4f}")


TypeError: TwoTowerModel.forward() missing 2 required positional arguments: 'item_text_idx' and 'item_price'

In [None]:
# ==== STEP 10: RECOMMEND ====
def recommend_items(user_id, top_k=5):
    user_emb = user_embeddings[user_id].unsqueeze(0)
    scores = torch.cosine_similarity(model.user_tower(user_emb), model.item_tower(item_embeddings), dim=1)
    return df_items.iloc[scores.argsort(descending=True)[:top_k]][["brand", "name"]]

