In [None]:
# mock up of how model and loss functions can work and be made into modules

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import ast
import time

# Select the compute device once: CUDA when PyTorch reports it as available,
# otherwise CPU. The training, evaluation and recommendation code further down
# in this file reads this module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# FIX 1: Better user ID parsing that handles the actual data format
def safe_parse_user_ids(user_data):
    """Normalize a raw ``user_id`` cell into a list of user IDs.

    Written for the comma-separated format of the Amazon dataset; adjust the
    string branch for datasets that encode users differently.

    Args:
        user_data: A list of IDs, a comma-separated string, a missing value
            (``None`` / ``NaN``), or any other scalar.

    Returns:
        list: The list itself when a list is given; the stripped, non-empty
        tokens of a string; ``[]`` for a missing value; ``[str(user_data)]``
        for any other scalar.
    """
    # Containers are checked before pd.isna(): on a list pd.isna() answers
    # element-wise, and the truth value of that array is ambiguous
    # (ValueError) as soon as it holds more than one element.
    if isinstance(user_data, list):
        return user_data

    if isinstance(user_data, str):
        # Drop empty tokens produced by stray or trailing commas.
        return [uid.strip() for uid in user_data.split(',') if uid.strip()]

    if pd.isna(user_data):
        return []

    return [str(user_data)]

def create_implicit_feedback_dataset(df, negative_samples_ratio=4):
    """Build (user, product, label) triples for implicit-feedback training.

    Proof-of-concept helper: in the future the datasets are expected to arrive
    already prepared for implicit feedback (at least two versions of each).

    Every (user, product) pair found in ``df`` becomes a positive example
    (label 1). For each user, products the user never interacted with are
    sampled without replacement as negatives (label 0).

    Args:
        df: DataFrame with a ``product_id`` column and a ``user_id`` column
            whose cells are parsed by ``safe_parse_user_ids``.
        negative_samples_ratio: Negatives drawn per positive of a user, capped
            by the number of products the user has not interacted with.

    Returns:
        tuple: ``(all_examples, all_users, all_products)`` where
        ``all_examples`` is a shuffled list of ``(user, product, label)``
        tuples and the other two are lists of the distinct users / products.
    """
    all_users = set()
    all_products = set()
    positive_examples = []
    user_positive_products = {}

    # One pass over the frame collects the vocabularies, the positive examples
    # and the per-user positive sets, so each row is parsed only once.
    for _, row in df.iterrows():
        product_id = row['product_id']
        all_products.add(product_id)

        for user in safe_parse_user_ids(row['user_id']):
            all_users.add(user)
            positive_examples.append((user, product_id, 1))
            user_positive_products.setdefault(user, set()).add(product_id)

    all_users = list(all_users)
    all_products = list(all_products)

    print(f"Total users: {len(all_users)}")
    print(f"Total products: {len(all_products)}")
    print(f"Positive examples: {len(positive_examples)}")

    negative_examples = []
    for user in all_users:
        positive_set = user_positive_products.get(user, set())
        negative_candidates = [p for p in all_products if p not in positive_set]

        if not negative_candidates or not positive_set:
            continue

        # int() keeps a fractional ratio usable as a sample size.
        n_negative = min(
            int(len(positive_set) * negative_samples_ratio),
            len(negative_candidates),
        )
        if n_negative <= 0:
            continue

        # Sample positions rather than the IDs themselves: handing the ID list
        # to NumPy would convert it to an array and change the IDs' types.
        sampled_positions = np.random.choice(
            len(negative_candidates),
            size=n_negative,
            replace=False
        )
        negative_examples.extend(
            (user, negative_candidates[pos], 0) for pos in sampled_positions
        )

    print(f"Negative examples: {len(negative_examples)}")

    # Combine and shuffle in place.
    all_examples = positive_examples + negative_examples
    np.random.shuffle(all_examples)

    return all_examples, all_users, all_products

# FIX 3: Improved model from base model with better initialization and regularization
class ImprovedRecommender(nn.Module):
    """Hybrid recommender: learned user/product embeddings plus text features.

    The user embedding, product embedding and a 64-d linear projection of the
    product's precomputed BERT vector are concatenated and passed through an
    MLP that ends in a sigmoid, so every score lies in [0, 1].

    Args:
        num_users: Number of rows in the user embedding table.
        num_products: Number of rows in the product embedding table.
        embedding_dim: Width of both embedding tables.
        bert_dim: Width of the incoming BERT feature vectors.
    """

    def __init__(self, num_users, num_products, embedding_dim=64, bert_dim=128):
        super().__init__()

        # User and product embeddings, Xavier-uniform initialised.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.product_embedding = nn.Embedding(num_products, embedding_dim)
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.product_embedding.weight)

        # BERT feature projection
        self.bert_projection = nn.Linear(bert_dim, 64)

        # Two embeddings plus the 64-d projected text features.
        combined_dim = embedding_dim * 2 + 64

        # simple MLP architecture
        self.mlp = nn.Sequential(
            nn.Linear(combined_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, user_idx, product_idx, bert_features):
        """Score a batch of (user, product) pairs.

        Args:
            user_idx: Long tensor of user indices, shape ``(batch,)``.
            product_idx: Long tensor of product indices, shape ``(batch,)``.
            bert_features: Float tensor of shape ``(batch, bert_dim)``.

        Returns:
            Tensor of shape ``(batch,)`` with one sigmoid score per pair.
        """
        user_emb = self.user_embedding(user_idx)
        product_emb = self.product_embedding(product_idx)
        bert_proj = self.bert_projection(bert_features)

        combined = torch.cat([user_emb, product_emb, bert_proj], dim=1)
        prediction = self.mlp(combined)

        # Drop only the trailing feature axis. A bare squeeze() would also
        # collapse a batch of one to a 0-d scalar, which no longer lines up
        # with a (1,)-shaped label tensor and cannot be iterated.
        return prediction.squeeze(-1)

# FIX 4: Improved BPR loss implementation... currently still playing with this as it is not up to par yet
class ImprovedBPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss: ``-mean(log(sigmoid(pos - neg)))``.

    No margin is applied; the loss only rewards positives that score higher
    than the negatives they are paired with.
    """

    def __init__(self):
        super().__init__()

    def forward(self, positive_scores, negative_scores):
        """Return the mean BPR loss over element-wise paired scores.

        Args:
            positive_scores: Scores of the positive items.
            negative_scores: Scores of the negative items, broadcastable
                against ``positive_scores``.

        Returns:
            Scalar tensor holding the mean loss.
        """
        diff = positive_scores - negative_scores
        # logsigmoid evaluates log(sigmoid(x)) in one numerically stable step.
        # The previous log(sigmoid(x) + 1e-10) form capped the loss near 23
        # and its gradient vanished for badly ranked pairs, exactly the pairs
        # that need the strongest correction.
        return -nn.functional.logsigmoid(diff).mean()

# FIX 5: Better training function with proper pair sampling for BPR... still playing around with this but this is just a "proof of concept" will work on this over break
def improved_train_model(model, train_loader, loss_fn, optimizer, num_epochs=5, loss_name="BCE"):
    """Train ``model`` with either a pointwise (BCE) or a pairwise (BPR) loss.

    Proof of concept. In "BPR" mode the positives and negatives that happen to
    share a mini-batch are randomly paired up; a batch that lacks either class
    is skipped. Any other ``loss_name`` calls ``loss_fn(predictions, labels)``.

    Args:
        model: Module called as ``model(user_idx, product_idx, bert_features)``.
        train_loader: Iterable of dict batches with the keys ``'user_idx'``,
            ``'product_idx'``, ``'bert_features'`` and ``'label'``.
        loss_fn: Loss callable matching ``loss_name``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        loss_name: ``"BPR"`` selects pair sampling; anything else is pointwise.

    Returns:
        The trained model (the same object, moved to the module-level
        ``device``).
    """
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        batch_count = 0

        for batch_idx, batch in enumerate(train_loader):
            user_idx = batch['user_idx'].to(device)
            product_idx = batch['product_idx'].to(device)
            bert_features = batch['bert_features'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            predictions = model(user_idx, product_idx, bert_features)

            if loss_name == "BPR":
                positive_mask = labels == 1
                negative_mask = labels == 0

                # A pairwise loss needs at least one example of each class.
                if not (positive_mask.any() and negative_mask.any()):
                    continue

                positive_scores = predictions[positive_mask]
                negative_scores = predictions[negative_mask]

                # Randomly pair as many positives and negatives as the
                # smaller group allows.
                pair_count = min(len(positive_scores), len(negative_scores))
                pos_indices = torch.randperm(len(positive_scores))[:pair_count]
                neg_indices = torch.randperm(len(negative_scores))[:pair_count]

                loss = loss_fn(positive_scores[pos_indices], negative_scores[neg_indices])
            else:
                # BCE loss
                loss = loss_fn(predictions, labels)

            loss.backward()
            # Clip the global gradient norm to 1.0 before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()
            batch_count += 1

            if batch_idx % 20 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}')

        # batch_count only counts batches that actually produced a loss.
        if batch_count > 0:
            avg_loss = epoch_loss / batch_count
            print(f'Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}')
        else:
            print(f'Epoch {epoch+1} completed. No batches processed.')

    return model

# FIX 6: Create balanced dataset... we won't be creating datasets but again this is just for proof of concept
def create_balanced_dataset(implicit_data, max_negative_ratio=2):
    """Cap the number of negatives relative to the number of positives.

    All positive examples are kept; negatives are randomly down-sampled
    (without replacement) to at most ``max_negative_ratio`` per positive.

    Args:
        implicit_data: Sequence of ``(user, product, label)`` tuples whose
            label is 1 (positive) or 0 (negative).
        max_negative_ratio: Upper bound on negatives per positive. Fractional
            values are accepted and truncated to a whole example count.

    Returns:
        list: Shuffled list of the kept positive and negative tuples.
    """
    positive_examples = [ex for ex in implicit_data if ex[2] == 1]
    negative_examples = [ex for ex in implicit_data if ex[2] == 0]

    print(f"Original - Positive: {len(positive_examples)}, Negative: {len(negative_examples)}")

    # int() keeps a fractional ratio valid as a sample size.
    max_negatives = min(
        len(negative_examples),
        int(len(positive_examples) * max_negative_ratio),
    )

    if max_negatives > 0:
        kept_positions = np.random.choice(
            len(negative_examples),
            size=max_negatives,
            replace=False
        )
        balanced_negatives = [negative_examples[i] for i in kept_positions]
    else:
        # Nothing to sample; skip the NumPy call entirely.
        balanced_negatives = []

    balanced_data = positive_examples + balanced_negatives
    np.random.shuffle(balanced_data)

    print(f"Balanced - Positive: {len(positive_examples)}, Negative: {len(balanced_negatives)}")

    return balanced_data

print("=== IMPROVED SETUP ===")

# Build the (user, product, label) triples, then cap negatives at 2 per positive.
implicit_data, all_users, all_products = create_implicit_feedback_dataset(df)
balanced_data = create_balanced_dataset(implicit_data, max_negative_ratio=2)

# Contiguous integer ids, used as row indices of the embedding tables.
user_to_idx = dict(zip(all_users, range(len(all_users))))
product_to_idx = dict(zip(all_products, range(len(all_products))))

print(f"User mapping size: {len(user_to_idx)}")
print(f"Product mapping size: {len(product_to_idx)}")

# One text per product: name, description and category joined by spaces.
# When a product id occurs in several rows, the last row wins.
print("Preparing product texts...")
product_texts = {
    row['product_id']: f"{row.get('product_name', '')} {row.get('about_product', '')} {row.get('category', '')}"
    for _, row in df.iterrows()
}

# Small BERT checkpoint, switched to eval mode for feature extraction.
print("Loading BERT model...")
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
bert_model = BertModel.from_pretrained('prajjwal1/bert-tiny')
bert_model.eval()

# Encode every product text once, 16 at a time, and keep the pooled output
# per product id as a NumPy vector.
print("Precomputing BERT embeddings...")
bert_embeddings = {}
batch_size = 16

product_ids = list(product_texts)
for batch_number, start in enumerate(range(0, len(product_ids), batch_size)):
    batch_ids = product_ids[start:start + batch_size]
    batch_texts = [product_texts[pid] for pid in batch_ids]

    encoding = tokenizer(
        batch_texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = bert_model(**encoding)
        batch_embeddings = outputs.pooler_output

    for pid, pooled in zip(batch_ids, batch_embeddings):
        bert_embeddings[pid] = pooled.numpy()

    # Progress line on every tenth batch.
    if batch_number % 10 == 0:
        print(f"Processed {start + len(batch_ids)}/{len(product_ids)} products")

print(f"Precomputed embeddings for {len(bert_embeddings)} products")

# Dataset class
class FastAmazonDataset(Dataset):
    """Map-style dataset over (user, product, label) triples.

    Each item pairs the integer indices of the user and product with the
    product's precomputed BERT vector and the float label.

    Args:
        data: Sequence of ``(user, product, label)`` tuples.
        bert_embeddings: Mapping from product id to its embedding vector.
        user_to_idx: Mapping from user id to embedding-table index.
        product_to_idx: Mapping from product id to embedding-table index.
        bert_dim: Length of the all-zero vector substituted for products that
            have no entry in ``bert_embeddings``.
    """

    def __init__(self, data, bert_embeddings, user_to_idx, product_to_idx, bert_dim=128):
        self.data = data
        self.bert_embeddings = bert_embeddings
        self.user_to_idx = user_to_idx
        self.product_to_idx = product_to_idx
        self.bert_dim = bert_dim

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a dict.

        Raises:
            KeyError: If the user or product has no index mapping.
        """
        user, product, label = self.data[idx]

        user_idx = self.user_to_idx[user]
        product_idx = self.product_to_idx[product]

        # Only build the zero fallback when the product really has no vector.
        bert_embedding = self.bert_embeddings.get(product)
        if bert_embedding is None:
            bert_embedding = np.zeros(self.bert_dim)

        return {
            'user_idx': torch.tensor(user_idx, dtype=torch.long),
            'product_idx': torch.tensor(product_idx, dtype=torch.long),
            'bert_features': torch.tensor(bert_embedding, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Split balanced data: 20% held out for testing, fixed seed for a repeatable split
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)

# Create datasets (both share the same embedding cache and index mappings)
train_dataset = FastAmazonDataset(train_data, bert_embeddings, user_to_idx, product_to_idx)
test_dataset = FastAmazonDataset(test_data, bert_embeddings, user_to_idx, product_to_idx)

# Create data loaders; only the training loader reshuffles.
# NOTE: this rebinds `batch_size`, which held 16 during BERT precomputation.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

# Model sizing: embedding tables get one row per mapped user / product
num_users = len(user_to_idx)
num_products = len(product_to_idx)
bert_dim = 128

print(f"Model parameters: {num_users} users, {num_products} products")

# Loss functions to compare; the key is passed to the trainer as `loss_name`
loss_functions = {
    "BCE": nn.BCELoss(),
    "BPR": ImprovedBPRLoss()
}

# loss name -> {'model': trained model, 'training_time': seconds}
results = {}

print("\n=== IMPROVED TRAINING ===")

for loss_name, loss_fn in loss_functions.items():
    print(f"\n=== Training with {loss_name} Loss ===")
    start_time = time.time()

    # A fresh model and optimizer per loss so the runs do not share weights
    model = ImprovedRecommender(num_users, num_products, embedding_dim=64, bert_dim=bert_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    # Train with improved function
    model = improved_train_model(model, train_loader, loss_fn, optimizer, num_epochs=5, loss_name=loss_name)

    # Wall-clock time of this run, measured from just before model construction
    training_time = time.time() - start_time
    print(f"{loss_name} training completed in {training_time:.2f} seconds")

    results[loss_name] = {
        'model': model,
        'training_time': training_time
    }

# Evaluation function... this part will be more in depth according to what the best evaluation techniques are from the literature review
def evaluate_model(model, test_loader, device):
    """Collect the model's scores and the true labels over ``test_loader``.

    Baseline evaluation helper; it puts the model in eval mode and runs with
    gradients disabled.

    Args:
        model: Module called as ``model(user_idx, product_idx, bert_features)``.
            It is expected to already live on ``device``.
        test_loader: Iterable of dict batches with the keys ``'user_idx'``,
            ``'product_idx'``, ``'bert_features'`` and ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(predictions, labels)`` as 1-D NumPy arrays with one entry
        per evaluated example.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            user_idx = batch['user_idx'].to(device)
            product_idx = batch['product_idx'].to(device)
            bert_features = batch['bert_features'].to(device)
            labels = batch['label'].to(device)

            predictions = model(user_idx, product_idx, bert_features)

            # reshape(-1) guarantees a 1-D view: a model that squeezes its
            # output returns a 0-d tensor for a batch of one, and a 0-d array
            # cannot be iterated by extend().
            all_predictions.extend(predictions.reshape(-1).cpu().numpy())
            all_labels.extend(labels.reshape(-1).cpu().numpy())

    return np.array(all_predictions), np.array(all_labels)

print("\n=== IMPROVED EVALUATION ===")
# loss name -> dict of the metrics computed below
evaluation_results = {}

for loss_name, result in results.items():
    model = result['model']
    predictions, labels = evaluate_model(model, test_loader, device)

    # Baseline metrics; further evaluation metrics from the literature review can be added alongside.
    # Scores above 0.5 count as a positive prediction for the thresholded metrics.
    binary_predictions = (predictions > 0.5).astype(int)
    accuracy = accuracy_score(labels, binary_predictions)
    precision = precision_score(labels, binary_predictions, zero_division=0)
    recall = recall_score(labels, binary_predictions, zero_division=0)
    f1 = f1_score(labels, binary_predictions, zero_division=0)
    # AUC is computed from the raw scores, not the thresholded ones.
    # NOTE(review): roc_auc_score presumably fails when `labels` holds a single class - confirm.
    auc = roc_auc_score(labels, predictions)

    evaluation_results[loss_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    }

    print(f"\n{loss_name} Loss:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  AUC: {auc:.4f}")


# Recommendation function... this will be the "engine" behind the LLM model that will generate/provide the recommendations to the LLM model to give back to the user
def get_recommendations(model, user_id, bert_embeddings, user_to_idx, product_to_idx, top_k=5):
    """Return the ``top_k`` highest-scoring products for one user.

    Intended as the engine that supplies candidate recommendations to the LLM
    layer. Every product that has both a BERT vector and an index mapping is
    scored; products the user already interacted with are not filtered out.

    Args:
        model: Trained recommender, expected on the module-level ``device``.
        user_id: Raw user id to recommend for.
        bert_embeddings: Mapping from product id to its embedding vector.
        user_to_idx: Mapping from user id to embedding-table index.
        product_to_idx: Mapping from product id to embedding-table index.
        top_k: Maximum number of recommendations to return.

    Returns:
        list: Dicts with ``'product_id'``, ``'product_name'`` and ``'score'``,
        ordered from highest to lowest score. Empty when the user is unknown
        or there is nothing to score.
    """
    model.eval()

    if user_id not in user_to_idx:
        print(f"User {user_id} not found")
        return []

    candidate_ids = [pid for pid in bert_embeddings if pid in product_to_idx]
    if not candidate_ids:
        return []

    # Score all candidates in a single forward pass instead of one model call
    # per product.
    user_tensor = torch.full(
        (len(candidate_ids),), user_to_idx[user_id], dtype=torch.long, device=device
    )
    product_tensor = torch.tensor(
        [product_to_idx[pid] for pid in candidate_ids], dtype=torch.long, device=device
    )
    bert_tensor = torch.tensor(
        np.stack([np.asarray(bert_embeddings[pid]) for pid in candidate_ids]),
        dtype=torch.float,
        device=device,
    )

    with torch.no_grad():
        scores = model(user_tensor, product_tensor, bert_tensor).reshape(-1).cpu().numpy()

    # Plain top-k by score, highest first.
    top_indices = np.argsort(scores)[::-1][:top_k]

    recommendations = []
    for idx in top_indices:
        product_id = candidate_ids[idx]
        # Look up display names only for the winners; scanning the frame for
        # every candidate made scoring quadratic in the catalogue size.
        product_row = df[df['product_id'] == product_id]
        product_name = product_row['product_name'].iloc[0] if len(product_row) > 0 else "Unknown"
        recommendations.append({
            'product_id': product_id,
            'product_name': product_name,
            'score': float(scores[idx])
        })

    return recommendations

# Test with a single valid user: the first mapped user whose id is longer than 10 characters.
# NOTE(review): len(user) assumes every user id is a string - confirm against the parser.
valid_users = [user for user in user_to_idx.keys() if len(user) > 10]  # Filter out malformed user IDs
if valid_users:
    sample_user = valid_users[0]
    print(f"\n=== RECOMMENDATIONS FOR USER {sample_user} ===")

    # Show the top 3 products from each trained model for the same user
    for loss_name, result in results.items():
        recommendations = get_recommendations(
            result['model'], sample_user, bert_embeddings, user_to_idx, product_to_idx, top_k=3
        )

        print(f"\n{loss_name} Loss Recommendations:")
        for i, rec in enumerate(recommendations, 1):
            # Product names are cut to 40 characters for display
            print(f"  {i}. {rec['product_name'][:40]}... (Score: {rec['score']:.4f})")

# Side-by-side summary of the headline metrics per loss function
print("\n" + "="*50)
print("FINAL MODEL COMPARISON")
print("="*50)
for loss_name, metrics in evaluation_results.items():
    print(f"\n{loss_name}:")
    print(f"  AUC: {metrics['auc']:.4f} | F1: {metrics['f1_score']:.4f} | Accuracy: {metrics['accuracy']:.4f}")

print("\n=== TRAINING COMPLETED ===")

Using device: cpu
=== IMPROVED SETUP ===
Total users: 9050
Total products: 1351
Positive examples: 11503
Negative examples: 42416
Original - Positive: 11503, Negative: 42416
Balanced - Positive: 11503, Negative: 23006
User mapping size: 9050
Product mapping size: 1351
Preparing product texts...
Loading BERT model...
Precomputing BERT embeddings...
Processed 16/1351 products
Processed 176/1351 products
Processed 336/1351 products
Processed 496/1351 products
Processed 656/1351 products
Processed 816/1351 products
Processed 976/1351 products
Processed 1136/1351 products
Processed 1296/1351 products
Precomputed embeddings for 1351 products
Train batches: 863
Test batches: 216
Model parameters: 9050 users, 1351 products

=== IMPROVED TRAINING ===

=== Training with BCE Loss ===
Epoch 1, Batch 0/863, Loss: 0.7017
Epoch 1, Batch 20/863, Loss: 0.6223
Epoch 1, Batch 40/863, Loss: 0.6599
Epoch 1, Batch 60/863, Loss: 0.6340
Epoch 1, Batch 80/863, Loss: 0.6654
Epoch 1, Batch 100/863, Loss: 0.5236
