In [3]:
import os
import pandas as pd
import numpy as np
import torch
import logging
from datetime import datetime
from collections import defaultdict
import itertools
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from torch_sparse import coalesce

# Set up logging: configure the root logger to emit INFO and above, then
# create the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def manage_memory():
    """Ask PyTorch to release the unused memory held in its CUDA cache."""
    cuda_backend = torch.cuda
    cuda_backend.empty_cache()

def process_data_to_homogeneous_graph(dataset_path: str, selected_day: str = '2017-11-28'):
    """Build a homogeneous item-item graph from one day of interaction logs.

    The CSV at ``dataset_path`` is expected to provide the columns
    ``Timestamp`` (parsed as Unix seconds), ``User_ID``, ``Product_ID``,
    ``Category_ID`` and ``Behavior``. Rows whose timestamp falls on
    ``selected_day`` are kept; every product seen that day becomes a node.

    * Edges: two items are connected (in both directions) when at least one
      user interacted with both of them on that day.
    * Labels: ``1`` when the item has at least one ``'Buy'`` row, else ``0``.
    * Features: a single column holding the item's interaction count.
    * Masks: a stratified 60/20/20 train/val/test split with a fixed seed.

    The resulting graph is also written to ``filtered_graph_<selected_day>.pt``
    in the current working directory.

    Args:
        dataset_path: Path to the interaction CSV.
        selected_day: Day to keep, formatted as ``YYYY-MM-DD``.

    Returns:
        The ``torch_geometric.data.Data`` object, or ``None`` when any step
        fails (the error is logged together with its traceback).
    """
    logger.info("Processing to homogeneous graph...")

    try:
        # Load dataset
        df = pd.read_csv(dataset_path)
        # NOTE(review): unit='s' yields naive datetimes, presumably UTC --
        # confirm this matches the time zone the day boundaries should use.
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

        # Filter only for selected date. Keep the parsed date in its own local
        # instead of rebinding the `selected_day` parameter to another type.
        target_date = datetime.strptime(selected_day, '%Y-%m-%d').date()
        # .copy() makes the filtered frame independent, so the column
        # assignments below never write into a slice of the full frame.
        df = df[df['Timestamp'].dt.date == target_date].copy()

        # Fail fast before computing any statistics on an empty frame.
        if df.empty:
            raise ValueError(f"No data found for selected date: {target_date}")

        num_users = df['User_ID'].nunique()
        logger.info(f"  Number of unique users     : {num_users}")
        logger.info(f"Selected {len(df)} interactions on {target_date}")

        # Encode behaviors
        behavior_dict = {'PageView': 0, 'AddToCart': 1, 'Buy': 2, 'Favorite': 3}
        df['Behavior'] = df['Behavior'].map(behavior_dict)

        # Values missing from behavior_dict become NaN; surface that instead
        # of letting them vanish silently from the label computation.
        num_unmapped = int(df['Behavior'].isna().sum())
        if num_unmapped:
            logger.warning(
                f"{num_unmapped} rows have a missing or unrecognised Behavior value"
            )

        # Encode IDs as dense integer codes (0..n-1 in order of first appearance)
        for col in ['User_ID', 'Product_ID', 'Category_ID']:
            df[col], _ = pd.factorize(df[col])

        num_items = df['Product_ID'].nunique()

        # === Step 1: Create item-item edges via co-interaction (same user interacted with both items)
        user_to_items = defaultdict(set)
        for u, i in zip(df['User_ID'], df['Product_ID']):
            user_to_items[u].add(i)

        edge_list = []
        for items in user_to_items.values():
            for i1, i2 in itertools.combinations(items, 2):
                # Store both directions so the graph is undirected.
                edge_list.append((i1, i2))
                edge_list.append((i2, i1))

        if edge_list:
            edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
            # coalesce sorts the edges and removes duplicates
            edge_index, _ = coalesce(edge_index, None, num_items, num_items)
        else:
            # No user touched two different items: build a correctly shaped
            # empty edge index rather than a 1-D tensor that cannot be split
            # into source/target rows.
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # === Step 2: Create node labels (item.y): 1 if the item was bought
        item_labels = torch.zeros(num_items, dtype=torch.long)
        buy_items = df[df['Behavior'] == behavior_dict['Buy']]['Product_ID'].unique()
        item_labels[torch.tensor(buy_items)] = 1

        # === Step 3: Create node features (interaction count per item)
        # sort_index() orders the counts by item code so row k describes node k.
        item_degree = df['Product_ID'].value_counts().sort_index()
        item_degree = torch.tensor(item_degree.values, dtype=torch.float).unsqueeze(1)  # shape [N, 1]

        # === Step 4: Train/val/test masks (60/20/20, stratified on the label)
        all_items = torch.arange(num_items)
        train_idx, temp_idx = train_test_split(
            all_items, test_size=0.4, random_state=42, stratify=item_labels
        )
        val_idx, test_idx = train_test_split(
            temp_idx, test_size=0.5, random_state=42, stratify=item_labels[temp_idx]
        )

        train_mask = torch.zeros(num_items, dtype=torch.bool)
        val_mask = torch.zeros(num_items, dtype=torch.bool)
        test_mask = torch.zeros(num_items, dtype=torch.bool)
        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True

        # === Step 5: Build final Data object
        data = Data(
            x=item_degree,
            edge_index=edge_index,
            y=item_labels,
            train_mask=train_mask,
            val_mask=val_mask,
            test_mask=test_mask,
        )

        # === Step 6: Print graph statistics ===
        num_nodes = data.num_nodes
        num_edges = data.num_edges
        num_features = data.num_node_features
        num_classes = int(item_labels.max().item()) + 1
        class_counts = torch.bincount(item_labels)

        logger.info("Graph Summary:")
        logger.info(f"  Number of item nodes     : {num_nodes}")
        logger.info(f"  Number of item-item edges: {num_edges}")
        logger.info(f"  Number of node features  : {num_features}")
        logger.info(f"  Number of classes        : {num_classes}")
        logger.info(f"  Class distribution       : {class_counts.tolist()}")
        logger.info(f"  Train/Val/Test splits    : {train_mask.sum().item()}/{val_mask.sum().item()}/{test_mask.sum().item()}")

        output_path = f'filtered_graph_{target_date}.pt'
        torch.save(data, output_path)
        logger.info(f"Saved homogeneous graph: {output_path}")

        manage_memory()
        return data

    except Exception as e:
        # Callers rely on None signalling failure, so the error is not
        # re-raised; logger.exception keeps the traceback for debugging.
        logger.exception(f"Processing error: {e}")
        return None

# === Run the processing ===
# Build the graph only when the cleaned CSV is present; otherwise log the
# missing path and do nothing else.
dataset_path = 'UserBehavior_5M_cleaned.csv'
if not os.path.exists(dataset_path):
    logger.error(f"Dataset not found at {dataset_path}")
else:
    processed_data = process_data_to_homogeneous_graph(
        dataset_path,
        selected_day='2017-11-28',
    )


INFO:__main__:Processing to homogeneous graph...


INFO:__main__:  Number of unique users     : 35360
INFO:__main__:Selected 490128 interactions on 2017-11-28
INFO:__main__:Graph Summary:
INFO:__main__:  Number of item nodes     : 253988
INFO:__main__:  Number of item-item edges: 11057360
INFO:__main__:  Number of node features  : 1
INFO:__main__:  Number of classes        : 2
INFO:__main__:  Class distribution       : [244298, 9690]
INFO:__main__:  Train/Val/Test splits    : 152392/50798/50798
INFO:__main__:Saved homogeneous graph: filtered_graph_2017-11-28.pt
