In [15]:
import os
import pandas as pd
import numpy as np
import torch
import logging
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

# Configure the root logger to emit INFO-and-above records, then create the
# module-level logger used by every function in this notebook.
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

# Function for memory management
def manage_memory():
    """Ask PyTorch to empty its CUDA memory cache.

    Delegates to ``torch.cuda.empty_cache()``; takes no arguments and
    returns ``None``.
    """
    release_cuda_cache = torch.cuda.empty_cache
    release_cuda_cache()

# Function to process and convert dataset to a graph format
def process_data_to_graph(dataset_path, processed_file='processed_data.pt'):
    """Convert a user-behavior CSV into a ``HeteroData`` graph and save it.

    The CSV is expected to contain the columns ``User_ID``, ``Product_ID``,
    ``Category_ID``, ``Behavior`` and ``Timestamp``.  The resulting graph has
    three node types (``user``, ``item``, ``category``) and two edge types:

    * ``('user', 'item')`` - one edge per CSV row, carrying ``time`` and
      ``behavior`` edge attributes;
    * ``('item', 'category')`` - one edge per distinct product/category pair.

    Args:
        dataset_path: Path of the CSV file to read.
        processed_file: Destination passed to ``torch.save`` for the graph.
            Defaults to ``'processed_data.pt'``.

    Returns:
        The populated ``HeteroData`` object, or ``None`` when any step fails.
        Failures are logged (with traceback) rather than raised.
    """
    logger.info("Starting to process dataset...")

    try:
        # Load dataset from the given path
        df = pd.read_csv(dataset_path)

        # Map behavior types to integers.  Labels missing from the mapping
        # would otherwise become NaN and silently turn the edge attribute
        # into a float tensor, so they are reported as an error instead.
        behavior_dict = {'PageView': 0, 'AddToCart': 1, 'Buy': 2, 'Favorite': 3}
        behavior_codes = df['Behavior'].map(behavior_dict)
        unmapped = behavior_codes.isna()
        if unmapped.any():
            unknown_labels = sorted(map(str, df.loc[unmapped, 'Behavior'].unique()))
            raise ValueError(f"Unmapped Behavior values: {unknown_labels}")
        behavior_codes = behavior_codes.astype(np.int64)

        # Re-encode each ID column as contiguous integers 0..n-1 (node indices).
        # The 1-D column is passed to np.unique so the inverse is always 1-D,
        # independent of how NumPy shapes the inverse for 2-D input.
        num_entries = {}
        for name in ['User_ID', 'Product_ID', 'Category_ID']:
            unique_ids, node_index = np.unique(df[name].to_numpy(), return_inverse=True)
            df[name] = node_index
            num_entries[name] = unique_ids.shape[0]

        # Create graph data using HeteroData
        data = HeteroData()

        # Assign number of nodes to each entity (user, item, category)
        data['user'].num_nodes = num_entries['User_ID']
        data['item'].num_nodes = num_entries['Product_ID']
        data['category'].num_nodes = num_entries['Category_ID']

        # Construct edges between users and items (one edge per interaction)
        user_index = torch.from_numpy(df['User_ID'].to_numpy())
        item_index = torch.from_numpy(df['Product_ID'].to_numpy())
        data['user', 'item'].edge_index = torch.stack([user_index, item_index], dim=0)
        data['user', 'item'].time = torch.from_numpy(df['Timestamp'].to_numpy())
        data['user', 'item'].behavior = torch.from_numpy(behavior_codes.to_numpy())

        # Construct edges between items and categories (distinct pairs only)
        item_category = df[['Product_ID', 'Category_ID']].drop_duplicates()
        item_index = torch.from_numpy(item_category['Product_ID'].to_numpy())
        category_index = torch.from_numpy(item_category['Category_ID'].to_numpy())
        data['item', 'category'].edge_index = torch.stack([item_index, category_index], dim=0)

        # Save the processed data
        torch.save(data, processed_file)
        logger.info(f"Processed data saved to {processed_file}")

        # Manage memory after processing
        manage_memory()

        return data

    except Exception as e:
        # Best-effort contract: report the failure (including the traceback)
        # and signal it to the caller with None instead of raising.
        logger.exception(f"Error during dataset processing: {e}")
        return None

# Example dataset path, replace with your actual file path
dataset_path = 'UserBehavior_5M_cleaned.csv'

# Process the data and convert it into a graph, but only when the CSV is
# actually present; otherwise report the problem once and leave
# processed_graph as None so later cells can detect the failure.
if os.path.exists(dataset_path):
    processed_graph = process_data_to_graph(dataset_path)
else:
    logger.error(f"Dataset file not found: {dataset_path}")
    processed_graph = None

INFO:__main__:Starting to process dataset...
INFO:__main__:Processed data saved to processed_data.pt


In [16]:
import torch

# Load the processed data.
# The file contains a pickled graph object, not just tensors, so it must be
# loaded with weights_only=False (PyTorch >= 2.6 defaults to True and rejects
# classes that are not allow-listed).  Unpickling can execute arbitrary code:
# only load files you created yourself.
data = torch.load('processed_data.pt', weights_only=False)

# Edge stores used several times below
user_item_edges = data['user', 'item']
item_category_edges = data['item', 'category']

# Print the number of nodes for each type (user, item, category)
print("Number of nodes for each type:")
print(f"Users: {data['user'].num_nodes}")
print(f"Items: {data['item'].num_nodes}")
print(f"Categories: {data['category'].num_nodes}")

# Print the number of edges for each edge type (user-item, item-category)
print("\nNumber of edges for each edge type:")
print(f"User-Item edges: {user_item_edges.edge_index.size(1)}")
print(f"Item-Category edges: {item_category_edges.edge_index.size(1)}")

# Print the 'time' and 'behavior' features
print("\nTime feature:")
print(user_item_edges.time)

print("\nBehavior feature:")
print(user_item_edges.behavior)

Number of nodes for each type:
Users: 49400
Items: 1099089
Categories: 7475

Number of edges for each edge type:
User-Item edges: 4952632
Item-Category edges: 1099297

Time feature:
tensor([1511642462, 1511822512, 1511822555,  ..., 1512276319, 1512276475,
        1512276510])

Behavior feature:
tensor([0, 0, 0,  ..., 1, 0, 0])
