## Helper Functions

In [4]:
import tensorflow as tf
import numpy as np
import multiprocessing as mp
from scipy import sparse
import random
import os
import pickle
from tqdm import tqdm
import time

def compute_rmse(dataset, model, batch_size=1000):
    """
    Computes RMSE for the entire dataset using the provided model.
    
    Args:
    - dataset (list): List of subgraphs for evaluation.
    - model (IGMC): The trained IGMC model.
    
    Returns:
    - RMSE (float): Root Mean Squared Error for the dataset.
    """
    # Iterate over batches
    num_batches = len(dataset) // batch_size
    
    total_squared_error = 0
    
    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = start_idx + batch_size
        batch_subgraphs = dataset[start_idx:end_idx]
        
        # Batch the entire dataset
        processed_data = pyg_batching(batch_subgraphs)

        # Predict ratings for the entire dataset
        H_concat = model([processed_data['x'], processed_data['edge_index'], processed_data['edge_type'], processed_data['batch']])
        predicted_ratings_all = model.mlp(H_concat)

        # Extract the predicted ratings for the central nodes
#         unique_values, _ = tf.unique(processed_data['batch'])
#         central_node_indices = tf.stack([tf.where(processed_data['batch'] == val)[0][0] for val in unique_values])
#         predicted_ratings_central = tf.gather(predicted_ratings_all, central_node_indices)
#         predicted_ratings_central = tf.squeeze(predicted_ratings_central, axis=-1)

        # Compute the squared error for the entire dataset
        squared_error = tf.math.square(predicted_ratings_all - processed_data['y'])
        
        total_squared_error += tf.reduce_sum(squared_error).numpy()
        
    rmse = np.sqrt(total_squared_error / (num_batches * batch_size))

    return rmse

def parallel_func(args):
    return construct_tf_graph(*subgraph_extraction_labeling_tf(*args))

def apply_edge_dropout(subgraph, dropout_prob=0.2):

    edge_index_np = subgraph['edge_index'].numpy()
    edge_type_np = subgraph['edge_type'].numpy()

    # Generate a mask for edges to keep based on the dropout probability
    mask = np.random.rand(edge_index_np.shape[1]) > dropout_prob

    # Apply the mask to the edge_index and edge_type arrays
    edge_index_np = edge_index_np[:, mask]
    edge_type_np = edge_type_np[mask]

    subgraph['edge_index'] = tf.convert_to_tensor(edge_index_np, dtype=tf.int64)
    subgraph['edge_type'] = tf.convert_to_tensor(edge_type_np, dtype=tf.int64)

    return subgraph


class MyDataset:
    def __init__(self, Arow, Acol, links, labels, h=1, sample_ratio=1.0, max_nodes_per_hop=None, processed_file_path='./processed_files/data', parallel=False):
        self.Arow = Arow
        self.Acol = Acol
        self.links = links
        self.labels = labels
        self.h = h
        self.sample_ratio = sample_ratio
        self.max_nodes_per_hop = max_nodes_per_hop
        self.processed_file_path = processed_file_path
        self.parallel = parallel

    def links2subgraphs(self):
        
        print('Enclosing subgraph extraction begins...')
        
        # Define the arguments for each function call in the parallel processing
        args = [
            ((i, j), self.Arow, self.Acol, g_label, self.h, self.sample_ratio, self.max_nodes_per_hop)
                for i, j, g_label in zip(self.links[0], self.links[1], self.labels)
        ]
        
        
        if not self.parallel:
            g_list = []
            with tqdm(total=len(self.links[0])) as pbar:
                for i, j, g_label in zip(self.links[0], self.links[1], self.labels):
                    tmp = subgraph_extraction_labeling_tf(
                        (i, j), self.Arow, self.Acol, g_label, self.h, self.sample_ratio, self.max_nodes_per_hop
                    )
                    data = construct_tf_graph(*tmp)

                    g_list.append(data)
                    pbar.update(1)

            return g_list
        # else:
        #     start = time.time()
        #     with mp.Pool(mp.cpu_count()) as pool:
        #         results = pool.map_async(parallel_func, args)

        #         remaining = len(args)
        #         pbar = tqdm(total=remaining)
        #         while not results.ready():
        #             remaining_new = results._number_left
        #             pbar.update(remaining - remaining_new)
        #             remaining = remaining_new
        #             time.sleep(1)

        #         results = results.get()
        #         pool.close()

        #     pbar.close()
        #     end = time.time()
        #     print(f"Time elapsed for subgraph extraction: {end-start}s")

        #     return results
    
    def process(self):
        # Extract enclosing subgraphs
        data_list = self.links2subgraphs()
        
        # # Serialize and save the data_list using pickle
        # os.makedirs(os.path.dirname(self.processed_file_path), exist_ok=True)
        # with open(self.processed_file_path, 'wb') as f:
        #     pickle.dump(data_list, f)
            
        # del data_list
        return data_list

    def load(self):
        
        with open(self.processed_file_path, 'rb') as f:
            loaded_data_list = pickle.load(f)
        
        return loaded_data_list

class SparseRowIndexer:
    def __init__(self, csr_matrix):
        data = []
        indices = []
        indptr = []

        for row_start, row_end in zip(csr_matrix.indptr[:-1], csr_matrix.indptr[1:]):
            data.append(csr_matrix.data[row_start:row_end])
            indices.append(csr_matrix.indices[row_start:row_end])
            indptr.append(row_end - row_start)  # nnz of the row

        self.data = np.array(data, dtype=object)
        self.indices = np.array(indices, dtype=object)
        self.indptr = np.array(indptr, dtype=object)
        self.shape = csr_matrix.shape

    def __getitem__(self, row_selector):
        indices = np.concatenate(self.indices[row_selector])
        data = np.concatenate(self.data[row_selector])
        indptr = np.append(0, np.cumsum(self.indptr[row_selector]))
        shape = [indptr.shape[0] - 1, self.shape[1]]
        return sparse.csr_matrix((data, indices, indptr), shape=shape)

class SparseColIndexer:
    def __init__(self, csc_matrix):
        data = []
        indices = []
        indptr = []

        for col_start, col_end in zip(csc_matrix.indptr[:-1], csc_matrix.indptr[1:]):
            data.append(csc_matrix.data[col_start:col_end])
            indices.append(csc_matrix.indices[col_start:col_end])
            indptr.append(col_end - col_start)

        self.data = np.array(data, dtype=object)
        self.indices = np.array(indices, dtype=object)
        self.indptr = np.array(indptr, dtype=object)
        self.shape = csc_matrix.shape

    def __getitem__(self, col_selector):
        indices = np.concatenate(self.indices[col_selector])
        data = np.concatenate(self.data[col_selector])
        indptr = np.append(0, np.cumsum(self.indptr[col_selector]))

        shape = [self.shape[0], indptr.shape[0] - 1]
        return sparse.csc_matrix((data, indices, indptr), shape=shape)


def one_hot(indices, depth):
    return tf.one_hot(indices, depth)

def construct_tf_graph(u, v, r, node_labels, max_node_label, y):
    u, v = tf.convert_to_tensor(u, dtype=tf.float32), tf.convert_to_tensor(v, dtype=tf.float32)
    r = tf.convert_to_tensor(r, dtype=tf.float32)
    edge_index = tf.stack([tf.concat([u, v], axis=0), tf.concat([v, u], axis=0)], axis=0)
    edge_type = tf.concat([r, r], axis=0)
    x = tf.cast(one_hot(node_labels, max_node_label+1), dtype=tf.float32)
    y = tf.convert_to_tensor([y], dtype=tf.float32)
    
    data = {
        'x': x,
        'edge_index': edge_index,
        'edge_type': edge_type,
        'y': y
    }

    return data

def neighbors(fringe, A):
    # find all 1-hop neighbors of nodes in fringe from A
    if not fringe:
        return set([])
    return set(A[list(fringe)].indices)

def subgraph_extraction_labeling_tf(ind, Arow, Acol, y, h=1, sample_ratio=1.0, max_nodes_per_hop=None):
    # extract the h-hop enclosing subgraph around link 'ind'
    u_nodes, v_nodes = [ind[0]], [ind[1]]
    u_dist, v_dist = [0], [0]
    u_visited, v_visited = set([ind[0]]), set([ind[1]])
    u_fringe, v_fringe = set([ind[0]]), set([ind[1]])
    
    for dist in range(1, h+1):
        v_fringe, u_fringe = neighbors(u_fringe, Arow), neighbors(v_fringe, Acol)
        u_fringe = u_fringe - u_visited
        v_fringe = v_fringe - v_visited
        u_visited = u_visited.union(u_fringe)
        v_visited = v_visited.union(v_fringe)
        if sample_ratio < 1.0:
            u_fringe = random.sample(u_fringe, int(sample_ratio*len(u_fringe)))
            v_fringe = random.sample(v_fringe, int(sample_ratio*len(v_fringe)))
        if max_nodes_per_hop is not None:
            if max_nodes_per_hop < len(u_fringe):
                u_fringe = random.sample(u_fringe, max_nodes_per_hop)
            if max_nodes_per_hop < len(v_fringe):
                v_fringe = random.sample(v_fringe, max_nodes_per_hop)
        if len(u_fringe) == 0 and len(v_fringe) == 0:
            break
        u_nodes = u_nodes + list(u_fringe)
        v_nodes = v_nodes + list(v_fringe)
        u_dist = u_dist + [dist] * len(u_fringe)
        v_dist = v_dist + [dist] * len(v_fringe)
        
    subgraph = Arow[u_nodes][:, v_nodes]
    subgraph[0, 0] = 0
    
    # prepare pyg graph constructor input
    u, v, r = sparse.find(subgraph)  # r is 1, 2... (rating labels + 1)
    v += len(u_nodes)
    r = r - 1  # transform r back to rating label
    num_nodes = len(u_nodes) + len(v_nodes)
    node_labels = [x*2 for x in u_dist] + [x*2+1 for x in v_dist]
    max_node_label = 2*h + 1
    
    return u, v, r, node_labels, max_node_label, y

def pyg_batching(subgraphs):
    """
    Create a PyG-like batched representation from a list of subgraphs.
    
    Parameters:
    - subgraphs: List of subgraphs
    
    Returns:
    - batched_data: Dictionary containing batched node features, edge indices, edge types, and batch vector
    """
    
    # Lists to store batched data
    batched_x = []
    batched_edge_index = []
    batched_edge_type = []
    batch_vector = []
    batched_y = []
    
    # Variables to keep track of the number of nodes seen so far
    nodes_cumsum = 0
    
    for subgraph in subgraphs:
        # Node features
        x = subgraph['x'].numpy()
        batched_x.append(x)
        
        y = subgraph['y'].numpy()
        batched_y.append(y)
        
        # Edge indices (adjusted based on the number of nodes seen so far)
        edge_index = subgraph['edge_index'].numpy() + nodes_cumsum
        batched_edge_index.append(edge_index)
        
        # Edge types
        edge_type = subgraph['edge_type'].numpy()
        batched_edge_type.append(edge_type)
        
        # Batch vector
        batch_vector.extend([len(batched_x) - 1] * x.shape[0])
        
        # Update nodes_cumsum
        nodes_cumsum += x.shape[0]
    
    # Concatenate everything to form the batched data
    batched_data = {
        'x': tf.convert_to_tensor(np.concatenate(batched_x, axis=0), dtype=tf.float32),
        'edge_index': tf.convert_to_tensor(np.concatenate(batched_edge_index, axis=1), dtype=tf.int32),
        'edge_type': tf.convert_to_tensor(np.concatenate(batched_edge_type), dtype=tf.int32),
        'y': tf.convert_to_tensor(np.concatenate(batched_y, axis=0), dtype=tf.float32),
        'batch': tf.convert_to_tensor(batch_vector, dtype=tf.int32)
    }
    
    return batched_data





In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the data
# Replace 'path_to_ratings.dat' with the actual path to your 'ratings.dat' file
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', header=None, names=ratings_cols)


In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
# Step 2: Encode User and Movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['movie_id'] = movie_encoder.fit_transform(ratings['movie_id'])

# Step 3: Create Pivot Table
pivot_df = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')


In [10]:
pivot_df.shape

(943, 1682)

In [11]:
pivot_df

movie_id,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
1,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,,,,,,,,,5.0,,...,,,,,,,,,,
939,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
940,5.0,,,,,,4.0,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,


In [12]:
from utils.data_utils import prepare_data

R_bar_train, R_bar_val, R_bar_test, R_train, R_val, R_test = prepare_data('ml_100k', p = 0.0)

In [13]:
R_bar_train.shape

TensorShape([943, 1682])

In [17]:
from scipy.sparse import csr_matrix, csc_matrix
from scipy import sparse

In [15]:
R_train_csr = csr_matrix(R_train)
R_train_csc = csc_matrix(R_train)

Arow_train = SparseRowIndexer(R_train_csr)
Acol_train = SparseColIndexer(R_train_csc)

In [18]:
train_rows, train_cols, train_labels = sparse.find(R_train_csr)
train_indices = np.row_stack((train_rows, train_cols))


In [19]:
train_rows

array([  0,   0,   0, ..., 942, 942, 942], dtype=int32)

In [20]:
train_indices

array([[   0,    0,    0, ...,  942,  942,  942],
       [   0,    1,    2, ..., 1066, 1073, 1187]], dtype=int32)

In [None]:
processed_file_path = f'./processed_files/IGMC_data/p={args.p}'
train_dataset = MyDataset(Arow_train, Acol_train, train_indices, train_labels, processed_file_path=f'{processed_file_path}/train_set')



In [2]:
import numpy as np

R_train = np.array([
    [2, 0, 1, 0, 0],  # User 1
    [3, 1, 0, 0, 5],  # User 2
    [0, 0, 1, 0, 3],  # User 3
])


In [5]:
from scipy.sparse import csr_matrix, csc_matrix

R_train_csr = csr_matrix(R_train)
R_train_csc = csc_matrix(R_train)
Arow_train = SparseRowIndexer(R_train_csr)
Acol_train = SparseColIndexer(R_train_csc)

In [6]:
R_train_csr

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [7]:
Arow_train

<__main__.SparseRowIndexer at 0x1089211c0>

In [9]:
R_train_csr.data

array([2, 1, 3, 1, 5, 1, 3])

In [10]:
R_train_csr.indices

array([0, 2, 0, 1, 4, 2, 4], dtype=int32)

In [11]:
R_train_csr.indptr

array([0, 2, 5, 7], dtype=int32)

In [12]:
train_rows, train_cols, train_labels = sparse.find(R_train_csr)
train_indices = np.row_stack((train_rows, train_cols))

In [13]:
train_rows

array([0, 0, 1, 1, 1, 2, 2], dtype=int32)

In [14]:
train_cols

array([0, 2, 0, 1, 4, 2, 4], dtype=int32)

In [15]:
train_labels

array([2, 1, 3, 1, 5, 1, 3])

In [16]:
train_indices

array([[0, 0, 1, 1, 1, 2, 2],
       [0, 2, 0, 1, 4, 2, 4]], dtype=int32)

In [18]:
processed_file_path = f'./processed_files/IGMC_data/p={0.0}'

train_dataset = MyDataset(Arow_train, Acol_train, train_indices, train_labels, processed_file_path=f'{processed_file_path}/train_set')


In [19]:
train_subgraphs = train_dataset.process()

Enclosing subgraph extraction begins...


  0%|                                                                                                                  | 0/7 [00:00<?, ?it/s]2024-02-25 20:53:01.679754: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 83.77it/s]


In [29]:
len(train_subgraphs)

7

In [30]:
train_subgraphs[0]

{'x': <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]], dtype=float32)>,
 'edge_index': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[0., 1., 3., 2.],
        [3., 2., 0., 1.]], dtype=float32)>,
 'edge_type': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 2., 0., 2.], dtype=float32)>,
 'y': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.], dtype=float32)>}

In [31]:
R_train

array([[2, 0, 1, 0, 0],
       [3, 1, 0, 0, 5],
       [0, 0, 1, 0, 3]])

In [None]:
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

# Create a dummy user-item interaction matrix
# Assume 5 users and 5 items for simplicity
# 1 indicates interaction, 0 indicates no interaction

user_item_matrix = np.array([
    [2, 0, 1, 0, 1],  # User 1
    [3, 1, 0, 0, 0],  # User 2
    [0, 1, 1, 1, 0],  # User 3
])

# Convert to CSR (Compressed Sparse Row) format for efficient processing
user_item_csr = sp.csr_matrix(user_item_matrix)

user_item_csr
