In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json


In [26]:

class BooksDataset:
    """
    In-memory bundle of the books recommendation data stored under ``data_dir``.

    Loads the train interaction matrix, the image / text embeddings, the user
    profile and book attribute embeddings, and the train / test / validation
    dictionaries, and exposes them by name through ``self.datasets``.
    """

    def __init__(self, data_dir):
        """
        Load every file of the books dataset from data_dir
        :param data_dir: directory that contains the dataset files
        :type data_dir: str
        """
        self.data_dir = data_dir
        # NOTE(security): unpickling can execute arbitrary code -- only load a
        # train_matrix.pkl that comes from a trusted source.
        self.train_matrix = pd.read_pickle(f'{data_dir}/train_matrix.pkl')
        self.images = np.load(f'{data_dir}/embed_image.npy')
        self.text = np.load(f'{data_dir}/embed_text.npy')
        self.user_profiles = np.load(f'{data_dir}/users_profiles_embeddings.npy')
        self.books_attributes = np.load(f'{data_dir}/books_attributes_embeddings.npy')
        with open(f'{data_dir}/train.json', 'r') as f:
            self.train_dict = json.load(f)
        with open(f'{data_dir}/test.json', 'r') as f:
            self.test_dict = json.load(f)
        with open(f'{data_dir}/validation.json', 'r') as f:
            self.val_dict = json.load(f)
        # create a dict to map each dataset name to its corresponding data
        self.datasets = {
            'train': self.train_matrix,
            'images': self.images,
            'text': self.text,
            'user_profiles': self.user_profiles,
            'books_attributes': self.books_attributes,
            'train_dict': self.train_dict,
            'test_dict': self.test_dict,
            'val_dict': self.val_dict
        }

    def __len__(self):
        """
        Return the number of users (entries) of the train dictionary.
        Python requires __len__ to return an int: returning a dict made every
        len(dataset) call raise TypeError. The per-dataset lengths are
        available through lengths().
        :return: number of entries in train_dict
        :rtype: int
        """
        return len(self.train_dict)

    def lengths(self):
        """
        Return the length of all the datasets as dictionary
        :return: mapping dataset name -> len() of that dataset
        :rtype: dict
        """
        return {k: len(v) for k, v in self.datasets.items()}

    # return the dataset by name
    def get_dataset(self, dataset):
        """
        Return the dataset by name
        :param dataset: dataset name (one of the keys of self.datasets)
        :type dataset: str
        :return: dataset
        :rtype: depends on the name: the object read from train_matrix.pkl for
                'train', numpy.ndarray for the embeddings, dict for the *_dict entries
        :raises KeyError: if the name is not a known dataset
        """
        try:
            return self.datasets[dataset]
        except KeyError:
            raise KeyError(
                f"unknown dataset '{dataset}'; available: {sorted(self.datasets)}"
            ) from None

    # return all the datasets
    def get_all_datasets(self):
        """
        Return all the datasets
        :return: all the datasets defined in the class
        :rtype: dict
        """
        return self.datasets

    def _negative_pool(self):
        """
        Return (and cache) the distinct books that appear in the train dictionary
        :return: distinct books, in first-seen order so seeded runs are reproducible
        :rtype: list
        """
        pool = getattr(self, '_book_pool', None)
        if pool is None:
            pool = list(dict.fromkeys(
                book for books in self.train_dict.values() for book in books
            ))
            self._book_pool = pool
        return pool

    # sample n_users from the train dataset, and return the users, positive and negative books
    def sample(self, n_users):
        """
        Sample n_users from the train dataset, and return the users, positive and negative books.
        For every sampled user one positive book is drawn from the user's own train
        list and one negative book from the books of the train dictionary that
        are not in that list.
        Users are drawn without replacement while enough users are available and
        with replacement otherwise. Randomness comes from np.random, so
        np.random.seed makes the draw reproducible.
        :param n_users: number of users to sample
        :type n_users: int
        :return:  users list, positive books list, negative books list
        :rtype: list, list, list
        :raises ValueError: if no user has a positive book, or a sampled user
                            interacted with every book (no negative exists)
        """
        if n_users == 0:
            return [], [], []

        # a user without positives cannot produce a (user, pos, neg) triple
        eligible = [uid for uid, books in self.train_dict.items() if len(books) > 0]
        if not eligible:
            raise ValueError('no user with at least one positive book to sample from')

        all_books = self._negative_pool()
        picks = np.random.choice(len(eligible), size=n_users, replace=n_users > len(eligible))

        users = []
        pos_books = []
        neg_books = []
        for pick in picks:
            uid = eligible[pick]
            liked = self.train_dict[uid]
            liked_set = set(liked)
            # rejection sampling below would never terminate for such a user
            if len(liked_set) >= len(all_books):
                raise ValueError(f'user {uid} interacted with every book; no negative book exists')
            pos_book = liked[np.random.randint(len(liked))]
            while True:
                neg_book = all_books[np.random.randint(len(all_books))]
                if neg_book not in liked_set:
                    break
            # train_dict comes from JSON, so its keys are strings; the original
            # lookup used str(i), i.e. the keys are integer ids in string form
            users.append(int(uid))
            pos_books.append(pos_book)
            neg_books.append(neg_book)

        return users, pos_books, neg_books

    def describe(self):
        """
        Print the shape of all the datasets, the number of interactions and the
        sparsity of the train matrix. Nothing is returned.
        """
        # numpy arrays report their shape, every other dataset its length
        shape = {k: v.shape if isinstance(v, np.ndarray) else len(v) for k, v in self.datasets.items()}
        # get the number of interactions in the train matrix
        n_interactions = np.count_nonzero(self.train_matrix)
        # get the sparsity of the train matrix (share of cells without an interaction)
        sparsity = 1 - n_interactions / (self.train_matrix.shape[0] * self.train_matrix.shape[1])
        # print the results as two columns
        print(f"{'Dataset':<20}{'Shape':<20}")
        print('-' * 40)
        for k, v in shape.items():
            print(f"{k:<20}{v}")

        print(f"\nNumber of interactions: {n_interactions}")
        print(f"Sparsity: {sparsity:.2%}")
        
      

In [27]:
# Smoke test: load the books data from disk and print its summary table
data_dir = '../data/books'
dataset = BooksDataset(data_dir=data_dir)
dataset.describe()


Dataset             Shape               
----------------------------------------
train               14790
images              (33962, 1024)
text                (33962, 1024)
user_profiles       (14790, 768)
books_attributes    (33962, 768)
train_dict          14790
test_dict           14790
val_dict            14790

Number of interactions: 449114
Sparsity: 99.91%


In [ ]:
import scipy as sp
import torch
# Row count of the 'train' dataset (presumably one row per user -- TODO confirm)
n_users = dataset.get_dataset('train').shape[0]
# Build the symmetrically normalised user-item adjacency matrix (A_tilda)
# and return it as a torch sparse tensor.
def get_A_tilda(self):
    """
    Build the normalised bipartite adjacency matrix D^-1/2 * A * D^-1/2.

    A is the (n_users + n_items) square matrix [[0, R], [R.T, 0]], where R is the
    binary user-item interaction matrix. Duplicate (user, item) pairs count once.
    Rows / columns of nodes without any interaction stay all-zero.

    :param self: object exposing ``n_users``, ``n_items`` and an interaction table
                 under ``data`` and/or ``dataset`` that can be indexed with
                 'user_id_idx' and 'item_id_idx' (e.g. a DataFrame or a dict of
                 equally long index sequences)
    :return: normalised adjacency matrix of shape (n_users + n_items, n_users + n_items)
    :rtype: torch.Tensor (sparse COO, float32)
    :raises AttributeError: if neither ``self.data`` nor ``self.dataset`` exists
    """
    # Imported locally on purpose: the notebook binds `sp` to top-level `scipy`,
    # which does not expose `dok_matrix` / `diags` -- they live in `scipy.sparse`.
    import scipy.sparse as sparse

    # Use the sizes carried by `self` everywhere; the previous version sliced the
    # adjacency matrix with an unrelated notebook-level `n_users` global.
    n_users, n_items = int(self.n_users), int(self.n_items)

    # The original read users from `self.dataset` and items from `self.data`.
    # Accept either attribute, preferring the one the original used per column.
    data_table = getattr(self, 'data', None)
    dataset_table = getattr(self, 'dataset', None)
    user_table = dataset_table if dataset_table is not None else data_table
    item_table = data_table if data_table is not None else dataset_table
    if user_table is None:
        raise AttributeError(
            "expected `self.data` or `self.dataset` holding 'user_id_idx' / 'item_id_idx'"
        )

    user_idx = np.asarray(user_table["user_id_idx"], dtype=np.int64)
    item_idx = np.asarray(item_table["item_id_idx"], dtype=np.int64)

    # Interaction matrix R. COO -> CSR sums duplicate pairs, so reset the stored
    # values to 1.0 to keep R binary (same result as assigning 1.0 cell by cell).
    ones = np.ones(len(user_idx), dtype=np.float32)
    R = sparse.coo_matrix((ones, (user_idx, item_idx)), shape=(n_users, n_items)).tocsr()
    R.data[:] = 1.0

    # Bipartite adjacency: users occupy the first n_users rows / columns.
    adj_mat = sparse.bmat([[None, R], [R.T, None]], format="csr", dtype=np.float32)

    # D^-1/2, with isolated nodes explicitly mapped to 0 instead of relying on an
    # epsilon (which made the former isinf() guard unreachable).
    degree = np.asarray(adj_mat.sum(axis=1)).ravel()
    d_inv_sqrt = np.zeros_like(degree)
    connected = degree > 0
    d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
    d_mat_inv = sparse.diags(d_inv_sqrt)

    norm_adj_coo = (d_mat_inv @ adj_mat @ d_mat_inv).tocoo().astype(np.float32)

    # Convert the scipy COO matrix to a torch sparse COO tensor.
    indices = torch.from_numpy(
        np.vstack((norm_adj_coo.row, norm_adj_coo.col)).astype(np.int64)
    )
    values = torch.from_numpy(norm_adj_coo.data)
    size = tuple(int(dim) for dim in norm_adj_coo.shape)

    # torch.sparse.FloatTensor(i, v, size) is a legacy constructor;
    # sparse_coo_tensor is the supported equivalent.
    return torch.sparse_coo_tensor(indices, values, size=size)