In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

# 1 Data Exploration

In [10]:
books = pd.read_csv('books.csv')
tags = pd.read_csv('tags.csv')
tags = tags.merge(pd.read_csv('book_tags.csv'), how='inner', on='tag_id')
ratings = pd.read_csv('ratings.csv')

In [51]:
nodes_books = books[['book_id', 'goodreads_book_id', 'authors', 'original_publication_year', 'title', 'language_code', 'average_rating', 'ratings_count']].copy()
english_variants = [x for x in nodes_books.language_code.unique() if pd.notnull(x) and x.find('en') != -1]
nodes_books['language_code'] = nodes_books.language_code.apply(lambda x: x if x not in english_variants else 'en')
nodes_books['node_id'] = 'b_' + nodes_books['book_id'].astype(str)
nodes_books.sample(5)


Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,title,language_code,average_rating,ratings_count,node_id
2214,2215,80674,Judy Blume,1980.0,"Superfudge (Fudge, #3)",en,3.98,50917,b_2215
7486,7487,28374062,Karin Slaughter,2016.0,"The Kept Woman (Will Trent, #8)",en,4.1,8175,b_7487
920,921,18460392,"Jennifer Niven, فرانک معنوی",2015.0,All the Bright Places,en,4.19,132087,b_921
3995,3996,68458,Andrzej Sapkowski,1994.0,"Krew elfów (Saga o Wiedźminie, #3)",pol,4.23,11981,b_3996
1731,1732,209194,Agatha Christie,1924.0,The Man in the Brown Suit,en,3.97,56053,b_1732


In [46]:
'''
Step 1: Crosstab to get counts of each rating per user
Step 2: Compute total ratings count and average rating per user
Step 3: Merge the crosstab result with the user stats
Step 4: Rename the rating columns
Step 5: Fill NaN values with 0 for users who have no ratings in some categories
'''

rating_counts = pd.crosstab(ratings['user_id'], ratings['rating'])
user_stats = ratings.groupby('user_id').agg(ratings=('book_id', 'count'),
                                            average_rating=('rating', 'mean'))

nodes_users = user_stats.merge(rating_counts, left_index=True, right_index=True, how='left')
nodes_users.columns = ['ratings', 'average_rating', 'rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']
nodes_users = nodes_users.fillna(0).reset_index()
nodes_users['node_id'] = 'u_' + nodes_users['user_id'].astype(str)
nodes_users.sample(5)


Unnamed: 0,user_id,ratings,average_rating,rating_1,rating_2,rating_3,rating_4,rating_5,node_id
19798,19799,76,4.842105,1,2,1,0,72,u_19799
19999,20000,114,3.842105,2,11,26,39,36,u_20000
20261,20262,109,4.247706,2,2,9,50,46,u_20262
17242,17243,149,3.966443,4,5,23,77,40,u_17243
683,684,128,3.953125,1,7,33,43,44,u_684


In [13]:
print('nodes_books.columns:', nodes_books.columns)
print('nodes_users.columns:', nodes_users.columns)
print('ratings.columns:', ratings.columns)

nodes_books.columns: Index(['book_id', 'goodreads_book_id', 'authors', 'original_publication_year',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'node_type'],
      dtype='object')
nodes_users.columns: Index(['user_id', 'ratings', 'average_rating', 'rating_1', 'rating_2',
       'rating_3', 'rating_4', 'rating_5', 'node_type'],
      dtype='object')
ratings.columns: Index(['user_id', 'book_id', 'rating'], dtype='object')


# 2 Graph

Using the ratings dataset of the Goodreads-10k dataset, we want to create a directed bipartite graph with two types of nodes: users and books. The edge between a user and a book represents a rating.

### 2.1 Construct the Graph

Sample the dataset to work with a smaller graph:

In [67]:
sample_nodes_books = nodes_books.sample(500)
sample_nodes_users = nodes_users.sample(1000)
sample_edges_ratings = ratings[(ratings.book_id.isin(sample_nodes_books.book_id)) & (ratings.user_id.isin(sample_nodes_users.user_id))]

Construct a networkx graph:

In [68]:
G = nx.DiGraph()
# Add book nodes
for _, row in tqdm(sample_nodes_books.iterrows(), desc='Add book-nodes'):
    G.add_node(row['node_id'],
               node_type='book')

# Add user nodes
for _, row in tqdm(sample_nodes_users.iterrows(), desc='Add user-nodes'):
    G.add_node(row['node_id'], 
               node_type='user')
    
# Add edges (ratings)
for _, row in tqdm(sample_edges_ratings.iterrows(), desc='Add rating-edges'):
    G.add_edge(f"u_{row['user_id']}", f"b_{row['book_id']}", rating=row['rating'])

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

Add book-nodes: 500it [00:00, 27612.27it/s]
Add user-nodes: 1000it [00:00, 34036.94it/s]
Add rating-edges: 5629it [00:00, 41168.17it/s]

Graph has 1500 nodes and 5629 edges.





Remove isolated nodes:

In [70]:
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)
print(f"Removed {len(isolated_nodes)} isolated nodes.")

Removed 33 isolated nodes.


Verify that it is bipartite:

In [71]:
from networkx.algorithms import bipartite
is_bipartite = bipartite.is_bipartite(G)
print(f"Graph is bipartite: {is_bipartite}")

Graph is bipartite: True


### 2.2 Convert Graph to Torch Geometric

In [None]:
from torch_geometric.utils import from_networkx

data = from_networkx(G)

# 3 Recommender System Architectures

In this section, we will implement and apply different recommender system architectures to the Goodreads-10k dataset. 

The architectures we want to test are:
1. Graph Convolutional Network (GCN)
2. Graph Attention Network (GATv2)
3.


### Perplexity Recommedations:

1. **Graph Convolutional Networks (GCNs):** are a type of GNN that operate directly on the graph structure, allowing information to propagate between nodes along the edges of the graph. They can effectively capture high-order relationships between users and items by aggregating information from neighbors. GCNs have been widely used in recommender systems to learn latent representations of users and items from the user-item interaction graph.
2. **GraphSAGE (Graph Sample and AggregatE):** is a highly scalable GNN architecture that can generate node embeddings by sampling and aggregating features from a node's neighborhood. It uses an inductive learning approach, allowing it to generalize to unseen nodes, making it suitable for dynamic recommendation scenarios. GraphSAGE's sampling technique and aggregation function flexibility make it a popular choice for large-scale recommender systems.
3. **Knowledge Graph Attention Network (KGAT):** is a GNN-based model that incorporates knowledge graphs to enhance item representations. It constructs a heterogeneous graph consisting of users, items, and item attributes as nodes. KGAT recursively propagates and aggregates embeddings from neighboring nodes using an attention mechanism, capturing the importance of different neighbors. This approach leverages both user-item interactions and item knowledge for improved recommendations.
4. **Edge-Enhanced Graph Neural Networks (EGNNs):** also known as Edge GraphSAGE or Enhanced Graph Neural Networks, extend the GraphSAGE model by incorporating edge features in addition to node features. In recommender systems, EGNNs can capture the type and strength of connections between users and items, leading to more accurate and personalized recommendations by modeling both node-level and edge-level interactions.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

class BaseRecommender(nn.Module):
    def __init__(self):
        super(BaseRecommender, self).__init__()

    def forward(self, data):
        raise NotImplementedError("This method needs to be implemented by subclasses.")

    def train_model(self, data, epochs, lr):
        optimizer = optim.Adam(self.parameters(), lr=lr)
        criterion = nn.MSELoss()
        
        for epoch in range(epochs):
            self.train()
            optimizer.zero_grad()
            output = self(data)
            loss = criterion(output[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()
            
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    def evaluate(self, data):
        self.eval()
        with torch.no_grad():
            output = self(data)
            mse = ((output[data.test_mask] - data.y[data.test_mask]) ** 2).mean().item()
        return mse



## 3.1 Graph Convolutional Networks (GCN)

In [None]:
class GCNRecommender(BaseRecommender):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNRecommender, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x


In [50]:
ratings

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
...,...,...,...
5976474,49925,510,5
5976475,49925,528,4
5976476,49925,722,4
5976477,49925,949,5
