In [1]:
import json
import networkx as nx
import numpy as np
import pandas as pd
from torch_geometric.data import Data
import torch
import os

In [2]:
# Yelp dataset JSON file paths
review_filepath = os.path.join('data', 'yelp', 'yelp_academic_dataset_review.json')
business_filepath = os.path.join('data', 'yelp', "yelp_academic_dataset_business.json")

# Load the first N reviews
N = 1000  # You can adjust this value depending on your computation resources
reviews = []
with open(review_filepath) as f:
    for i, line in enumerate(f):
        if i == N:
            break
        reviews.append(json.loads(line))

# Convert reviews to pandas DataFrame
reviews_df = pd.DataFrame(reviews)

# Load the first N businesses
N = 1000  # You can adjust this value depending on your computation resources
businesses = []
with open(business_filepath) as f:
    for i, line in enumerate(f):
        if i == N:
            break
        businesses.append(json.loads(line))

# Convert businesses to pandas DataFrame
businesses_df = pd.DataFrame(businesses)

In [3]:
# Merge reviews and businesses based on business_id
data = pd.merge(reviews_df, businesses_df, on='business_id', how='inner')

# Create a graph using NetworkX
graph = nx.Graph()

# Add users and businesses to the graph
graph.add_nodes_from(data['user_id'], bipartite='user')
graph.add_nodes_from(data['business_id'], bipartite='business')

# Add edges between users and businesses
edges = [(row['user_id'], row['business_id']) for _, row in data.iterrows()]
graph.add_edges_from(edges)

# Create adjacency matrix from the graph
adjacency_matrix = nx.adjacency_matrix(graph)

# Convert adjacency matrix to edge index for PyTorch Geometric
edge_index = torch.tensor(np.array(adjacency_matrix.nonzero()), dtype=torch.long)

# Generate node features and labels (you might need to modify this according to your specific use case)
node_features = torch.randn((graph.number_of_nodes(), 16))  # Random node features for this example
labels = torch.randint(0, 2, (graph.number_of_nodes(),))  # Random labels for this example

# Create PyTorch Geometric data
pyg_data = Data(x=node_features, edge_index=edge_index, y=labels)


In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader

# Define the GCN model
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [5]:

# Initialize the model and optimizer
model = GCN(pyg_data.num_node_features, 2)  # Assuming binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Convert your PyG data into a DataLoader
data_loader = DataLoader([pyg_data], batch_size=1, shuffle=True)

# Train the model
model.train()
for epoch in range(200):  # 200 epochs
    for batch in data_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = F.nll_loss(out, batch.y)
        loss.backward()
        optimizer.step()

    
    if epoch % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item()}')




Epoch: 0, Loss: 0.8351233005523682
Epoch: 10, Loss: 0.6655710339546204
Epoch: 20, Loss: 0.614723265171051
Epoch: 30, Loss: 0.606084406375885
Epoch: 40, Loss: 0.5628054738044739
Epoch: 50, Loss: 0.5582737922668457
Epoch: 60, Loss: 0.5114756226539612
Epoch: 70, Loss: 0.5093430876731873
Epoch: 80, Loss: 0.4938248097896576
Epoch: 90, Loss: 0.45695188641548157
Epoch: 100, Loss: 0.4941503703594208
Epoch: 110, Loss: 0.4713861048221588
Epoch: 120, Loss: 0.4507020115852356
Epoch: 130, Loss: 0.4618241488933563
Epoch: 140, Loss: 0.4495328664779663
Epoch: 150, Loss: 0.4469359815120697
Epoch: 160, Loss: 0.42924764752388
Epoch: 170, Loss: 0.4566683769226074
Epoch: 180, Loss: 0.44306397438049316
Epoch: 190, Loss: 0.4562693238258362


In [6]:
reviews_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [7]:
def get_user_features(user_id):
    user_features = reviews_df[reviews_df['user_id'] == user_id]['stars'].values.astype(np.float32)
    # You can replace 'age', 'average_rating' with your actual user feature columns
    return user_features[0] if len(user_features) > 0 else None

def get_all_business_features():
    business_features = businesses_df[['review_count', 'stars']].values.astype(np.float32)
    # You can replace 'number_of_reviews', 'average_stars' with your actual business feature columns
    return business_features

In [8]:
reviews_df['user_id'].unique()[:25]

array(['mh_-eMZ6K5RLWhZyISBhwA', 'OyoGAe7OKpv6SyGZT5g77Q',
       '8g_iMtfSiwikVnbP2etR0A', '_7bHUi9Uuf5__HHc_Q8guQ',
       'bcjbaE6dDog4jkNY91ncLQ', 'eUta8W_HdHMXPzLBBZhL1A',
       'r3zeYsv1XFBRA4dJpL78cw', 'yfFzsLmaWF2d4Sr0UNbBgg',
       'wSTuiTk-sKNdcFyprzZAjg', '59MxRhNVhU9MYndMkz0wtw',
       '1WHRWwQmZOZDAhp2Qyny4g', 'ZbqSHbgCjzVAqaa7NKWn5A',
       '9OAtfnWag-ajVxRbUTGIyg', 'smOvOajNG0lS4Pq7d8g4JQ',
       '4Uh27DgGzsp6PqrH913giQ', '1C2lxzUo1Hyye4RFIXly3g',
       'Dd1jQj7S-BFGqRbApFzCFw', 'j2wlzrntrbKwyOcOiB3l3w',
       'NDZvyYHTUWWu-kqgQzzDGQ', 'IQsF3Rc6IgCzjVV9DE8KXg',
       'Ohhrhu1RkqfVciIVx_W5HQ', 'WBpQDAZymU0dhIqXcACGNw',
       'vrKkXsozqqecF3CW4cGaVQ', 'OhECKhQEexFypOMY6kypRw',
       'RreNy--tOmXMl1en0wiBOg'], dtype=object)

In [9]:
all_business_ids = businesses_df['business_id'].tolist()

In [10]:
def recommend(user_id, model, original_data, all_business_ids):
    model.eval()

    # Assuming that user_features and business_features are pre-processed and available
    user_features = get_user_features(user_id)
    business_features = get_all_business_features()

    # Assuming that the features are in numpy array, converting them to torch tensors
    user_features = torch.tensor(user_features, dtype=torch.float).view(1, -1)
    business_features = torch.tensor(business_features, dtype=torch.float)

    # Create an input feature matrix combining the user features with each business feature
    user_business_features = torch.cat([user_features.repeat(len(business_features), 1), business_features], dim=1)

    # We should use the same graph structure (edge_index) as during training
    edge_index = original_data.edge_index

    # Construct a PyTorch Geometric Data object
    data = Data(x=user_business_features, edge_index=edge_index)

    # Pass the data through the model
    scores = model(data)

    # Get the top K business indices
    top_k_indices = torch.topk(scores, k=5).indices  # Top-5 businesses

    # Get the business IDs for the top indices
    top_k_business_ids = [all_business_ids[i] for i in top_k_indices]

    return top_k_business_ids


In [11]:
recommend('bcjbaE6dDog4jkNY91ncLQ', model, pyg_data , all_business_ids)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1000x3 and 16x16)