In [1]:
import os
import json
import re
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import torch
import torch.nn.functional as F
from torch_sparse import SparseTensor
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader, HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from sklearn.model_selection import train_test_split


In [6]:
# Load sliced data
restaurants = pd.read_feather('data/yelp_restaurants.feather')
reviews = pd.read_feather('data/yelp_restaurants_reviews.feather', )

In [7]:
reviews['date'] = pd.to_datetime(reviews['date'])
reviews_sub = reviews[reviews.date.dt.year == 2018].reset_index(drop=True)

In [8]:
rest = restaurants[restaurants.business_id.isin(reviews_sub.business_id.unique())].reset_index(drop=True)
rest.drop(['address', 'state', 'postal_code', 'hours'], axis='columns', inplace=True)

In [5]:
# Extract unique restaurant and user IDs and create a dictionary mapping them to indices
restaurant_ids = rest['business_id'].unique().tolist()
num_restaurants = len(restaurant_ids)
restaurant_indices = dict(zip(restaurant_ids, range(num_restaurants)))

user_ids = reviews_sub['user_id'].unique().tolist()
num_users = len(user_ids)
user_indices = dict(zip(user_ids, range(num_users)))

In [60]:
rest.insert(loc=1, column='mapped_business_id', value=rest.business_id.map(restaurant_indices))

In [8]:
user = pd.read_feather('data/yelp/yelp_academic_dataset_user.feather')
user = user[user.user_id.isin(reviews_sub.user_id.unique())].reset_index(drop=True)
user.insert(loc=1, column='mapped_user_id', value=user.user_id.map(user_indices))

In [12]:
reviews_merged = reviews_sub.merge(rest[['business_id']], on='business_id', how='inner')
# Encode IDs
reviews_merged.insert(loc=3, column='mapped_user_id', value=reviews_merged.user_id.map(user_indices))
reviews_merged.insert(loc=4, column='mapped_business_id', value=reviews_merged.business_id.map(restaurant_indices))

In [13]:
reviews_merged.head()

Unnamed: 0,review_id,user_id,business_id,mapped_user_id,mapped_business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,0,388,3.0,0.0,0.0,0.0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,uyS0ysaMd4mzw5rNYbgcjA,ql0XsKTjM7VeBAUqbphQDw,XQfwVwDr-v0ZS3_CbbE5Xw,1878,388,3.0,0.0,0.0,0.0,"Food is fantastic, service is quite awful! Ca...",2018-03-24 17:50:37
2,R10wk4xEHX9r-qs5Z_2vvw,ZeBgfIMxp9K8OFmlXmQ3yA,XQfwVwDr-v0ZS3_CbbE5Xw,5139,388,3.0,0.0,0.0,0.0,Update: I deducted a star because they no long...,2018-07-21 09:26:33
3,pDN3hRBarmGWXbK64A83MA,IBrReMAeZkVIbjZIe1E_Hw,XQfwVwDr-v0ZS3_CbbE5Xw,7057,388,1.0,0.0,0.0,0.0,never coming back here again. all of the glass...,2018-09-08 17:03:53
4,HxWtq5q4OQ-4osStqn54bA,k4_8Cw2icH0nFV5MskGK1A,XQfwVwDr-v0ZS3_CbbE5Xw,12935,388,2.0,0.0,0.0,0.0,Unfortunately the weekend chef doesn't know ho...,2018-09-09 14:30:29


In [73]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.get(key, "{}")

# convert string to dictionary
import ast
def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}")  

In [74]:
# get dummies from nested attributes
rest['DietaryRestrictions'] = rest.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Dietary')), axis=1)
rest['DogsAllowed'] = rest.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'DogsAllowed')), axis=1)
rest['WheelchairAccessible'] = rest.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'WheelchairAccessible')), axis=1)
rest['WiFi'] = rest.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'WiFi')), axis=1)


In [84]:
rest['DietaryRestrictions'].value_counts()

DietaryRestrictions
{}    31217
Name: count, dtype: int64

In [72]:
# create table with attribute dummies
df_attr = pd.concat(
    [ 
        # rest['attributes'].apply(pd.Series), 
        # rest['BusinessParking'].apply(pd.Series),
        # rest['Ambience'].apply(pd.Series), 
        # rest['GoodForMeal'].apply(pd.Series), 
        rest['DogsAllowed'].apply(pd.Series),
        rest['DietaryRestrictions'].apply(pd.Series),
        rest['WheelchairAccessible'].apply(pd.Series),
        rest['WiFi'].apply(pd.Series),
    ], 
    axis=1
)
df_attr_dummies = pd.get_dummies(df_attr, dtype=int)
df_attr_dummies

ValueError: No objects to concatenate

In [35]:
df_categories_dummies = pd.Series(rest['categories']).str.get_dummies(', ', dtype=float)
df_categories_dummies

Unnamed: 0,Acai Bowls,Accessories,Accountants,Active Life,Acupuncture,Adult,Adult Entertainment,Advertising,Afghan,African,...,Whiskey Bars,Wholesale Stores,Wholesalers,Wine Bars,Wine Tasting Room,Wine Tours,Wineries,Women's Clothing,Wraps,Yoga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
node_feature = rest[['latitude', 'longitude', 'stars', 'review_count', 'is_open']]
node_feature

## REVIEW BASED GNN

In [None]:
# reviews_df = reviews_df[['review_id', 'user_id', 'business_id', 'stars', 'text']]
# restaurant_ratings = reviews_subdf.groupby(['business_id', 'user_id'])['stars'].mean().reset_index()

# # Create a sparse tensor representing the restaurant ratings
# restaurant_indices_inv = {v: k for k, v in restaurant_indices.items()}
# restaurant_ratings = restaurant_ratings.dropna(subset=['business_id', 'user_id'])
# user_ratings_df = restaurant_ratings.pivot(index='business_id', columns='user_id', values='stars').fillna(0)
# user_ratings_df = user_ratings_df.rename(index=restaurant_indices)

In [21]:
sample_user, sample_restaurant = 'mh_-eMZ6K5RLWhZyISBhwA', 'XQfwVwDr-v0ZS3_CbbE5Xw'

In [18]:
graph = nx.Graph()
# Add users and businesses to the graph
graph.add_nodes_from(reviews_merged['user_id'].unique(), bipartite='user')
graph.add_nodes_from(reviews_merged['business_id'], bipartite='business')

for rev in reviews_merged.iterrows():
    graph.add_node(rev['user_'])

In [19]:
# Add edges between users and businesses
edges = [(row['user_id'], row['business_id']) for _, row in reviews_merged.iterrows()]
graph.add_edges_from(edges)

In [22]:
H = graph.subgraph([sample_restaurant, sample_user])
list(H.edges)

[('XQfwVwDr-v0ZS3_CbbE5Xw', 'mh_-eMZ6K5RLWhZyISBhwA')]

In [15]:
# Create adjacency matrix from the graph
adjacency_matrix = nx.adjacency_matrix(graph)

In [16]:
# Convert adjacency matrix to edge index for PyTorch Geometric
edge_index = torch.tensor(np.array(adjacency_matrix.nonzero()), dtype=torch.long)

In [12]:
# Generate node features and labels (you might need to modify this according to your specific use case)
node_features = torch.randn((graph.number_of_nodes(), 16))  # Random node features for this example
labels = torch.randint(0, 2, (graph.number_of_nodes(),))  # Random labels for this example

# Create PyTorch Geometric data
pyg_data = Data(x=node_features, edge_index=edge_index, y=labels)

In [13]:
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [14]:
# Initialize the model and optimizer
model = GCN(pyg_data.num_node_features, 2)  # Assuming binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Convert your PyG data into a DataLoader
data_loader = DataLoader([pyg_data], batch_size=1, shuffle=True)



In [None]:
model.train()
for epoch in range(200):  # 200 epochs
    for batch in data_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = F.nll_loss(out, batch.y)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

In [16]:
reviews.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [17]:
restaurants.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [20]:
model = GCN(pyg_data.num_node_features, 2)
model.load_state_dict(torch.load('output/simple_GNN.pth'))
model.eval()

GCN(
  (conv1): GCNConv(16, 16)
  (conv2): GCNConv(16, 2)
)

In [22]:
user_ids = reviews['user_id'].unique().tolist()

In [28]:
def get_user_features(user_id):
    user_features = reviews[reviews['user_id'] == user_id][['stars',]].values.astype(float)
    # You can replace 'age', 'average_rating' with your actual user feature columns
    return user_features[0] if len(user_features) > 0 else None

def get_all_business_features():
    business_features = restaurants[['review_count', 'stars']].values.astype(float)
    # You can replace 'number_of_reviews', 'average_stars' with your actual business feature columns
    return business_features

In [38]:
def recommend(user_id, model):
    model.eval()
    
    # Assuming that user_features and business_features are pre-processed and available
    user_features = get_user_features(user_id)  # You need to implement this function
    business_features = get_all_business_features()  # You need to implement this function

    user_features = torch.tensor(user_features, dtype=torch.float)
    business_features = torch.tensor(business_features, dtype=torch.float)

    # Concatenate the user features with each business feature
    user_business_features = torch.cat([user_features.repeat(len(business_features), 1), business_features], dim=1)
    print(user_business_features.shape)
    print(user_business_features)

    # Pass the user-business pair through the model
    scores = model(user_business_features)

    # Get the top K business indices
    top_k_indices = torch.topk(scores, k=5).indices  # Top-5 businesses

    # Get the business IDs for the top indices
    top_k_business_ids = [restaurant_ids[i] for i in top_k_indices]  # all_business_ids should be pre-defined

    return top_k_business_ids


In [39]:
recommend(user_ids[5], model)

torch.Size([44676, 6])
tensor([[ 3.0000,  0.0000,  0.0000,  0.0000, 80.0000,  4.0000],
        [ 3.0000,  0.0000,  0.0000,  0.0000,  6.0000,  2.0000],
        [ 3.0000,  0.0000,  0.0000,  0.0000, 10.0000,  1.5000],
        ...,
        [ 3.0000,  0.0000,  0.0000,  0.0000, 35.0000,  4.5000],
        [ 3.0000,  0.0000,  0.0000,  0.0000, 14.0000,  4.5000],
        [ 3.0000,  0.0000,  0.0000,  0.0000, 18.0000,  4.5000]])


AttributeError: 'Tensor' object has no attribute 'x'

## Heterogenous Graph Neural Network

In [13]:
edge_index = torch.stack(
    [torch.from_numpy(reviews_merged.mapped_user_id.values), torch.from_numpy(reviews_merged.mapped_business_id.values)],
    dim=0
)

data = HeteroData()

data['user'].node_id = torch.arange(len(reviews_merged.mapped_user_id.unique()))
data['restaurant'].node_id = torch.arange(len(reviews_merged.mapped_business_id.unique()))
data["restaurant"].x = node_features

data['user', 'rating', 'restaurant'].edge_index = edge_index
data['user', 'rating', 'restaurant'].edge_label = torch.from_numpy(reviews_merged.stars.values)

data = T.ToUndirected()(data)
del data['restaurant', 'rev_rating', 'user'].edge_label 

In [19]:
data

HeteroData(
  [1muser[0m={ node_id=[290714] },
  [1mrestaurant[0m={ node_id=[31217] },
  [1m(user, rating, restaurant)[0m={
    edge_index=[2, 596895],
    edge_label=[596895]
  },
  [1m(restaurant, rev_rating, user)[0m={ edge_index=[2, 596895] }
)

In [20]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=0.0,
    add_negative_train_samples=False,
    edge_types=("user", "rating", "restaurant"),
    rev_edge_types=("restaurant", "rev_rating", "user"), 
)

train_data, val_data, test_data = transform(data)

In [21]:
# Define the training seed changes
edge_label_index = train_data["user", "rating", "restaurant"].edge_label_index
edge_label = train_data["user", "rating", "restaurant"].edge_label

In [None]:

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rating", "restaurant"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
)   

In [20]:
# Define the validation seed edges:
edge_label_index = val_data["user", "rating", "restaurant"].edge_label_index
edge_label = val_data["user", "rating", "restaurant"].edge_label
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=((["user", "rating", "restaurant"]), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)
sampled_data = next(iter(val_loader))

: 

: 

In [3]:
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import LinkNeighborLoader

data = Planetoid('data/debug', name='Cora')[0]

loader = LinkNeighborLoader(
    data,
    # Sample 30 neighbors for each node for 2 iterations
    num_neighbors=[30] * 2,
    # Use a batch size of 128 for sampling training nodes
    batch_size=128,
    edge_label_index=data.edge_index,
)

sampled_data = next(iter(loader))

ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'

In [5]:
!pip install pyg-library

ERROR: Could not find a version that satisfies the requirement pyg-library (from versions: none)
ERROR: No matching distribution found for pyg-library


## Neural CF

In [3]:
df = pd.read_feather('data/yelp_restaurants_philadelphia.feather')
dfg = pd.read_feather('data/yelp_restaurants_philadelphia_reviews.feather')

In [6]:
restaurant_ids = df['business_id'].unique().tolist()
num_restaurants = len(restaurant_ids)
restaurant_indices = dict(zip(restaurant_ids, range(num_restaurants)))

user_ids = dfg['user_id'].unique().tolist()
num_users = len(user_ids)
user_indices = dict(zip(user_ids, range(num_users)))

In [7]:
rev = dfg.merge(df[['business_id']], on='business_id', how='inner')

In [9]:
rev.insert(loc=3, column='mapped_user_id', value=rev.user_id.map(user_indices))
rev.insert(loc=4, column='mapped_business_id', value=rev.business_id.map(restaurant_indices))

In [10]:
rev.head()

Unnamed: 0,review_id,user_id,business_id,mapped_user_id,mapped_business_id,stars,useful,funny,cool,text,date
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,0,217,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,19,217,2.0,0.0,0.0,1.0,Dine-in gets 2 stars. Disappointing service & ...,2014-07-13 17:25:47
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,68,217,5.0,13.0,6.0,5.0,After a long hiatus from reviewing I have awak...,2010-08-20 19:16:04
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,534,217,5.0,1.0,1.0,1.0,We've eaten here 3 times and it seems that eac...,2017-01-02 14:25:26
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,872,217,5.0,0.0,0.0,0.0,Came to Philly for a family event but stayed a...,2015-07-28 17:15:20


In [11]:
rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 662473 entries, 0 to 662472
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   review_id           662473 non-null  object 
 1   user_id             662473 non-null  object 
 2   business_id         662473 non-null  object 
 3   mapped_user_id      662473 non-null  int64  
 4   mapped_business_id  662473 non-null  int64  
 5   stars               662473 non-null  float64
 6   useful              662473 non-null  float64
 7   funny               662473 non-null  float64
 8   cool                662473 non-null  float64
 9   text                662473 non-null  object 
 10  date                662473 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 55.6+ MB


In [12]:
user_mapping = rev[['user_id', 'mapped_user_id']].drop_duplicates(inplace=True)
business_mapping = rev[['business_id', 'mapped_business_id']].drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_mapping = rev[['user_id', 'mapped_user_id']].drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  business_mapping = rev[['business_id', 'mapped_business_id']].drop_duplicates(inplace=True)


In [13]:
edge_index = torch.stack(
    [torch.from_numpy(rev.mapped_user_id.values), torch.from_numpy(rev.mapped_business_id.values)],
    dim=0
    )

In [19]:
num_interactions = edge_index.shape[1]

# split the edges of the graph using a 80/10/10 train/validation/test split
all_indices = [i for i in range(num_interactions)]

train_indices, test_indices = train_test_split(all_indices, 
                                               test_size=0.3, 
                                               random_state=1)

val_indices, test_indices = train_test_split(test_indices, 
                                             test_size=0.5, 
                                             random_state=1)

train_edge_index = edge_index[:, train_indices]
val_edge_index = edge_index[:, val_indices]
test_edge_index = edge_index[:, test_indices]

In [20]:
print(f"""
Num users: {num_users}
Num businesses: {num_restaurants}
Num interactions: {num_interactions}""")
print(torch.unique(train_edge_index[0]).size())
print(torch.unique(train_edge_index[1]).size())


Num users: 204722
Num businesses: 4829
Num interactions: 662473
torch.Size([164265])
torch.Size([4827])


In [21]:
def convert_r_mat_edge_index_to_adj_mat_edge_index(input_edge_index):
    R = torch.zeros((num_users, num_restaurants))
    for i in range(len(input_edge_index[0])):
        row_idx = input_edge_index[0][i]
        col_idx = input_edge_index[1][i]
        R[row_idx][col_idx] = 1

    R_transpose = torch.transpose(R, 0, 1)
    adj_mat = torch.zeros((num_users + num_restaurants , num_users + num_restaurants))
    adj_mat[: num_users, num_users :] = R.clone()
    adj_mat[num_users :, : num_users] = R_transpose.clone()
    adj_mat_coo = adj_mat.to_sparse_coo()
    adj_mat_coo = adj_mat_coo.indices()
    return adj_mat_coo

In [22]:
def convert_adj_mat_edge_index_to_r_mat_edge_index(input_edge_index):
    sparse_input_edge_index = SparseTensor(row=input_edge_index[0], 
                                           col=input_edge_index[1], 
                                           sparse_sizes=((num_users + num_restaurants), num_users + num_restaurants))
    adj_mat = sparse_input_edge_index.to_dense()
    interact_mat = adj_mat[: num_users, num_users :]
    r_mat_edge_index = interact_mat.to_sparse_coo().indices()
    return r_mat_edge_index

In [23]:
train_edge_index = convert_r_mat_edge_index_to_adj_mat_edge_index(train_edge_index)
val_edge_index = convert_r_mat_edge_index_to_adj_mat_edge_index(val_edge_index)
test_edge_index = convert_r_mat_edge_index_to_adj_mat_edge_index(test_edge_index)

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 175646486404 bytes.