In [6]:
import pandas as pd
import json
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
def read_json_in_chunks(file_path, chunk_size=10000):
    """Read large JSON file in chunks"""
    chunks = []
    
    with open(file_path, 'r') as file:
        chunk = []
        for i, line in enumerate(file):
            chunk.append(line)
            
            if (i + 1) % chunk_size == 0:
                chunk_df = pd.read_json('\n'.join(chunk), lines=True)
                chunks.append(chunk_df)
                chunk = []  
        
        # process remaining lines
        if chunk:
            chunk_df = pd.read_json('\n'.join(chunk), lines=True)
            chunks.append(chunk_df)
    
    return pd.concat(chunks, ignore_index=True)

In [3]:
reviews_file_path = "../data/processed/sf/sampled/sf-sampled-reviews.json"
reviews_df = read_json_in_chunks(reviews_file_path)
restaurants_file_path = "../data/processed/sf/sf-restaurants.json"
restaurants_df = read_json_in_chunks(restaurants_file_path)

  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), lines=True)
  chunk_df = pd.read_json('\n'.join(chunk), line

In [4]:
print(reviews_df.shape)
print(restaurants_df.shape)

(411496, 6)
(3721, 15)


In [7]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411496 entries, 0 to 411495
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   user_id       411496 non-null  float64
 1   name          411496 non-null  object 
 2   time          411496 non-null  int64  
 3   rating        411496 non-null  int64  
 4   text          220550 non-null  object 
 5   gmap_id       411496 non-null  object 
 6   user_encoded  411496 non-null  int64  
 7   item_encoded  411496 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 25.1+ MB


In [None]:
user_encoder = LabelEncoder()
restaurant_encoder = LabelEncoder()

reviews_df["user_encoded"] = user_encoder.fit_transform(reviews_df["user_id"])
reviews_df["restaurant_encoded"] = restaurant_encoder.fit_transform(reviews_df["gmap_id"])

reviews_df["label"] = reviews_df["rating"]
reviews_df["label_binary"] = (reviews_df["rating"] >= 4).astype(int)

## Dataset Sparsity () counts-only
U = reviews_df["user_encoded"].nunique()
I = reviews_df["restaurant_encoded"].nunique()
N = len(reviews_df)
sparsity = N / (U * I)

reviews_df['timestamp'] = pd.to_datetime(reviews_df['time'])
reviews_df = reviews_df.sort_values('timestamp')

split_idx = int(len(reviews_df) * 0.8)
train_df = reviews_df.iloc[:split_idx]
test_df = reviews_df.iloc[split_idx:]

print(f"Users (U): {U}")
print(f"Restaurants (I): {I}")
print(f"Interactions (N): {N}")
print(f"Data Sparsity: {sparsity*100:.2f}%")

Users (U): 131972
Items (I): 3721
Interactions (N): 411496
Data Sparsity: 0.08%


In [51]:
layer_sizes=[128, 256, 512, 256, 64]
dropout_sizes = [0.2,0.3,0.4,0.3,0.2]

class MLPBlock(nn.Module):
    """A reusable fully-connected block with optional BatchNorm, ReLU, and Dropout."""
    def __init__(self, in_dim, out_dim, dropout=0.0, use_bn=True):
        super().__init__()
        layers = [nn.Linear(in_dim, out_dim)]
        if use_bn:
            layers.append(nn.BatchNorm1d(out_dim))
        layers.append(nn.ReLU())
        if dropout > 0:
            layers.append(nn.Dropout(dropout))
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)

class NCFModel(nn.Module):
    def __init__(self, num_users, num_restaurants, embedding_dim=64):
        super(NCFModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_restaurants, embedding_dim)

        mlp_layers = []
        input_dim = embedding_dim * 2
        for i in range(len(layer_sizes)):
            hidden_dim = layer_sizes[i]
            dropout = dropout_sizes[i]
            mlp_layers.append(MLPBlock(input_dim, hidden_dim, dropout=dropout))
            input_dim = hidden_dim
        mlp_layers.append(nn.Linear(input_dim, 1))  # output layer
        self.mlp=nn.Sequential(*mlp_layers)
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)


    def forward(self, user, item):
        u = self.user_embedding(user)
        i = self.item_embedding(item)
        x = torch.cat([u, i], dim=1)
        logits = self.mlp(x)
        return logits

In [46]:
class ReviewsDataset(Dataset):
    def __init__(self, dataframe):
        # Convert columns to tensors
        self.users = torch.tensor(dataframe["user_encoded"].values, dtype=torch.long)
        self.restaurants = torch.tensor(dataframe["restaurant_encoded"].values, dtype=torch.long)
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.users[idx], self.restaurants[idx], self.labels[idx]

train_dataset = ReviewsDataset(train_df)
test_dataset = ReviewsDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size = 512, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 1028, shuffle = True)


In [37]:
num_users = reviews_df["user_encoded"].nunique()
num_restaurants = reviews_df["restaurant_encoded"].nunique()
print(num_users, num_restaurants)

131972 3721


In [None]:
model = NCFModel(num_users, num_restaurants)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(model)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0001, weight_decay=1e-5)

train_losses, val_losses = [], []

for epoch in range(20):

    model.train()
    training_loss = 0
    total_samples = 0
    for u, i, r in train_loader:
        u, i, r = u.to(device), i.to(device), r.to(device)

        optimizer.zero_grad()
        logits = model(u, i).squeeze(-1)

        loss = criterion(logits, r)
        training_loss += loss.item() * r.size(0)
        total_samples += r.size(0)

        loss.backward()
        optimizer.step()

    average_training_loss = training_loss / total_samples
    train_losses.append(average_training_loss)
    training_rmse = np.sqrt(average_training_loss)

    if average_training_loss < 1.0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 2e-5  

    model.eval()
    num_correct = 0
    val_loss = 0
    all_preds, all_labels =  [], []

    with torch.no_grad():
        for u, i, r in test_loader:
            u, i, r = u.to(device), i.to(device), r.to(device)
            logits = model(u,i).squeeze(-1)

            all_preds.extend(logits.cpu().numpy())
            all_labels.extend(r.cpu().numpy())

            predictions = (logits >= 4).int()
            actual = (r>= 4).int()
            new_correct = (predictions == actual).sum().item()
            num_correct += new_correct

            loss = criterion(logits, r)
            val_loss += loss.item()*r.size(0)

    average_val_loss = val_loss/len(all_preds)
    val_rmse = np.sqrt(average_val_loss)
    val_losses.append(average_val_loss)
    accuracy = num_correct / len(all_labels)


    print("====================================================================================")
    print(f"Epoch {epoch+1}:")
    print(f"Training Loss: {average_training_loss:.4f}, Validation Loss: {average_val_loss:.4f}")
    print(f"Training RMSE: {training_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
    print(f"Validation Accuracy: {100*accuracy:.4f}%")


# Plotting of training and validation loss curves
plt.plot(range(1, 21), train_losses, label='Training Loss', color='blue')
plt.plot(range(1, 21), val_losses, label='Validation Loss', color='red')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training vs Validation Loss')
plt.grid(True, alpha=0.3)
plt.show()

NCFModel(
  (user_embedding): Embedding(131972, 64)
  (item_embedding): Embedding(3721, 64)
  (mlp): Sequential(
    (0): MLPBlock(
      (block): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.2, inplace=False)
      )
    )
    (1): MLPBlock(
      (block): Sequential(
        (0): Linear(in_features=128, out_features=256, bias=True)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.3, inplace=False)
      )
    )
    (2): MLPBlock(
      (block): Sequential(
        (0): Linear(in_features=256, out_features=512, bias=True)
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.4, inplace=False)
      )
    )
    (3): MLPBlock(
      (block): Sequ

In [19]:
test_df.columns

Index(['user_id', 'name', 'time', 'rating', 'text', 'gmap_id', 'user_encoded',
       'restaurant_encoded', 'label', 'label_binary', 'timestamp'],
      dtype='object')

In [26]:
restaurant_ids_test = test_df['restaurant_encoded'].unique()
restaurant_ids_train = train_df['restaurant_encoded'].unique()

In [28]:
print(len(restaurant_ids_train))
print(len(restaurant_ids_test))


3589
3107


In [None]:
train_df['label_binary'].value_counts()
#total: 329196


label_binary
1    271255
0     57941
Name: count, dtype: int64

In [33]:
torch.save(model.state_dict(), "first_ncf_recommender.pth")