In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os

# --- 1. Utility Functions ---

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Entries whose true value is exactly 0 are left out of the average so
    that no division by zero can occur.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The MAPE over the non-zero targets, multiplied by 100. When there is
        no non-zero target at all, ``0.0`` is returned.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Only positions with a non-zero target can contribute a finite ratio.
    usable = actual != 0
    if not np.any(usable):
        return 0.0  # nothing to average: report 0 by convention

    actual_kept = actual[usable]
    relative_errors = np.abs((actual_kept - predicted[usable]) / actual_kept)
    return relative_errors.mean() * 100

# --- 2. Data Loading and Preprocessing ---

# Load the review file (one JSON record per line).
# IMPORTANT: Adjust this path to where your JSON file is located on your local machine.
df = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)

# Keep only the columns the model needs; work on a copy so `df` stays untouched.
df_processed = df.loc[:, ['user_id', 'business_id', 'stars', 'sentiment_vector']].copy()

# Map the raw user / business identifiers onto contiguous integer ids so they
# can be used as embedding indices.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed['user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed['business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

num_users = user_encoder.classes_.shape[0]
num_businesses = business_encoder.classes_.shape[0]

# Split into train / validation / test using the 70/10/20 ratio from the paper:
# first carve off 20% for testing, then take 1/8 of the remaining 80% (= 10% of
# the total) for validation.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8  # 10% of total data (1/8 of 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# Read the sentiment-vector width from the first row; fall back to 15 when the
# frame has no rows to inspect.
if df_processed.empty:
    sentiment_vector_dim = 15
else:
    sentiment_vector_dim = len(df_processed['sentiment_vector'].iloc[0])

# --- 3. PyTorch Dataset and DataLoader Definition ---
class ReviewDataset(Dataset):
    """Tensor-backed dataset built from a preprocessed review DataFrame.

    The frame must provide the columns ``user_encoded``, ``business_encoded``,
    ``sentiment_vector`` and ``stars``. Every column is converted to a tensor
    once, at construction time.
    """

    def __init__(self, df):
        def column_as_tensor(column, dtype):
            # Convert one scalar-valued column into a tensor of the given dtype.
            return torch.tensor(df[column].values, dtype=dtype)

        self.user_ids = column_as_tensor('user_encoded', torch.long)
        self.business_ids = column_as_tensor('business_encoded', torch.long)

        # Each cell holds a sequence; stack them into one array first.
        # Presumably all vectors share the same length -- verify against the data.
        stacked_vectors = np.array(df['sentiment_vector'].tolist())
        self.sentiment_vectors = torch.tensor(stacked_vectors, dtype=torch.float)

        self.stars = column_as_tensor('stars', torch.float)

    def __len__(self):
        """Return the number of samples (one per star rating)."""
        return self.stars.shape[0]

    def __getitem__(self, idx):
        """Return ``(user_id, business_id, sentiment_vector, stars)`` for ``idx``."""
        fields = (self.user_ids, self.business_ids, self.sentiment_vectors, self.stars)
        return tuple(field[idx] for field in fields)

# --- 4. Model Architecture Definition ---
class CustomerRestaurantInteractionModule(nn.Module):
    """Embed a (user, business) pair and pass the concatenation through an MLP.

    Args:
        num_users: Number of rows in the user embedding table.
        num_businesses: Number of rows in the business embedding table.
        embedding_dim: Width of each embedding vector.
        mlp_dims: Output widths of the successive ``Linear`` + ``ReLU`` layers.
            When empty, the MLP is an identity and the concatenated embeddings
            are returned as they are.

    Attributes:
        output_dim: Width of the feature vector produced by ``forward``.
    """

    def __init__(self, num_users, num_businesses, embedding_dim, mlp_dims):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)

        # Layer widths, starting with the width of the concatenated embeddings.
        widths = [2 * embedding_dim, *mlp_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.mlp = nn.Sequential(*stack)

        # Last hidden width, or the concatenation width when there is no layer.
        self.output_dim = widths[-1]

    def forward(self, user_ids, business_ids):
        """Return interaction features for aligned user / business id tensors."""
        pair_embedding = torch.cat(
            [self.user_embedding(user_ids), self.business_embedding(business_ids)],
            dim=1,
        )
        return self.mlp(pair_embedding)

class ReviewAspectModule(nn.Module):
    """MLP that turns a review sentiment vector into aspect features.

    Args:
        sentiment_vector_dim: Width of the incoming sentiment vectors.
        aspect_mlp_dims: Output widths of the successive ``Linear`` + ``ReLU``
            layers. When empty, the module passes its input through unchanged.

    Attributes:
        output_dim: Width of the feature vector produced by ``forward``.
    """

    def __init__(self, sentiment_vector_dim, aspect_mlp_dims):
        super().__init__()

        # Layer widths, starting with the raw sentiment-vector width.
        widths = [sentiment_vector_dim, *aspect_mlp_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.mlp = nn.Sequential(*stack)

        # Last hidden width, or the input width when there is no layer.
        self.output_dim = widths[-1]

    def forward(self, sentiment_vectors):
        """Return aspect features for a batch of sentiment vectors."""
        return self.mlp(sentiment_vectors)

class AATRec(nn.Module):
    """Rating predictor combining user/business interaction and review aspects.

    The model concatenates the output of a
    ``CustomerRestaurantInteractionModule`` with the output of a
    ``ReviewAspectModule`` and feeds the result through a final MLP that ends
    in a single-unit linear layer (the predicted rating).

    Args:
        num_users: Number of distinct encoded users.
        num_businesses: Number of distinct encoded businesses.
        embedding_dim: Width of the user and business embeddings.
        user_biz_mlp_dims: Hidden widths of the interaction MLP.
        aspect_mlp_dims: Hidden widths of the aspect MLP.
        final_mlp_dims: Hidden widths of the prediction MLP (before the final
            1-unit layer).
        sentiment_vector_dim: Width of the incoming sentiment vectors.
    """

    def __init__(self, num_users, num_businesses, embedding_dim,
                 user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                 sentiment_vector_dim):
        super().__init__()
        self.customer_restaurant_interaction_module = CustomerRestaurantInteractionModule(
            num_users, num_businesses, embedding_dim, user_biz_mlp_dims
        )
        self.review_aspect_module = ReviewAspectModule(
            sentiment_vector_dim, aspect_mlp_dims
        )

        # The prediction head consumes both feature vectors side by side.
        final_input_dim = (
            self.customer_restaurant_interaction_module.output_dim
            + self.review_aspect_module.output_dim
        )

        layers = []
        input_dim = final_input_dim
        for dim in final_mlp_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim
        layers.append(nn.Linear(input_dim, 1))  # final output is the rating (1-dimensional)
        self.prediction_mlp = nn.Sequential(*layers)

    def forward(self, user_ids, business_ids, sentiment_vectors):
        """Predict one rating per sample.

        Args:
            user_ids: Encoded user ids, one per sample.
            business_ids: Encoded business ids, aligned with ``user_ids``.
            sentiment_vectors: Sentiment vectors, one row per sample.

        Returns:
            A 1-D tensor with one predicted rating per sample. The batch
            dimension is always kept, including for a batch of size one.
        """
        user_biz_features = self.customer_restaurant_interaction_module(user_ids, business_ids)
        aspect_features = self.review_aspect_module(sentiment_vectors)
        combined_features = torch.cat((user_biz_features, aspect_features), dim=1)
        predicted_rating = self.prediction_mlp(combined_features)
        # Drop only the trailing size-1 feature dimension. A bare ``squeeze()``
        # would also remove the batch dimension when the batch holds a single
        # sample, yielding a 0-d tensor that breaks ``len()``, ``.tolist()``
        # followed by ``list.extend`` and shape-matched losses in callers.
        return predicted_rating.squeeze(-1)

# --- 5. Device Configuration (GPU Setup) ---
# Run on the GPU whenever CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- 6. Dataset and DataLoader Creation ---
# One tensor-backed dataset per split, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    ReviewDataset(split_df) for split_df in (train_df, val_df, test_df)
)

# --- 7. Apply the Given Best Parameters ---
# These values were previously found via grid search and are now fixed.
best_params = dict(
    aspect_mlp_hidden_dims=[64, 32],
    batch_size=128,
    embedding_dim=64,
    final_mlp_hidden_dims=[32, 16],
    learning_rate=0.001,
    user_biz_mlp_hidden_dims=[128, 64],
)

print("\n" + "="*50)
print(f"Applying pre-selected Best Parameters: {best_params}")
print("="*50)

# --- 8. Final Model Training and Testing (Using Best Parameters) ---
# Unpack the hyperparameters into the module-level names used below.
final_embedding_dim, final_learning_rate, final_batch_size = (
    best_params[key] for key in ('embedding_dim', 'learning_rate', 'batch_size')
)
final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims = (
    best_params[key]
    for key in ('user_biz_mlp_hidden_dims', 'aspect_mlp_hidden_dims', 'final_mlp_hidden_dims')
)

final_model = AATRec(
    num_users=num_users,
    num_businesses=num_businesses,
    embedding_dim=final_embedding_dim,
    user_biz_mlp_dims=final_user_biz_mlp_dims,
    aspect_mlp_dims=final_aspect_mlp_dims,
    final_mlp_dims=final_final_mlp_dims,
    sentiment_vector_dim=sentiment_vector_dim,
)
final_model = final_model.to(device)  # move the model's parameters to the chosen device

final_criterion = nn.MSELoss()
final_optimizer = optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
final_train_loader = DataLoader(train_dataset, batch_size=final_batch_size, shuffle=True)
final_val_loader, final_test_loader = (
    DataLoader(eval_dataset, batch_size=final_batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# Early-stopping configuration.
final_epochs = 50          # upper bound on training epochs
final_patience = 10        # epochs without improvement tolerated before stopping
final_min_delta = 0.0005   # minimum RMSE decrease that counts as an improvement

# Early-stopping state and the checkpoint location for the best weights.
best_final_val_rmse = float('inf')
epochs_no_improve_final = 0
final_model_path = 'final_best_aat_rec_model.pt'

print("\n--- Training Final Model with Best Parameters ---")
for epoch in range(final_epochs):
    # ---- Training phase: one full pass over the training loader ----
    final_model.train()
    total_train_loss = 0
    for user_ids, business_ids, sentiment_vectors, stars in final_train_loader:
        user_ids, business_ids, sentiment_vectors, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

        final_optimizer.zero_grad()
        # Flatten to 1-D so the prediction always has the same shape as
        # `stars`, even if the model returns a 0-d tensor for a batch of one.
        predictions = final_model(user_ids, business_ids, sentiment_vectors).reshape(-1)
        loss = final_criterion(predictions, stars)
        loss.backward()
        final_optimizer.step()
        total_train_loss += loss.item()

    # ---- Validation phase: collect predictions without tracking gradients ----
    final_model.eval()
    val_predictions = []
    val_true_ratings = []
    with torch.no_grad():
        for user_ids, business_ids, sentiment_vectors, stars in final_val_loader:
            user_ids, business_ids, sentiment_vectors, stars = \
                user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

            # reshape(-1) guarantees `.tolist()` yields a list: a 0-d tensor
            # would yield a bare float and make `list.extend` raise TypeError.
            predictions = final_model(user_ids, business_ids, sentiment_vectors).reshape(-1)
            val_predictions.extend(predictions.cpu().tolist())
            val_true_ratings.extend(stars.cpu().tolist())

    current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

    # The reported train loss is the mean of the per-batch mean losses.
    print(f"Final Train Epoch {epoch+1}/{final_epochs}, "
          f"Train Loss: {total_train_loss / len(final_train_loader):.4f}, "
          f"Val RMSE: {current_val_rmse:.4f}")

    # ---- Early stopping: checkpoint on improvement, stop after `final_patience` misses ----
    if current_val_rmse < best_final_val_rmse - final_min_delta:
        best_final_val_rmse = current_val_rmse
        epochs_no_improve_final = 0
        torch.save(final_model.state_dict(), final_model_path)
        print(f"  --> RMSE improved. Model saved: {best_final_val_rmse:.4f}")
    else:
        epochs_no_improve_final += 1
        print(f"  --> RMSE not improved. ({epochs_no_improve_final}/{final_patience})")
        # `>=` rather than `==` so the loop still stops if the counter ever
        # starts above the threshold (e.g. the cell is re-run with stale state).
        if epochs_no_improve_final >= final_patience:
            print(f"Early stopping - No validation RMSE improvement for {final_patience} epochs.")
            break

# --- 9. Final Model Testing ---
print("\n--- Evaluating Final Model on Test Set ---")
# NOTE(review): an existing checkpoint file is loaded even if it was written by
# an earlier run; remove stale files before training if that matters.
if os.path.exists(final_model_path):
    # map_location lets a checkpoint saved from a CUDA model be restored on a
    # CPU-only machine (and vice versa) instead of failing during loading.
    final_model.load_state_dict(torch.load(final_model_path, map_location=device))
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

final_model.eval()
test_predictions = []
true_ratings = []

with torch.no_grad():
    for user_ids, business_ids, sentiment_vectors, stars in final_test_loader:
        user_ids, business_ids, sentiment_vectors, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

        # Flatten to 1-D so `.tolist()` always yields a list; a 0-d result for
        # a final batch of one sample would make `list.extend` raise TypeError.
        predictions = final_model(user_ids, business_ids, sentiment_vectors).reshape(-1)
        test_predictions.extend(predictions.cpu().tolist())
        true_ratings.extend(stars.cpu().tolist())

# Regression metrics over the whole test split.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# --- 10. Top-K Recommendation Function ---
def recommend_topk_for_all_users(model, df_processed, user_encoder, business_encoder, k=5, device='cpu'):
    """Recommend the ``k`` highest-scoring unrated businesses for every user.

    For each user, every business the user has no row for in ``df_processed``
    is scored by ``model``, using the business's mean sentiment vector as the
    review-aspect input.

    Args:
        model: Callable module taking ``(user_ids, business_ids,
            sentiment_vectors)`` tensors and returning one score per row.
        df_processed: DataFrame with ``user_id``, ``business_id`` and
            ``sentiment_vector`` columns.
        user_encoder: Object whose ``transform`` maps raw user ids to the
            integer ids the model was trained with.
        business_encoder: Same as ``user_encoder``, for business ids.
        k: Maximum number of recommendations per user. Values ``<= 0`` yield
            empty lists.
        device: Device on which the input tensors are created.

    Returns:
        Dict mapping each raw ``user_id`` to a list of raw ``business_id``
        values ordered from highest to lowest predicted score. Users with no
        unrated business map to an empty list.
    """
    model.eval()
    user_ids_unique = df_processed['user_id'].unique()
    business_ids_unique = df_processed['business_id'].unique()

    # Mean sentiment vector per business.
    sentiment_dict = df_processed.groupby('business_id')['sentiment_vector'].apply(
        lambda x: np.mean(x.tolist(), axis=0)
    ).to_dict()

    # Candidate pool: businesses that have a sentiment vector, kept in order
    # of first appearance.
    candidate_biz = [b for b in business_ids_unique if b in sentiment_dict]
    if not candidate_biz:
        return {user_id: [] for user_id in user_ids_unique}

    # Everything that does not depend on the user is computed once up front
    # instead of once per user.
    position_of = {b: i for i, b in enumerate(candidate_biz)}
    biz_encoded_all = torch.tensor(
        business_encoder.transform(candidate_biz), dtype=torch.long
    ).to(device)
    sentiment_all = torch.tensor(
        np.array([sentiment_dict[b] for b in candidate_biz]), dtype=torch.float
    ).to(device)

    # Single pass over the rows: candidate positions already rated per user.
    # This replaces a full DataFrame scan for every user.
    rated_positions = {}
    for user_id, business_id in zip(df_processed['user_id'], df_processed['business_id']):
        position = position_of.get(business_id)
        if position is not None:
            rated_positions.setdefault(user_id, set()).add(position)

    encoded_users = user_encoder.transform(user_ids_unique)
    num_candidates = len(candidate_biz)

    recommendations = {}
    for user_id, encoded_user in zip(user_ids_unique, encoded_users):
        keep = np.ones(num_candidates, dtype=bool)
        seen = rated_positions.get(user_id)
        if seen:
            keep[list(seen)] = False
        unrated_idx = np.flatnonzero(keep)

        if unrated_idx.size == 0:
            recommendations[user_id] = []
            continue

        idx_tensor = torch.as_tensor(unrated_idx, dtype=torch.long).to(device)
        user_tensor = torch.full(
            (unrated_idx.size,), int(encoded_user), dtype=torch.long
        ).to(device)

        with torch.no_grad():
            # reshape(-1) keeps the scores 1-D even if the model returns a 0-d
            # tensor for a single candidate, so counting and topk are safe.
            predicted_ratings = model(
                user_tensor, biz_encoded_all[idx_tensor], sentiment_all[idx_tensor]
            ).reshape(-1)

        actual_k = min(k, predicted_ratings.numel())
        if actual_k > 0:
            topk_indices = torch.topk(predicted_ratings, actual_k).indices.tolist()
            # Map positions in the unrated subset back to raw business ids.
            recommendations[user_id] = [candidate_biz[unrated_idx[i]] for i in topk_indices]
        else:
            recommendations[user_id] = []

    return recommendations

print("\n--- Generating Top-5 Recommendations for Users ---")
topk_result = recommend_topk_for_all_users(
    final_model,
    df_processed,
    user_encoder,
    business_encoder,
    k=5,
    device=device,
)

# Show a small sample of the output: only the first 5 users.
preview = list(topk_result.items())[:5]
for user_id, recs in preview:
    print(f"User {user_id} -> Recommended Businesses: {recs}")

# Remove the temporary models directory if an earlier version of this script
# left one behind.
temp_models_present = os.path.exists('temp_models')
if temp_models_present:
    import shutil
    shutil.rmtree('temp_models')
    print("\nCleaned up 'temp_models' directory.")

전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)
Using device: cuda

Applying pre-selected Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters ---
Final Train Epoch 1/50, Train Loss: 0.6857, Val RMSE: 0.7278
  --> RMSE improved. Model saved: 0.7278
Final Train Epoch 2/50, Train Loss: 0.4749, Val RMSE: 0.6924
  --> RMSE improved. Model saved: 0.6924
Final Train Epoch 3/50, Train Loss: 0.4384, Val RMSE: 0.6936
  --> RMSE not improved. (1/10)
Final Train Epoch 4/50, Train Loss: 0.4106, Val RMSE: 0.6934
  --> RMSE not improved. (2/10)
Final Train Epoch 5/50, Train Loss: 0.3860, Val RMSE: 0.6977
  --> RMSE not improved. (3/10)
Final Train Epoch 6/50, Train Loss: 0.3623, Val RMSE: 0.7046
  --> RMSE not improved. (4/10)
Final Train Epoch 7/50, Train Loss: 0.3387, V