In [None]:
# Attach Google Drive to the Colab runtime; the data file used below is read
# from underneath this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import numpy as np

# Dataset definition
class RatingDataset(Dataset):
    """Rating samples loaded from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON object, from which
    the keys ``user_id``, ``business_id``, ``stars`` and ``sentiment_vector``
    are read. Raw user and business ids are mapped to integer indices with a
    ``LabelEncoder`` each, so they can be used to index embedding tables.

    Attributes:
        data: List of the parsed JSON records.
        user_ids_raw / item_ids_raw: Ids exactly as they appear in the file.
        user_encoder / item_encoder: Fitted ``LabelEncoder`` instances.
        user_ids / item_ids: ``torch.long`` tensors of encoded ids.
        ratings: ``torch.float32`` tensor built from ``stars``.
        sentiments: ``torch.float32`` tensor built from ``sentiment_vector``
            (presumably one fixed-length vector per record -- confirm
            against the data file).
        num_users / num_items: Number of distinct users / businesses.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and build the id encoders and tensors.

        Args:
            filepath: Path to a UTF-8 file with one JSON object per line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the required keys.
        """
        self.data = self._read_records(filepath)

        self.user_ids_raw = [d["user_id"] for d in self.data]
        self.item_ids_raw = [d["business_id"] for d in self.data]
        self.ratings = torch.tensor([d["stars"] for d in self.data], dtype=torch.float32)
        self.sentiments = torch.tensor([d["sentiment_vector"] for d in self.data], dtype=torch.float32)

        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        self.user_ids = torch.tensor(self.user_encoder.fit_transform(self.user_ids_raw), dtype=torch.long)
        self.item_ids = torch.tensor(self.item_encoder.fit_transform(self.item_ids_raw), dtype=torch.long)

        self.num_users = len(self.user_encoder.classes_)
        self.num_items = len(self.item_encoder.classes_)

    @staticmethod
    def _read_records(filepath):
        """Parse a JSON Lines file into a list of objects, skipping blank lines."""
        records = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Blank lines would make json.loads raise, so they are skipped
                # instead of aborting the whole load.
                if not line:
                    continue
                records.append(json.loads(line))
        return records

    def __len__(self):
        """Return the number of samples."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(user_id, item_id, sentiment, rating)`` for sample ``idx``."""
        return self.user_ids[idx], self.item_ids[idx], self.sentiments[idx], self.ratings[idx]

# Model definition
class RatingPredictor(nn.Module):
    """Predict a rating from a user id, an item id and a sentiment vector.

    The user and item ids are looked up in separate embedding tables; both
    embeddings are concatenated with the sentiment vector and passed through
    an MLP (``2 * embedding_dim + sentiment_dim -> 64 -> 32 -> 1``) with ReLU
    activations between the linear layers.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Size of each user / item embedding.
        sentiment_dim: Length of the sentiment vector fed to the MLP.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, sentiment_dim=15):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2 + sentiment_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, user_ids, item_ids, sentiments):
        """Return one predicted rating per sample.

        Args:
            user_ids: Integer tensor of shape ``(batch,)``.
            item_ids: Integer tensor of shape ``(batch,)``.
            sentiments: Float tensor of shape ``(batch, sentiment_dim)``.

        Returns:
            Float tensor of shape ``(batch,)``.
        """
        user_vec = self.user_embedding(user_ids)
        item_vec = self.item_embedding(item_ids)
        x = torch.cat([user_vec, item_vec, sentiments], dim=1)
        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sample, yielding
        # a 0-dim tensor that no longer lines up with the target tensor.
        return self.mlp(x).squeeze(-1)

# File path
filepath = "/content/drive/MyDrive/review_business_5up_5aspect_3sentiment_vectorized_clean.json"
dataset = RatingDataset(filepath)

# Seed torch so the split, weight initialisation and batch shuffling are
# repeatable from one run to the next.
SEED = 42
torch.manual_seed(SEED)

# Hold out part of the data for evaluation. Scoring the model on the very rows
# it was trained on reports how well it fits the training set, not how well it
# predicts unseen ratings.
TEST_FRACTION = 0.2
num_test = int(len(dataset) * TEST_FRACTION)
num_train = len(dataset) - num_test
train_set, test_set = torch.utils.data.random_split(
    dataset,
    [num_train, num_test],
    generator=torch.Generator().manual_seed(SEED),
)
# `dataloader` remains the loader that drives training.
dataloader = DataLoader(train_set, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=64, shuffle=False)

# Model, loss function, optimizer
model = RatingPredictor(dataset.num_users, dataset.num_items)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
# NOTE(review): the output saved with this cell logs 10 epochs, so it was
# produced by a different run than this code; re-run to refresh the numbers.
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    for user, item, sentiment, rating in loop:
        optimizer.zero_grad()
        # Flatten so predictions are always 1-D, matching `rating`, even when
        # the final batch holds a single sample.
        pred = model(user, item, sentiment).reshape(-1)
        loss = loss_fn(pred, rating)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    print(f"[Epoch {epoch+1}] Loss: {running_loss / len(dataloader):.4f}")

# Evaluation on the held-out split only
model.eval()
all_preds, all_truths = [], []
with torch.no_grad():
    for user, item, sentiment, rating in tqdm(test_dataloader, desc="Evaluating"):
        # reshape(-1) guarantees tolist() returns a list, which extend() needs.
        pred = model(user, item, sentiment).reshape(-1)
        all_preds.extend(pred.tolist())
        all_truths.extend(rating.tolist())

mae = mean_absolute_error(all_truths, all_preds)
mse = mean_squared_error(all_truths, all_preds)
rmse = np.sqrt(mse)

print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}")




[Epoch 1] Loss: 0.6152




[Epoch 2] Loss: 0.4874




[Epoch 3] Loss: 0.4474




[Epoch 4] Loss: 0.4139




[Epoch 5] Loss: 0.3864




[Epoch 6] Loss: 0.3614




[Epoch 7] Loss: 0.3379




[Epoch 8] Loss: 0.3150




[Epoch 9] Loss: 0.2932




[Epoch 10] Loss: 0.2722


Evaluating: 100%|██████████| 6997/6997 [00:11<00:00, 606.74it/s]


MAE: 0.3614, RMSE: 0.4871
