In [None]:
# Mount Google Drive in the Colab runtime; later cells read their input files
# from paths under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# ------------------ 2. 라이브러리 임포트 ------------------
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# ------------------ 3. GloVe 로딩 ------------------
def load_glove_embeddings(glove_path, embedding_dim=100):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are parsed as the vector components.

    Args:
        glove_path: Path to the GloVe text file (opened as UTF-8).
        embedding_dim: Accepted for backward compatibility only. The vector
            length comes from each line itself and is not checked against
            this value.

    Returns:
        dict mapping each token to a 1-D float32 numpy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at EOF): indexing
                # values[0] would raise IndexError, so skip it.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def preprocess(text):
    """Tokenize text: lowercase it, delete every character that is not
    a-z, 0-9 or whitespace, then split on whitespace.

    Returns:
        list of token strings (empty when nothing survives the filter).
    """
    return re.sub(r"[^a-z0-9\s]", "", text.lower()).split()

def text_to_glove_sequence(text, glove_embeddings, max_len=300, embed_dim=100):
    """Convert a text into a fixed-size matrix of GloVe vectors.

    The text is tokenized with ``preprocess``; tokens missing from
    ``glove_embeddings`` are dropped. The first ``max_len`` known tokens fill
    the leading rows, and any remaining rows stay zero (padding).

    Args:
        text: Raw review text.
        glove_embeddings: Mapping from token to a vector of length ``embed_dim``.
        max_len: Number of rows in the output.
        embed_dim: Number of columns in the output.

    Returns:
        numpy array of shape (max_len, embed_dim), always float32. (Padding
        with float64 zeros used to promote short texts to float64 while long
        texts stayed float32.)
    """
    sequence = np.zeros((max_len, embed_dim), dtype=np.float32)
    row = 0
    for token in preprocess(text):
        if row >= max_len:
            # Buffer is full; no need to look up tokens that would be truncated.
            break
        vector = glove_embeddings.get(token)
        if vector is not None:
            sequence[row] = vector
            row += 1
    return sequence

# ------------------ 4. 데이터 로딩 + 리뷰 임베딩 ------------------
# Read the review file line by line; each line is parsed as one JSON record.
data = []
with open('/content/drive/MyDrive/review_business_5up_with_text.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

print("Loading GloVe...")
glove_path = '/content/drive/MyDrive/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_path)

# One embedding matrix per review, stored as a column of numpy arrays.
# NOTE(review): every review's matrix is held in memory at the same time here;
# a later cell in this notebook streams the same file in chunks instead —
# confirm whether this cell is still needed.
print("Embedding review texts...")
df['glove_sequence'] = [text_to_glove_sequence(t, glove_embeddings) for t in tqdm(df['text'])]

# Element-wise mean of the review matrices, grouped per user and per business.
user_group = df.groupby('user_id')['glove_sequence'].apply(lambda x: np.mean(np.stack(x), axis=0))
item_group = df.groupby('business_id')['glove_sequence'].apply(lambda x: np.mean(np.stack(x), axis=0))

# Integer index for each user / business, in the order of the grouped index.
user2idx = {u: i for i, u in enumerate(user_group.index)}
item2idx = {b: i for i, b in enumerate(item_group.index)}
# Keep only rows whose ids are present in both grouped indexes.
# NOTE(review): presumably this only removes rows with missing ids, since the
# indexes were built from this same frame — confirm.
df = df[df['user_id'].isin(user2idx) & df['business_id'].isin(item2idx)]

# Look up the averaged matrix for every remaining row so that the user inputs,
# item inputs and ratings are index-aligned.
# NOTE(review): np.stack copies one full averaged matrix per review row, so
# memory use grows with the number of reviews, not the number of users/items.
user_embeddings = np.stack([user_group[uid] for uid in df['user_id']])
item_embeddings = np.stack([item_group[iid] for iid in df['business_id']])
ratings = df['stars'].values

# ------------------ 5. Dataset 정의 ------------------
class DAttnDataset(Dataset):
    """Index-aligned (user input, item input, rating) samples.

    The three containers are stored as given; conversion to float32 tensors
    happens lazily, one sample at a time, in ``__getitem__``.
    """

    def __init__(self, user_reviews, item_reviews, ratings):
        self.user_reviews, self.item_reviews, self.ratings = (
            user_reviews,
            item_reviews,
            ratings,
        )

    def __len__(self):
        # The ratings container defines the number of samples.
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the idx-th (user, item, rating) triple as float32 tensors."""
        sources = (self.user_reviews, self.item_reviews, self.ratings)
        return tuple(
            torch.tensor(source[idx], dtype=torch.float32) for source in sources
        )

# ------------------ 6. 모델 정의 ------------------
class LocalAttention(nn.Module):
    """Convolution over the sequence followed by softmax-attention pooling.

    ``forward`` takes ``x`` shaped (batch, seq_len, embed_dim) and returns a
    tensor shaped (batch, num_filters).
    """

    def __init__(self, embed_dim=100, num_filters=200, filter_size=3):
        super().__init__()
        half_window = filter_size // 2
        self.conv = nn.Conv1d(embed_dim, num_filters, filter_size, padding=half_window)
        # Maps each position's filter responses to a single attention score.
        self.attn_fc = nn.Linear(num_filters, 1)

    def forward(self, x):
        # Conv1d wants channels on dim 1, so swap in and back out around it.
        features = torch.tanh(self.conv(x.transpose(1, 2))).transpose(1, 2)
        # Normalise the per-position scores across the sequence dimension.
        weights = torch.softmax(self.attn_fc(features), dim=1)
        # Attention-weighted sum over positions.
        return (features * weights).sum(dim=1)

class GlobalAttention(nn.Module):
    """Multi-width convolutions with max-over-time pooling and a learned gate.

    ``forward`` takes ``x`` shaped (batch, seq_len, embed_dim) and returns a
    tensor shaped (batch, num_filters * len(filter_sizes)).

    Args:
        embed_dim: Size of each input embedding vector.
        num_filters: Output channels of every convolution.
        filter_sizes: Kernel widths, one convolution per entry. The sequence
            must be at least as long as the largest width (no padding is used).
    """

    def __init__(self, embed_dim=100, num_filters=100, filter_sizes=(2, 3, 4)):
        super().__init__()
        # Immutable default plus a local copy: no list is shared across instances.
        filter_sizes = tuple(filter_sizes)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, fs) for fs in filter_sizes
        ])
        self.attn_fc = nn.Linear(num_filters * len(filter_sizes), 1)

    def forward(self, x):
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq_len)
        conv_outs = []
        for conv in self.convs:
            c = torch.tanh(conv(x))
            # Max over the whole remaining sequence axis.
            c = F.max_pool1d(c, kernel_size=c.shape[2])
            conv_outs.append(c.squeeze(2))
        merged = torch.cat(conv_outs, dim=1)
        # attn_fc yields a single score per sample. A softmax over that
        # size-1 axis is always exactly 1.0, which made the attention a no-op
        # and gave attn_fc a zero gradient. A sigmoid turns the score into a
        # real (0, 1) gate while keeping parameter and output shapes unchanged.
        gate = torch.sigmoid(self.attn_fc(merged))
        return merged * gate

class DAttnRecommender(nn.Module):
    """Rating regressor built from local and global attention features.

    User and item inputs each pass through their own LocalAttention and
    GlobalAttention modules; the four feature vectors are concatenated and
    fed through two ReLU + dropout layers to a single output unit.

    ``forward`` returns one predicted rating per sample, shaped (batch,).
    """

    def __init__(self, embed_dim=100):
        super().__init__()
        self.user_lattn = LocalAttention(embed_dim)
        self.item_lattn = LocalAttention(embed_dim)
        self.user_gattn = GlobalAttention(embed_dim)
        self.item_gattn = GlobalAttention(embed_dim)

        # Derive the concatenated width from the sub-modules instead of
        # repeating their default sizes as a magic number; with the defaults
        # this is still (200 + 100 * 3) * 2 = 1000.
        local_dim = self.user_lattn.conv.out_channels + self.item_lattn.conv.out_channels
        global_dim = self.user_gattn.attn_fc.in_features + self.item_gattn.attn_fc.in_features
        feature_dim = local_dim + global_dim

        self.fc1 = nn.Linear(feature_dim, 500)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(500, 50)
        self.dropout2 = nn.Dropout(0.5)
        self.output = nn.Linear(50, 1)

    def forward(self, user_reviews, item_reviews):
        u_l = self.user_lattn(user_reviews)
        i_l = self.item_lattn(item_reviews)
        u_g = self.user_gattn(user_reviews)
        i_g = self.item_gattn(item_reviews)
        # Concatenation order must match how feature_dim is summed above.
        x = torch.cat([u_l, u_g, i_l, i_g], dim=1)
        x = self.dropout1(F.relu(self.fc1(x)))
        x = self.dropout2(F.relu(self.fc2(x)))
        # (batch, 1) -> (batch,) so the output lines up with the rating vector.
        return self.output(x).squeeze(1)

# ------------------ 7. 학습 및 평가 함수 ------------------
def evaluate_model(model, dataloader, device):
    """Run the model over a dataloader and report regression metrics.

    Puts the model in eval mode, predicts every batch without gradients,
    prints MSE / RMSE / MAE / MAPE and returns them.

    Args:
        model: Module called as ``model(user_batch, item_batch)``.
        dataloader: Iterable yielding (user, item, rating) tensor triples.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(mse, rmse, mae, mape)``; ``mape`` is a percentage.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for users, items, stars in dataloader:
            users = users.to(device)
            items = items.to(device)
            stars = stars.to(device)
            scores = model(users, items)
            predicted.extend(scores.cpu().numpy())
            actual.extend(stars.cpu().numpy())

    targets = np.array(actual)
    preds = np.array(predicted)

    mse = mean_squared_error(targets, preds)
    mae = mean_absolute_error(targets, preds)
    rmse = np.sqrt(mse)
    # The small epsilon keeps the division finite if a target is zero.
    relative_error = np.abs((targets - preds) / (targets + 1e-10))
    mape = np.mean(relative_error) * 100
    print(f"\n✅ [D-Attn] 최종 테스트 평가 지표:\n   - MSE  : {mse:.4f}\n   - RMSE : {rmse:.4f}\n   - MAE  : {mae:.4f}\n   - MAPE : {mape:.2f}%")
    return mse, rmse, mae, mape

def train_dattn(model, train_loader, val_loader, device, lr=1e-3, epochs=50, patience=5, min_delta=0.001):
    """Train the model with Adam + MSE loss and early stopping on validation RMSE.

    After every epoch the validation RMSE / MAE / MAPE are printed. Whenever
    the RMSE improves on the best value by more than ``min_delta`` the weights
    are written to ``best_dattn_model.pt`` in the current working directory.
    Training stops once ``patience`` consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(user_batch, item_batch)``.
        train_loader: Yields (user, item, rating) training batches.
        val_loader: Yields (user, item, rating) validation batches.
        device: Device used for the model and all batches.
        lr: Adam learning rate.
        epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        min_delta: Minimum RMSE decrease that counts as an improvement.

    Returns:
        The same model object, holding the best checkpoint written during
        this call (or the final weights if no checkpoint was written).
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    best_val_rmse = float('inf')
    epochs_no_improve = 0
    model_path = 'best_dattn_model.pt'
    # Only reload a checkpoint written by this call. A file with the same name
    # left over from an earlier run may hold weights for a different model.
    checkpoint_saved = False

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for u, i, r in train_loader:
            u, i, r = u.to(device), i.to(device), r.to(device)
            optimizer.zero_grad()
            output = model(u, i)
            loss = criterion(output, r)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for u, i, r in val_loader:
                u, i, r = u.to(device), i.to(device), r.to(device)
                output = model(u, i)
                val_preds.extend(output.cpu().numpy())
                val_true.extend(r.cpu().numpy())

        val_rmse = np.sqrt(mean_squared_error(val_true, val_preds))
        val_mae = mean_absolute_error(val_true, val_preds)
        val_mape = np.mean(np.abs((np.array(val_true) - np.array(val_preds)) / (np.array(val_true) + 1e-10))) * 100

        print(f"\nEpoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | "
              f"Val RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}, MAPE: {val_mape:.2f}%")

        if val_rmse < best_val_rmse - min_delta:
            best_val_rmse = val_rmse
            epochs_no_improve = 0
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print(f"  --> 개선됨. 모델 저장됨 (RMSE: {best_val_rmse:.4f})")
        else:
            epochs_no_improve += 1
            print(f"  --> 개선 없음. ({epochs_no_improve}/{patience})")
            # >= rather than == so that patience <= 0 still terminates.
            if epochs_no_improve >= patience:
                print("조기 종료 발생.")
                break

    if checkpoint_saved:
        # map_location keeps the reload on the device the model already lives on.
        model.load_state_dict(torch.load(model_path, map_location=device))
        print(f"최적 모델 로드 완료: {model_path}")
    return model

# ------------------ 8. 데이터 분할 및 학습 실행 ------------------
# Hold out 20% of the rows as the test set.
(
    X_user_trainval, X_user_test,
    X_item_trainval, X_item_test,
    y_trainval, y_test,
) = train_test_split(
    user_embeddings, item_embeddings, ratings,
    test_size=0.2, random_state=42,
)

# Take 12.5% of the remainder (roughly 10% of all rows) for validation.
(
    X_user_train, X_user_val,
    X_item_train, X_item_val,
    y_train, y_val,
) = train_test_split(
    X_user_trainval, X_item_trainval, y_trainval,
    test_size=0.125, random_state=42,
)

train_dataset, val_dataset, test_dataset = (
    DAttnDataset(X_user_train, X_item_train, y_train),
    DAttnDataset(X_user_val, X_item_val, y_val),
    DAttnDataset(X_user_test, X_item_test, y_test),
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=256) for held_out in (val_dataset, test_dataset)
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DAttnRecommender()
model = train_dattn(model, train_loader, val_loader, device)
evaluate_model(model, test_loader, device)


Loading GloVe...
Embedding review texts...


 10%|█         | 46031/447796 [00:43<03:34, 1869.83it/s]

23232

In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

# GloVe loading function
def load_glove_embeddings(glove_path, embedding_dim=100):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are parsed as the vector components.

    Args:
        glove_path: Path to the GloVe text file (opened as UTF-8).
        embedding_dim: Accepted for backward compatibility only. The vector
            length comes from each line itself and is not checked against
            this value.

    Returns:
        dict mapping each token to a 1-D float32 numpy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at EOF): indexing
                # values[0] would raise IndexError, so skip it.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

# Preprocessing + GloVe embedding functions
def preprocess(text):
    """Tokenize text: lowercase it, delete every character that is not
    a-z, 0-9 or whitespace, then split on whitespace.

    Returns:
        list of token strings (empty when nothing survives the filter).
    """
    return re.sub(r"[^a-z0-9\s]", "", text.lower()).split()

def text_to_glove_sequence(text, glove_embeddings, max_len=300, embed_dim=100):
    """Convert a text into a fixed-size float16 matrix of GloVe vectors.

    The text is tokenized with ``preprocess``; tokens missing from
    ``glove_embeddings`` are dropped. The first ``max_len`` known tokens fill
    the leading rows, and any remaining rows stay zero (padding).

    Args:
        text: Raw review text.
        glove_embeddings: Mapping from token to a vector of length ``embed_dim``.
        max_len: Number of rows in the output.
        embed_dim: Number of columns in the output.

    Returns:
        numpy array of shape (max_len, embed_dim) with dtype float16.
    """
    # Fill the float16 buffer directly; this avoids stacking a float64
    # temporary just to down-cast it afterwards.
    sequence = np.zeros((max_len, embed_dim), dtype=np.float16)
    row = 0
    for token in preprocess(text):
        if row >= max_len:
            # Buffer is full; no need to look up tokens that would be truncated.
            break
        vector = glove_embeddings.get(token)
        if vector is not None:
            sequence[row] = vector  # cast to float16 on assignment
            row += 1
    return sequence

# Path configuration
json_path = '/content/drive/MyDrive/review_business_5up_with_text.json'
glove_path = '/content/drive/MyDrive/glove.6B.100d.txt'
save_dir = '/content/drive/MyDrive/glove_chunks'
os.makedirs(save_dir, exist_ok=True)

# Load the GloVe embeddings
print("Loading GloVe...")
glove_embeddings = load_glove_embeddings(glove_path)

# Read the review JSON line by line and process it in fixed-size chunks so
# that only one chunk of embedded reviews is held in memory at a time.
chunk_size = 10000
chunk_idx = 0
buffer = []


def _save_chunk(records, idx):
    """Embed one chunk of review records and write it to ``save_dir``.

    Writes ``glove_chunk_<idx>.npy`` (the embedded texts) and
    ``meta_chunk_<idx>.csv`` (user_id, business_id, stars), row-aligned.
    Returns the chunk's DataFrame and its list of embedded sequences.
    """
    chunk_df = pd.DataFrame(records)
    print(f"[Chunk {idx}] Embedding {len(chunk_df)} reviews...")
    seqs = [text_to_glove_sequence(t, glove_embeddings) for t in tqdm(chunk_df['text'])]
    np.save(os.path.join(save_dir, f'glove_chunk_{idx}.npy'), seqs)
    chunk_df[['user_id', 'business_id', 'stars']].to_csv(os.path.join(save_dir, f'meta_chunk_{idx}.csv'), index=False)
    return chunk_df, seqs


with open(json_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank line is not a JSON record; json.loads would raise on it.
            continue
        buffer.append(json.loads(line))
        if len(buffer) == chunk_size:
            df, glove_seqs = _save_chunk(buffer, chunk_idx)
            buffer = []
            chunk_idx += 1

# Process the reviews left over after the last full chunk
if buffer:
    df, glove_seqs = _save_chunk(buffer, chunk_idx)


Loading GloVe...
[Chunk 0] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1463.64it/s]


[Chunk 1] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1536.47it/s]


[Chunk 2] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:18<00:00, 533.75it/s]


[Chunk 3] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1417.70it/s]


[Chunk 4] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1325.72it/s]


[Chunk 5] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1373.41it/s]


[Chunk 6] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:10<00:00, 977.00it/s]


[Chunk 7] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1563.57it/s]


[Chunk 8] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1258.93it/s]


[Chunk 9] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1468.26it/s]


[Chunk 10] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:08<00:00, 1140.70it/s]


[Chunk 11] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1253.32it/s]


[Chunk 12] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1332.57it/s]


[Chunk 13] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1306.54it/s]


[Chunk 14] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1405.83it/s]


[Chunk 15] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:11<00:00, 894.30it/s]


[Chunk 16] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1402.66it/s]


[Chunk 17] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:11<00:00, 897.46it/s]


[Chunk 18] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1665.01it/s]


[Chunk 19] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:10<00:00, 916.40it/s]


[Chunk 20] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1894.51it/s]


[Chunk 21] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1408.72it/s]


[Chunk 22] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1426.03it/s]


[Chunk 23] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1886.99it/s]


[Chunk 24] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1428.93it/s]


[Chunk 25] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1422.68it/s]


[Chunk 26] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1852.46it/s]


[Chunk 27] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1902.09it/s]


[Chunk 28] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1423.25it/s]


[Chunk 29] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1395.63it/s]


[Chunk 30] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:06<00:00, 1512.66it/s]


[Chunk 31] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:07<00:00, 1405.35it/s]


[Chunk 32] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1889.80it/s]


[Chunk 33] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1887.22it/s]


[Chunk 34] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1867.33it/s]


[Chunk 35] Embedding 10000 reviews...


100%|██████████| 10000/10000 [00:05<00:00, 1828.27it/s]
