In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 예시 데이터프레임 구성
df = pd.read_json("../data/review_business_5up_5aspect_3sentiment_vectorized_clean.json", lines=True)

# 2. Leave-One-Out 방식으로 데이터 분할
train_df = df.groupby('user_id', group_keys=False).apply(lambda x: x.iloc[:-1])
test_df = df.groupby('user_id', group_keys=False).apply(lambda x: x.iloc[-1:])

# 3. 학습 데이터 기반 피벗 테이블 생성
pivot = train_df.pivot_table(index='user_id', columns='business_id', values='stars')
pivot_filled = pivot.fillna(0)

# 4. 사용자 유사도 계산 (코사인 유사도)
user_similarity = cosine_similarity(pivot_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=pivot_filled.index, columns=pivot_filled.index)

# 5. 평점 예측 함수
def predict_rating(user_id, business_id):
    if user_id not in pivot_filled.index or business_id not in pivot_filled.columns:
        return np.nan
    sim_scores = user_similarity_df.loc[user_id]
    biz_ratings = pivot_filled[business_id]
    mask = biz_ratings.notna()
    if mask.sum() == 0:
        return np.nan
    weighted_sum = (sim_scores[mask] * biz_ratings[mask]).sum()
    sim_sum = np.abs(sim_scores[mask]).sum()
    return weighted_sum / sim_sum if sim_sum != 0 else np.nan


  train_df = df.groupby('user_id').apply(lambda x: x.iloc[:-1]).reset_index(drop=True)
  test_df = df.groupby('user_id').apply(lambda x: x.iloc[-1:]).reset_index(drop=True)


In [2]:
# 6. 테스트셋에 대해 예측 수행
test_df['pred'] = test_df.apply(lambda row: predict_rating(row['user_id'], row['business_id']), axis=1)

# 7. 성능 평가
valid = test_df.dropna(subset=['pred'])
rmse = np.sqrt(mean_squared_error(valid['stars'], valid['pred']))
mae = mean_absolute_error(valid['stars'], valid['pred'])

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 3.9843
MAE: 3.7805


In [3]:
# 8. Precision@k 계산 함수
def precision_at_k(user_id, k=5, threshold=4.0):
    if user_id not in pivot_filled.index:
        return np.nan
    sim_scores = user_similarity_df.loc[user_id]
    neighbors = sim_scores.sort_values(ascending=False)[1:k+1].index
    neighbor_ratings = pivot_filled.loc[neighbors]
    mean_scores = neighbor_ratings.mean().sort_values(ascending=False)
    top_k_items = mean_scores.head(k).index
    # 실제로 사용자가 좋아한 아이템이 몇 개 있는지
    liked_items = train_df[(train_df['user_id'] == user_id) & (train_df['stars'] >= threshold)]['business_id'].values
    hits = np.isin(top_k_items, liked_items).sum()
    return hits / k

# 9. 전체 평균 Precision@k 계산
precision_list = [precision_at_k(uid) for uid in test_df['user_id'].unique()]
precision_at_5 = np.nanmean(precision_list)

# 10. 출력
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Precision@5: {precision_at_5:.4f}")

RMSE: 3.9843
MAE: 3.7805
Precision@5: 0.5318


In [5]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict

# 1. 데이터 로딩
df = pd.read_json("../data/review_business_5up_5aspect_3sentiment_vectorized_clean.json", lines=True)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'stars']], reader)

# 2. 학습/테스트 분할
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 3. SVD 모델 학습
model = SVD()
model.fit(trainset)

# 4. 테스트셋 예측
predictions = model.test(testset)

# 5. 정확도 지표 출력
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# 6. Precision@k 계산 (tqdm 포함)
def precision_at_k(predictions, k=5, threshold=4.0):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = []
    for uid in tqdm(user_est_true, desc="Calculating Precision@k"):
        user_ratings = sorted(user_est_true[uid], key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]
        hits = sum((true_r >= threshold) for (_, true_r) in top_k)
        precisions.append(hits / k)

    return np.mean(precisions)

# 7. Precision@5 평가
p_at_5 = precision_at_k(predictions, k=5)

# 8. 출력
print(f"\n✅ SVD 추천 시스템 성능:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Precision@5: {p_at_5:.4f}")

RMSE: 1.0492
MAE:  0.8182


Calculating Precision@k: 100%|██████████| 23511/23511 [00:00<00:00, 235116.95it/s]


✅ SVD 추천 시스템 성능:
RMSE: 1.0492
MAE: 0.8182
Precision@5: 0.3772



