In [2]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, SVD
from sklearn.preprocessing import normalize
from tqdm import tqdm

# Path settings: input files for the training and test reviews
train_file, test_file = "train_80.json", "test_20.json"

# 1. 감성 벡터 포함된 리뷰 로드
def load_reviews_with_vectors(path):
    """Load review records from a JSON Lines file and attach their vectors.

    Every non-blank line of ``path`` is parsed as one JSON object. The
    object's ``"sentiment_vector"`` value is converted with ``np.array`` and
    stored under the additional key ``"vector"``; the original key is kept.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"sentiment_vector"`` key.
    """
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            obj = json.loads(line)
            obj["vector"] = np.array(obj["sentiment_vector"])
            data.append(obj)
    return data

# 2. 평점만 추출
def extract_ratings(data):
    """Return one ``(user_id, business_id, stars)`` tuple per review record.

    Args:
        data: Iterable of mappings that each provide the keys ``"user_id"``,
            ``"business_id"`` and ``"stars"``.

    Returns:
        list: The tuples, in the order the records were given.
    """
    triples = []
    for review in data:
        triple = (review["user_id"], review["business_id"], review["stars"])
        triples.append(triple)
    return triples

# Load both review files first (train, then test) ...
train_reviews, test_reviews = (
    load_reviews_with_vectors(split_path) for split_path in (train_file, test_file)
)

# ... then reduce each of them to (user_id, business_id, stars) tuples.
train_ratings, test_ratings = (
    extract_ratings(split_reviews) for split_reviews in (train_reviews, test_reviews)
)

# 3. Train the SVD model
# Put the training ratings into a DataFrame with the three columns that are
# handed to Dataset.load_from_df, and declare the rating scale as 1 to 5.
train_df = pd.DataFrame(
    data=train_ratings,
    columns=["user_id", "business_id", "stars"],
)
reader = Reader(rating_scale=(1, 5))
train_dataset = Dataset.load_from_df(df=train_df, reader=reader)

# Every training rating is used for fitting (no hold-out split here).
trainset = train_dataset.build_full_trainset()

svd = SVD(n_factors=20)
svd.fit(trainset)

# 4. Compute sentiment-vector embeddings
# Group the training vectors twice: once per user, once per business.
user_vecs, biz_vecs = defaultdict(list), defaultdict(list)

for r in train_reviews:
    user_vecs[r["user_id"]].append(r["vector"])
for r in train_reviews:
    biz_vecs[r["business_id"]].append(r["vector"])

# An embedding is the element-wise mean of all vectors in a group.
user_embed = {
    uid: np.asarray(vectors).mean(axis=0) for uid, vectors in user_vecs.items()
}
biz_embed = {
    bid: np.asarray(vectors).mean(axis=0) for bid, vectors in biz_vecs.items()
}

# 5. 하이브리드 추천 함수
def hybrid_recommendations(svd_model, user_embed, biz_embed, train_ratings, top_n=5, alpha=0.7):
    """Build per-user top-N recommendations from a blended SVD/embedding score.

    For every user in ``user_embed``, each business in ``biz_embed`` that the
    user has not rated in ``train_ratings`` is scored as::

        alpha * svd_model.predict(user, business).est
            + (1 - alpha) * cosine(user_embed[user], biz_embed[business])

    Args:
        svd_model: Object exposing ``predict(user_id, business_id)`` whose
            result has an ``est`` attribute.
        user_embed: Mapping of user id to embedding vector.
        biz_embed: Mapping of business id to embedding vector.
        train_ratings: Iterable of ``(user_id, business_id, rating)`` triples;
            only used to exclude businesses a user has already rated.
        top_n: Maximum number of recommendations kept per user (expected to
            be non-negative).
        alpha: Weight of the SVD estimate; the cosine term gets ``1 - alpha``.

    Returns:
        dict: user id -> list of ``(business_id, hybrid_score)`` pairs sorted
        by descending score, at most ``top_n`` long.

    Notes:
        * The cosine term lies in [-1, 1] while the SVD estimate is on
          whatever scale the model predicts, so ``alpha`` is not a pure
          mixing ratio unless the caller rescales the estimates.
        * One ``predict`` call is made per (user, unseen business) pair, so
          the running time grows with ``len(user_embed) * len(biz_embed)``.
    """
    import heapq  # local import: the file does not import heapq at the top

    def _unit(vec):
        # A zero (or non-finite) norm would turn the vector into NaNs and
        # every score built from it into NaN, which makes the ranking
        # meaningless. Use a zero vector instead so the content term is 0.
        vec = np.asarray(vec, dtype=float)
        norm = np.linalg.norm(vec)
        if not np.isfinite(norm) or norm == 0:
            return np.zeros_like(vec)
        return vec / norm

    print("[Info] Generating hybrid recommendations...")
    recommendations = {}
    user_ids = list(user_embed)

    norm_user_embed = {u: _unit(v) for u, v in user_embed.items()}
    norm_biz_embed = {b: _unit(v) for b, v in biz_embed.items()}

    user_seen = defaultdict(set)
    for u, b, _ in train_ratings:
        user_seen[u].add(b)

    content_weight = 1 - alpha
    no_history = frozenset()
    for u in tqdm(user_ids, desc="Generating for users"):
        u_vec = norm_user_embed[u]
        # .get avoids inserting an empty set into the defaultdict for users
        # that have no training ratings.
        seen = user_seen.get(u, no_history)
        scores = (
            (b, alpha * svd_model.predict(u, b).est + content_weight * np.dot(u_vec, b_vec))
            for b, b_vec in norm_biz_embed.items()
            if b not in seen
        )
        # Same result as sorted(scores, key=..., reverse=True)[:top_n], but
        # without sorting every candidate.
        recommendations[u] = heapq.nlargest(top_n, scores, key=lambda pair: pair[1])
    print("[Done] Recommendation generation complete.")
    return recommendations

# 6. Run
hybrid_recs = hybrid_recommendations(
    svd_model=svd,
    user_embed=user_embed,
    biz_embed=biz_embed,
    train_ratings=train_ratings,
    top_n=5,
    alpha=0.7,
)

# 7. Print a sample of the results (first five users in the result dict)
for uid in list(hybrid_recs)[:5]:
    print(f"\nUser: {uid}")
    for bid, score in hybrid_recs[uid]:
        print(f"  Recommend: {bid} (Hybrid Score: {score:.4f})")


[Info] Generating hybrid recommendations...


Generating for users:  11%|█         | 3192/28518 [02:39<21:03, 20.04it/s]


KeyboardInterrupt: 