In [2]:
import os
import sys
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import TextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
import pandas as pd
import networkx as nx
import math
import torch

# os.chdir('/Users/mac/AIworkspace/LLMWORKSPACE/RAG_Rec')
# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [3]:
# 영화기록 데이터 
import pandas as pd
file_path = "data/movies.dat"
df2 = pd.read_csv(file_path, delimiter="::", engine="python", header=None,encoding="latin1")
df2.columns = ["MovieID", "Title", "Genres"]

file_path = "data/ratings.dat"
df = pd.read_csv(file_path, delimiter="::", engine="python", header=None,encoding="latin1")
df.columns = ["UserId", "MovieID", "Ratings","timestamp"]
new_df=df.merge(df2, on='MovieID')
df_sorted = new_df.sort_values(by=['UserId', 'timestamp']).reset_index(drop=True)

In [4]:
# --- 2. 사용자별 interaction 리스트 생성 ---
df_sorted['interaction'] = df_sorted.apply(
    lambda row: f"{row['Genres']} (Rating: {row['Ratings']})", axis=1
)

# 사용자별 interaction 연결 (리스트 형태)
user_interactions = df_sorted.groupby('UserId')['interaction'].apply(list).reset_index()

# 컬럼명 변경
user_interactions.columns = ['UserId', 'interaction_list']

In [5]:
df= pd.read_csv('header_vanila.csv')

In [6]:
import numpy as np
from sklearn.mixture import GaussianMixture
class GMMClusterer:
    def __init__(self, random_state=42):
        self.random_state = random_state

    def fit_predict(self, embeddings: np.ndarray, n_clusters: int) -> np.ndarray:
        # 🔥 매 레벨마다 n_clusters를 전달하도록 변경
        gmm = GaussianMixture(n_components=n_clusters, random_state=self.random_state)
        return gmm.fit_predict(embeddings)

In [7]:
class RaptorTree:
    def __init__(self, embedding_generator, clusterer, min_clusters=2, max_level=5, top_level_clusters=100):
        self.embedding_generator = embedding_generator
        self.clusterer = clusterer
        self.min_clusters = min_clusters
        self.max_level = max_level
        self.top_level_clusters = top_level_clusters
        self.tree = {}
        self.user_id_to_text = {}

    def build_tree(self, texts: list[str], user_ids: list[str]):
        self.user_id_to_text = dict(zip(user_ids, texts))
        current_texts = texts
        current_user_ids = user_ids
        current_level = 0
        parent_ids = None

        while len(current_texts) > 1 and current_level < self.max_level:
            embeddings = self.embedding_generator.embed_texts(current_texts)

            # 🔥 클러스터 개수 설정 수정 (빈 클러스터 방지)
            n_clusters = max(self.min_clusters, min(len(current_texts) // 2, self.top_level_clusters // (current_level + 1)))

            cluster_labels = self.clusterer.fit_predict(embeddings, n_clusters=n_clusters)

            cluster_metadata = []
            next_level_texts = []
            next_level_user_ids = []

            for cluster_id in np.unique(cluster_labels):
                cluster_indices = np.where(cluster_labels == cluster_id)[0]
                cluster_texts = [current_texts[i] for i in cluster_indices]

                # 🔥 빈 클러스터 제거
                if len(cluster_texts) == 0:
                    continue

                # 🔥 첫 번째 레벨에서는 user_ids 그대로 사용
                if current_level == 0:
                    cluster_user_ids = [current_user_ids[i] for i in cluster_indices]
                else:
                    # 상위 레벨에서는 이전 클러스터 ID 기준으로 `user_ids` 추가
                    cluster_user_ids = []
                    for idx in cluster_indices:
                        child_cluster_id = current_user_ids[idx]
                        for child_meta in self.tree[current_level - 1]:
                            if child_meta["cluster_id"] == child_cluster_id:
                                cluster_user_ids.extend(child_meta["user_ids"])

                # 🔥 대표 텍스트 (가장 긴 텍스트 선정)
                representative_text = max(cluster_texts, key=len)

                cluster_embeddings = embeddings[cluster_indices]
                mean_embedding = cluster_embeddings.mean(axis=0)

                # 🔥 올바른 parent_id 설정 (부모 클러스터 1개만 추가)
                metadata = {
                    "cluster_id": f"level_{current_level}_cluster_{cluster_id}",
                    "level": current_level,
                    "user_ids": cluster_user_ids,  # ✅ 첫 번째 레벨에서도 user_ids 추가
                    "embedding": mean_embedding,
                    "parent_id": parent_ids if parent_ids else [],
                    "child_ids": None
                }

                cluster_metadata.append(metadata)
                next_level_texts.append(representative_text)
                next_level_user_ids.append(metadata["cluster_id"])

            # 클러스터 메타데이터 추가
            self.tree[current_level] = cluster_metadata
            current_texts = next_level_texts
            current_user_ids = next_level_user_ids
            parent_ids = current_user_ids
            current_level += 1

        return self.tree

    def search_user_cluster(self, target_user_id: str, target_user_text: str, threshold=0.01):
        query_embedding = self.embedding_generator.embed_texts([target_user_text])[0]
        current_level = max(self.tree.keys())
        previous_similarity = None
        best_cluster = None

        while current_level >= 0:
            clusters = self.tree[current_level]
            clusters_filtered = []
            clusters_filtered_embeddings = []

            for cluster in clusters:
                cluster_user_ids = cluster["user_ids"]

                # 🔥 클러스터 ID가 아닌 실제 유저 ID만 필터링
                texts_to_embed = [
                    self.user_id_to_text[uid] for uid in cluster_user_ids
                    if uid != target_user_id and 'cluster' not in uid
                ]
                if not texts_to_embed:
                    continue

                embeddings_cluster = self.embedding_generator.embed_texts(texts_to_embed)
                mean_embedding = embeddings_cluster.mean(axis=0)

                clusters_filtered.append(cluster)
                clusters_filtered_embeddings.append(mean_embedding)

            if not clusters_filtered:
                break

            similarities = cosine_similarity([query_embedding], clusters_filtered_embeddings).flatten()
            best_idx = np.argmax(similarities)
            best_cluster = clusters_filtered[best_idx]
            current_similarity = similarities[best_idx]

            if previous_similarity and abs(previous_similarity - current_similarity) / previous_similarity > threshold:
                break

            previous_similarity = current_similarity
            current_level -= 1

        best_cluster_users_excluded = [uid for uid in best_cluster["user_ids"] if uid != target_user_id]
        return best_cluster["cluster_id"], best_cluster_users_excluded

In [8]:
# 필요한 라이브러리들 임포트 미리 준비
import numpy as np
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
# 만들어진 청크를 임베딩하는 클래스
class EmbeddingGenerator:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer(model_name, device= device)
    
    def embed_texts(self, texts: list[str]) -> np.ndarray:
        # 텍스트 리스트를 임베딩 벡터로 변환하여 반환
        return self.model.encode(texts, convert_to_numpy=True, device='cuda' if torch.cuda.is_available() else 'cpu')

# 초기화 예시
embedding_gen = EmbeddingGenerator()
clusterer = GMMClusterer()

# 초기 RAPTOR 트리 생성 (한 번만 수행)
# 클래스 초기화 부분
raptor_tree = RaptorTree(
    embedding_gen, 
    clusterer,
    min_clusters=2,
    max_level=5,          # 더 높이거나 낮춰서 조정
    top_level_clusters=100 # 최상위 클러스터 수를 늘리거나 줄여본다
)
tree_structure = raptor_tree.build_tree(df.chunk_header.tolist(), df.UserId.astype(str).tolist())

# 특정 사용자 검색 수행 (Self-exclusion 방식 적용)
target_user_id = "2"
target_user_text = df.loc[df.UserId == int(target_user_id), 'chunk_header'].iloc[0]

best_cluster_id, similar_users = raptor_tree.search_user_cluster(
    target_user_id, target_user_text, threshold=0.005
)

print(f"유저 {target_user_id}가 가장 유사한 클러스터: {best_cluster_id}")
print(f"해당 클러스터에 속한 유사 유저 목록 (본인 제외): {similar_users}")

  from .autonotebook import tqdm as notebook_tqdm
2025-03-27 06:48:52.050311: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 06:48:52.066557: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743025732.083691 2719886 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743025732.088677 2719886 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743025732.101687 2719886 computation_placer.cc:177] computation placer already r

유저 2가 가장 유사한 클러스터: level_2_cluster_16
해당 클러스터에 속한 유사 유저 목록 (본인 제외): ['46', '127', '304', '1089', '1302', '1424', '1610', '1726', '1927', '2101', '2126', '2146', '2263', '2297', '2363', '2441', '2514', '3144', '3352', '3460', '3567', '3636', '3673', '4000', '4031', '4109', '4160', '4587', '4690', '4874', '4892', '5040', '5069', '5469', '5703', '5895', '6034', '7', '277', '296', '431', '908', '1030', '1131', '1200', '1398', '1520', '1649', '1866', '2466', '2598', '2663', '3048', '3068', '3307', '3337', '3459', '3461', '3487', '3662', '3818', '3978', '4093', '4183', '4417', '4489', '4499', '4626', '5003', '5029', '5095', '5854', '5870', '5871', '5884', '5912', '5947', '6020', '185', '279', '422', '542', '633', '677', '700', '917', '1260', '1269', '1533', '1548', '1567', '1682', '2062', '2114', '2163', '2211', '2357', '2369', '2388', '2479', '2625', '2642', '2767', '2967', '3027', '3162', '3187', '3455', '3490', '3616', '3733', '3822', '3828', '3855', '4200', '4207', '4531', '4579', '5309'

In [9]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(
    "vectorstore_index_ratings_min5_no",
    embeddings,
    allow_dangerous_deserialization=True  # 역직렬화 허용
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [10]:
import ast
file_path='data/train_movie.csv'
data = pd.read_csv(file_path)
# 'movie_explain' 열을 리스트로 변환
data['movie_explain'] = data['movie_explain'].apply(ast.literal_eval)
file_path_test='data/test_movie.csv'
# 최신 구매 기록을 가져옴
purchase_history=data.iloc[1]['movie_explain']
# 정답 데이터셋 가져옴
df_test=pd.read_csv(file_path_test)
# 정답
df_test.iloc[1].movie_explain

"['1917 (Action|Adventure|Sci-Fi|Thriller) ratings: 3']"

In [11]:
# 🔹 최신 구매 기록 (Query)
query = " ".join(purchase_history[-1:])  # 리스트를 문자열로 변환

# 🔹 similar_users의 타입 확인 후 변환
similar_users_str = set(map(str, similar_users))  # 문자열 변환

# 🔹 FAISS에서 `similar_users`만 검색하도록 필터 추가
retriever = vectorstore.as_retriever(
    search_kwargs={
        "k": 500,
        # "filter": user_filter  # 🔥 수정된 부분
    }
)

# 🔹 검색 수행
records = retriever.get_relevant_documents(query)

# 🔹 결과 확인
record_user_ids = [str(record.metadata['UserId']) for record in records]
intersection = similar_users_str.intersection(set(record_user_ids))

# 🔹 'similar_users'에 속하는 유저만 필터링
filtered_records = [
    record for record in records 
    if str(record.metadata['UserId']) in similar_users_str
]

  records = retriever.get_relevant_documents(query)


In [12]:
from typing import List
from langchain.vectorstores import FAISS
from langchain.schema import Document

def get_documents_with_context(
    vectorstore: FAISS,
    filtered_records: List[Document],
    context_window: int = 1
) -> List[List[Document]]:
    """
    intersection 유저들의 검색 결과와 각 결과의 앞뒤 문서들을 함께 반환합니다.
    
    Args:
        vectorstore: FAISS 벡터스토어 인스턴스
        filtered_records: 검색된 유저들의 기록
        context_window: 앞뒤로 가져올 문서 수 (default: 1)
    
    Returns:
        List[List[Document]]: 각 검색 결과에 대해 [이전 문서들, 현재 문서, 다음 문서들]을 포함하는 리스트
    """
    
    # 🔹 모든 문서와 메타데이터를 딕셔너리로 구성
    all_docs = {}
    for doc_id, doc in enumerate(vectorstore.docstore._dict.values()):
        user_id = str(doc.metadata['UserId'])
        chunk_idx = doc.metadata['chunk_index']
        
        if user_id not in all_docs:
            all_docs[user_id] = {}
        all_docs[user_id][chunk_idx] = doc
    
    # 🔹 Context 추가
    context_results = []
    
    for doc in filtered_records:
        current_user_id = str(doc.metadata['UserId'])
        current_chunk_index = doc.metadata['chunk_index']
        
        context_docs = []
        
        # 🔹 이전 문서들 추가
        for i in range(current_chunk_index - context_window, current_chunk_index):
            if current_user_id in all_docs and i in all_docs[current_user_id]:
                context_docs.append(all_docs[current_user_id][i])
        
        # 🔹 현재 문서 추가
        context_docs.append(doc)
        
        # 🔹 다음 문서들 추가
        for i in range(current_chunk_index + 1, current_chunk_index + context_window + 1):
            if current_user_id in all_docs and i in all_docs[current_user_id]:
                context_docs.append(all_docs[current_user_id][i])
        
        context_results.append(context_docs)
    
    return context_results

# 🔹 intersection 유저들의 청크 앞뒤 청크 추출
context_results = get_documents_with_context(vectorstore, filtered_records, context_window=1)

In [15]:
# 🔹 intersection 유저들의 청크 앞뒤 청크 추출
context_results = get_documents_with_context(vectorstore, filtered_records, context_window=1)

flattened_results = [doc for sublist in context_results for doc in sublist]
# 사용자별 영화 ID 가져오기
user_movies = get_user_movies(flattened_results)

# 사용자-영화 그래프 생성
G = create_user_movie_graph(user_movies)

previous_movie_ids = extract_previous_movie_ids(purchase_history)

# Top-10 영화 리스트 가져오기
top_movies = get_top_10_common_movies(G)

# 이전 기록에 포함되지 않은 영화만 필터링
filtered_movies = filter_movies_by_history(top_movies, previous_movie_ids)

# 결과 출력
for movie, count in filtered_movies:
    print(f"{movie}: watched by {count} users")


Movie 788: watched by 3 users
Movie 1573: watched by 3 users
Movie 2094: watched by 2 users
Movie 1320: watched by 2 users
Movie 329: watched by 2 users
Movie 1917: watched by 2 users
Movie 1779: watched by 2 users
Movie 173: watched by 2 users
Movie 3354: watched by 2 users
Movie 880: watched by 2 users
Movie 748: watched by 2 users
Movie 1580: watched by 2 users
Movie 1589: watched by 2 users
Movie 1748: watched by 2 users
Movie 802: watched by 2 users
Movie 3020: watched by 1 users


In [14]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import re

def get_user_movies(data):
    user_movies = defaultdict(list)

    for doc in data:
        user_id = doc.metadata['UserId']
        page_content = doc.page_content

        # 정규식을 사용하여 영화 ID 추출 (ratings 이전 내용만)
        movie_ids = re.findall(r'(\d+)(?= \()', page_content)

        # 사용자별로 영화 ID 추가
        user_movies[user_id].extend(movie_ids)

    return user_movies

def create_user_movie_graph(user_movies):
    # 그래프 생성
    G = nx.Graph()

    # 사용자와 영화 노드 추가 및 엣지 생성
    for user_id, movies in user_movies.items():
        user_node = f"User {user_id}"
        G.add_node(user_node, type='user')

        for movie_id in movies:
            movie_node = f"Movie {movie_id}"
            G.add_node(movie_node, type='movie')
            G.add_edge(user_node, movie_node)

    return G

def visualize_graph(G):
    # 그래프 시각화
    plt.figure(figsize=(20, 15))

    # 사용자와 영화 노드 분리
    users = [node for node in G.nodes() if G.nodes[node]['type'] == 'user']
    movies = [node for node in G.nodes() if G.nodes[node]['type'] == 'movie']

    # 레이아웃 설정
    pos = nx.spring_layout(G, k=0.5, iterations=50)

    # 노드 그리기
    nx.draw_networkx_nodes(G, pos, nodelist=users, node_color='lightblue', node_size=300, alpha=0.8)
    nx.draw_networkx_nodes(G, pos, nodelist=movies, node_color='lightgreen', node_size=200, alpha=0.8)

    # 엣지 그리기
    nx.draw_networkx_edges(G, pos, alpha=0.5)

    # 레이블 그리기
    nx.draw_networkx_labels(G, pos, font_size=8, font_weight="bold")

    plt.title("User-Movie Relationship Graph", fontsize=16)
    plt.axis('off')
    plt.tight_layout()

    # 그래프 저장
    plt.savefig('user_movie_graph.png', dpi=300, bbox_inches='tight')
    plt.close()

import re

# 과거 기록에서 영화 ID 추출
def extract_previous_movie_ids(purchase_history):
    # 영화 ID만 추출 (숫자와 괄호 전까지만 가져옴)
    previous_movies = re.findall(r'(\d+)(?=\s\()', ' '.join(purchase_history))
    return set(previous_movies)

# 동시 시청 영화에서 이전 기록을 제외하는 함수
def filter_movies_by_history(top_movies, previous_movie_ids):
    # 영화 ID만 추출하여 비교 후 제외
    filtered_movies = [(movie, count) for movie, count in top_movies if movie.split()[1] not in previous_movie_ids]
    return filtered_movies
def get_top_10_common_movies(G):
    # 영화 노드만 필터링
    movies = [node for node in G.nodes() if G.nodes[node]['type'] == 'movie']

    # 영화별 연결된 사용자 수 계산
    movie_view_counts = {movie: len(list(G.neighbors(movie))) for movie in movies}

    # 사용자 수 기준으로 내림차순 정렬하여 상위 10개 추출
    top_10_movies = sorted(movie_view_counts.items(), key=lambda x: x[1], reverse=True)[:20]

    # 결과 반환
    return top_10_movies

# 사용자별 영화 ID 가져오기
user_movies = get_user_movies(flattened_results)

# 사용자-영화 그래프 생성
G = create_user_movie_graph(user_movies)

previous_movie_ids = extract_previous_movie_ids(purchase_history)

# Top-10 영화 리스트 가져오기
top_movies = get_top_10_common_movies(G)

# 이전 기록에 포함되지 않은 영화만 필터링
filtered_movies = filter_movies_by_history(top_movies, previous_movie_ids)

# 결과 출력
for movie, count in filtered_movies:
    print(f"{movie}: watched by {count} users")


Movie 788: watched by 3 users
Movie 1573: watched by 3 users
Movie 2094: watched by 2 users
Movie 1320: watched by 2 users
Movie 329: watched by 2 users
Movie 1917: watched by 2 users
Movie 1779: watched by 2 users
Movie 173: watched by 2 users
Movie 3354: watched by 2 users
Movie 880: watched by 2 users
Movie 748: watched by 2 users
Movie 1580: watched by 2 users
Movie 1589: watched by 2 users
Movie 1748: watched by 2 users
Movie 802: watched by 2 users
Movie 3020: watched by 1 users


In [16]:
from tqdm import tqdm
import pandas as pd
import re

# (1) 실험 파라미터 (고정값)
threshold = 0.007
k_val = 750
window_size = 1

# 결과 저장용 DataFrame
all_results = pd.DataFrame(columns=["UserId", "Hit", "Answer", "Recommended",
                                    "threshold", "faiss_k", "window"])

# (2) 유저 단일 세팅 실험 (예: 5명 유저 반복)
for idx in tqdm(range(3000)):
    results = pd.DataFrame(columns=["UserId", "Hit", "Answer", "Recommended"])

    target_user_id = str(idx + 1)  # 유저 ID는 1부터 시작
    target_user_text = df.loc[df.UserId == int(target_user_id), 'chunk_header'].iloc[0]

    # (2-1) Raptor Tree 검색
    best_cluster_id, similar_users = raptor_tree.search_user_cluster(
        target_user_id,
        target_user_text,
        threshold=threshold
    )

    # 최신 구매 기록에서 마지막 항목을 질의로 사용
    purchase_history = data.iloc[idx]['movie_explain']
    query = " ".join(purchase_history[-1:])

    # (2-2) FAISS 검색
    retriever_k = vectorstore.as_retriever(
        search_kwargs={"k": k_val}
    )
    records = retriever_k.get_relevant_documents(query)

    record_user_ids = [str(record.metadata['UserId']) for record in records]
    intersection = set(map(str, similar_users)).intersection(set(record_user_ids))

    filtered_records = [
        record for record in records
        if str(record.metadata['UserId']) in intersection
    ]

    # (2-3) 문맥 확장 (window size 적용)
    context_results = get_documents_with_context(
        vectorstore,
        filtered_records,
        context_window=window_size
    )
    flattened_results = [doc for sublist in context_results for doc in sublist]

    # 유저-영화 그래프 생성 및 추천 영화 추출
    user_movies = get_user_movies(flattened_results)
    G = create_user_movie_graph(user_movies)
    top_movies = get_top_10_common_movies(G)

    # 과거 기록 제거
    filtered_movies = filter_movies_by_history(
        top_movies,
        extract_previous_movie_ids(purchase_history)
    )

    # 정답 및 예측 결과 비교
    answer = df_test.iloc[idx].movie_explain
    match = re.search(r"(\d+)", answer)
    answer_id = match.group(1) if match else None

    filtered_movie_ids = [
        re.search(r"(\d+)", movie).group(1)
        for movie, _ in filtered_movies
        if re.search(r"(\d+)", movie)
    ]

    hit = 1 if answer_id in filtered_movie_ids else 0

    # 유저별 결과 저장
    results = pd.concat([results, pd.DataFrame({
        "UserId": [target_user_id],
        "Hit": [hit],
        "Answer": [answer_id],
        "Recommended": [", ".join(filtered_movie_ids)]
    })], ignore_index=True)

    # 파라미터 정보 추가
    results["threshold"] = threshold
    results["faiss_k"] = k_val
    results["window"] = window_size

    all_results = pd.concat([all_results, results], ignore_index=True)

# (3) 최종 성능 평가
performance = all_results["Hit"].mean()
print(f"Hit Rate: {performance:.3f}")


  0%|          | 0/3000 [00:00<?, ?it/s]

  all_results = pd.concat([all_results, results], ignore_index=True)
100%|██████████| 3000/3000 [5:58:12<00:00,  7.16s/it]  

Hit Rate: 0.096





In [16]:
from tqdm import tqdm
import pandas as pd
import re

# (1) 실험 파라미터 (고정값)
threshold = 0.007
k_val = 750
window_size = 1

# 결과 저장용 DataFrame
all_results = pd.DataFrame(columns=["UserId", "Hit", "Answer", "Recommended",
                                    "threshold", "faiss_k", "window"])

# (2) 유저 단일 세팅 실험 (예: 5명 유저 반복)
for idx in tqdm(range(3001, 6041)):
    results = pd.DataFrame(columns=["UserId", "Hit", "Answer", "Recommended"])

    target_user_id = str(idx + 1)  # 유저 ID는 1부터 시작
    target_user_text = df.loc[df.UserId == int(target_user_id), 'chunk_header'].iloc[0]

    # (2-1) Raptor Tree 검색
    best_cluster_id, similar_users = raptor_tree.search_user_cluster(
        target_user_id,
        target_user_text,
        threshold=threshold
    )

    # 최신 구매 기록에서 마지막 항목을 질의로 사용
    purchase_history = data.iloc[idx]['movie_explain']
    query = " ".join(purchase_history[-1:])

    # (2-2) FAISS 검색
    retriever_k = vectorstore.as_retriever(
        search_kwargs={"k": k_val}
    )
    records = retriever_k.get_relevant_documents(query)

    record_user_ids = [str(record.metadata['UserId']) for record in records]
    intersection = set(map(str, similar_users)).intersection(set(record_user_ids))

    filtered_records = [
        record for record in records
        if str(record.metadata['UserId']) in intersection
    ]

    # (2-3) 문맥 확장 (window size 적용)
    context_results = get_documents_with_context(
        vectorstore,
        filtered_records,
        context_window=window_size
    )
    flattened_results = [doc for sublist in context_results for doc in sublist]

    # 유저-영화 그래프 생성 및 추천 영화 추출
    user_movies = get_user_movies(flattened_results)
    G = create_user_movie_graph(user_movies)
    top_movies = get_top_10_common_movies(G)

    # 과거 기록 제거
    filtered_movies = filter_movies_by_history(
        top_movies,
        extract_previous_movie_ids(purchase_history)
    )

    # 정답 및 예측 결과 비교
    answer = df_test.iloc[idx].movie_explain
    match = re.search(r"(\d+)", answer)
    answer_id = match.group(1) if match else None

    filtered_movie_ids = [
        re.search(r"(\d+)", movie).group(1)
        for movie, _ in filtered_movies
        if re.search(r"(\d+)", movie)
    ]

    hit = 1 if answer_id in filtered_movie_ids else 0

    # 유저별 결과 저장
    results = pd.concat([results, pd.DataFrame({
        "UserId": [target_user_id],
        "Hit": [hit],
        "Answer": [answer_id],
        "Recommended": [", ".join(filtered_movie_ids)]
    })], ignore_index=True)

    # 파라미터 정보 추가
    results["threshold"] = threshold
    results["faiss_k"] = k_val
    results["window"] = window_size

    all_results = pd.concat([all_results, results], ignore_index=True)

# (3) 최종 성능 평가
performance = all_results["Hit"].mean()
print(f"Hit Rate: {performance:.3f}")

  all_results = pd.concat([all_results, results], ignore_index=True)
100%|█████████▉| 3039/3040 [5:53:04<00:06,  6.97s/it]  


IndexError: single positional indexer is out-of-bounds