# **Settings**

## **라이브러리 불러오기**

In [None]:
# 패키지 설치
!pip install langchain_google_genai faiss-cpu langchain_community langchain_openai

In [1]:
import torch
# Use the first CUDA GPU when PyTorch reports one as available; otherwise fall back to the CPU.
# NOTE(review): none of the later cells visible here pass `device` to a model or tensor — confirm it is still needed.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
device

device(type='cpu')

In [None]:
# 라이브러리 불러오기
import os
import sys
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import TextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

In [None]:
# Mount Google Drive in the Colab runtime
from google.colab import drive
drive.mount('/content/drive/')

# Switch the working directory to the project folder so that the relative
# paths used by later cells (e.g. 'data/train_movie.csv') resolve against it.
os.chdir('/content/drive/MyDrive/RAG_LLM')

## **데이터 불러오기**

In [None]:
import pandas as pd
import ast

# Load the training data
data = pd.read_csv('data/train_movie.csv')

# Parse every 'movie_explain' cell from its string form into a Python object
# with ast.literal_eval. The cells are expected to hold list literals, since
# later code iterates over them record by record — TODO confirm against the CSV.
data['movie_explain'] = data['movie_explain'].apply(ast.literal_eval)

## **벡터DB 불러오기**

In [None]:
# Load the embedding model and the saved FAISS vector store
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Presumably the same model the saved index was built with — confirm, since
# queries must be embedded the same way as the stored vectors.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled
# data from the index folder; only load folders that come from a trusted source.
vectorstore = FAISS.load_local("vectorstore_index_ratings_min10", embeddings, allow_dangerous_deserialization=True)

# **그래프 임베딩**

In [None]:
!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html

## **Hetero 그래프**

In [None]:
import torch
from torch_geometric.data import HeteroData
import re
import numpy as np

# HeteroData 그래프 생성 (장르에서 '평점 평균'과 '시청 횟수'로 연결)
def create_genre_detailed_graph(user_id, viewing_data):
    """Build a heterogeneous graph of one user's genre preferences.

    The graph has a single "user" node and, for every distinct genre found in
    ``viewing_data``, one "genre" node, one "rating" node (feature: the mean
    rating given to that genre) and one "count" node (feature: how many
    records mention that genre). Node ``i`` of each of those three types
    refers to ``genre_nodes[i]``.

    Edges:
        ("user", "has_preference", "genre"): user 0 -> every genre.
        ("genre", "has_avg_rating", "rating"): genre i -> rating i.
        ("genre", "has_watch_count", "count"): genre i -> count i.

    Args:
        user_id: Identifier of the user. Not used when building the graph;
            kept so existing callers keep working.
        viewing_data: Iterable of records such as
            ``"3186 (Drama|Comedy) ratings: 4"``. Records that do not match
            this layout, or that are not strings, are reported with ``print``
            and skipped.

    Returns:
        tuple: ``(data_graph, genre_nodes)`` where ``data_graph`` is the
        ``HeteroData`` graph and ``genre_nodes`` lists the genre names in
        first-seen order.
    """
    # Compiled once instead of on every record. The fractional part is
    # optional so that a rating like "3.5" is kept as 3.5 rather than being
    # cut off at "3"; whole-number ratings parse exactly as before.
    record_pattern = re.compile(r"\d+\s+\((.*?)\)\s+ratings:\s*(\d+(?:\.\d+)?)")

    # genre name -> all ratings seen for that genre. A plain dict preserves
    # insertion order, and that order defines the node indices below.
    genre_ratings = {}

    for record in viewing_data:
        try:
            match = record_pattern.match(record)
        except TypeError as e:
            # Matching is the only step that can fail (non-string record).
            print(f"오류 발생: {e} (레코드: {record})")
            continue
        if not match:
            print(f"데이터 형식 오류: {record}")
            continue

        genres, rating_text = match.groups()
        rating = float(rating_text)
        for genre in genres.split("|"):
            genre_ratings.setdefault(genre.strip(), []).append(rating)

    genre_nodes = list(genre_ratings)
    num_genres = len(genre_nodes)

    data_graph = HeteroData()
    data_graph["user"].num_nodes = 1  # single user per graph
    data_graph["genre"].num_nodes = num_genres
    data_graph["rating"].num_nodes = num_genres
    data_graph["count"].num_nodes = num_genres

    # Per-genre features: mean rating and number of records (the watch count
    # is simply how many ratings were collected for the genre).
    avg_ratings = [np.mean(genre_ratings[genre]) for genre in genre_nodes]
    view_counts = [len(genre_ratings[genre]) for genre in genre_nodes]
    data_graph["rating"].x = torch.tensor(avg_ratings, dtype=torch.float).view(-1, 1)
    data_graph["count"].x = torch.tensor(view_counts, dtype=torch.float).view(-1, 1)

    # torch.stack allocates a fresh (2, num_genres) tensor for every edge type,
    # so the three edge_index tensors never share storage.
    genre_ids = torch.arange(num_genres, dtype=torch.long)
    data_graph["user", "has_preference", "genre"].edge_index = torch.stack(
        [torch.zeros_like(genre_ids), genre_ids]
    )
    data_graph["genre", "has_avg_rating", "rating"].edge_index = torch.stack(
        [genre_ids, genre_ids]
    )
    data_graph["genre", "has_watch_count", "count"].edge_index = torch.stack(
        [genre_ids, genre_ids]
    )

    return data_graph, genre_nodes

## **RGCN 모델**

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import RGCNConv, to_hetero
from tqdm import tqdm
import re
import math

# RGCN 모델 정의 (homogeneous 모델로 시작)
class RGCN(torch.nn.Module):
    """Two-layer relational GCN written as a homogeneous model.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Output size of the first ``RGCNConv`` layer.
        out_channels: Output size of the second ``RGCNConv`` layer.
        num_relations: Number of relation (edge) types passed to both layers.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
        super().__init__()
        # Layers are created in the same order (conv1, then conv2) so that
        # parameter initialisation draws from the RNG in the same sequence.
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
        self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations)

    def forward(self, x, edge_index, edge_type):
        """Apply conv1, a ReLU, then conv2, and return the result of conv2."""
        hidden = F.relu(self.conv1(x, edge_index, edge_type))
        return self.conv2(hidden, edge_index, edge_type)

## **이용자별로 그래프 임베딩**

In [None]:
# 이용자별 그래프 임베딩
def compute_all_user_graph_embeddings(data, num_epochs=30, feature_dim=384, lr=0.01):
    """Compute one graph embedding per row of ``data`` and store it in a new column.

    For every row, a genre graph is built from the row's "UserId" and
    "movie_explain" values with ``create_genre_detailed_graph`` and embedded
    with ``compute_user_genre_graph_embedding``.

    Args:
        data: DataFrame with "UserId" and "movie_explain" columns.
        num_epochs: Forwarded to ``compute_user_genre_graph_embedding``.
        feature_dim: Forwarded to ``compute_user_genre_graph_embedding``.
        lr: Forwarded to ``compute_user_genre_graph_embedding``.

    Returns:
        The same ``data`` object, modified in place: a "graph_embedding"
        column holding one NumPy array per row is added.
    """
    embeddings = []

    for _, row in tqdm(data.iterrows(), total=len(data), desc="Processing Users"):
        # Only the graph is needed here; the list of genre names is discarded.
        genre_graph, _genre_names = create_genre_detailed_graph(
            row["UserId"], row["movie_explain"]
        )

        graph_embedding = compute_user_genre_graph_embedding(
            genre_graph,
            num_epochs=num_epochs,
            feature_dim=feature_dim,
            lr=lr,
        )

        # detach() drops autograd tracking; cpu() makes the conversion work
        # even if the tensor lives on a GPU (it is a no-op for CPU tensors).
        embeddings.append(graph_embedding.detach().cpu().numpy())

    data["graph_embedding"] = embeddings
    return data

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv, to_hetero

# 이용자-장르 그래프 임베딩
def compute_user_genre_graph_embedding(graph, num_epochs=30, feature_dim=128, lr=0.01):
    """Embed a heterogeneous user/genre graph into a single vector.

    A fresh ``RGCN`` is converted with ``to_hetero``, optimised for
    ``num_epochs`` steps on a placeholder loss (the sum over node types of the
    mean squared output), and the resulting node outputs are mean-pooled per
    node type and then averaged across node types.

    NOTE(review): ``graph`` is modified in place. A ("user", "self_loop",
    "user") edge type is added when missing, and ``x`` of every node type is
    replaced by random vectors, so any features already stored on the graph
    (for example the "rating" and "count" features set by
    ``create_genre_detailed_graph``) do not influence the result.

    Args:
        graph: ``HeteroData`` graph to embed.
        num_epochs: Number of optimisation steps; 0 is allowed and skips
            training.
        feature_dim: Size of the random input features and of the returned
            embedding.
        lr: Learning rate for Adam.

    Returns:
        torch.Tensor: 1-D tensor of length ``feature_dim`` (no autograd
        history attached).
    """
    # Add user self-loops so the "user" type is a destination of at least one
    # edge type and therefore receives an output from message passing.
    if "user" in graph.node_types and ("user", "self_loop", "user") not in graph.edge_types:
        num_users = graph["user"].num_nodes
        if num_users > 0:
            user_ids = torch.arange(num_users, dtype=torch.long)
            graph["user", "self_loop", "user"].edge_index = torch.stack([user_ids, user_ids])

    # Initial node features: random vectors for every node type.
    for ntype in graph.node_types:
        graph[ntype].x = torch.randn((graph[ntype].num_nodes, feature_dim))

    metadata = graph.metadata()  # (list(node_types), list(edge_types))

    # Give every edge type a unique integer id and build, per edge type, a
    # vector that repeats that id once per edge.
    edge_type_map = {etype: i for i, etype in enumerate(metadata[1])}
    edge_type_dict = {
        etype: torch.full(
            (edge_index.size(1),),
            edge_type_map[etype],
            dtype=torch.long,
            device=edge_index.device,
        )
        for etype, edge_index in graph.edge_index_dict.items()
    }

    # Build the homogeneous RGCN and convert it to a heterogeneous model.
    base_model = RGCN(in_channels=feature_dim, hidden_channels=512,
                      out_channels=feature_dim, num_relations=len(metadata[1]))
    model = to_hetero(base_model, metadata, aggr='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop on the placeholder loss.
    model.train()
    for _ in range(num_epochs):
        optimizer.zero_grad()
        out = model(graph.x_dict, graph.edge_index_dict, edge_type_dict)
        loss = sum(x.pow(2).mean() for x in out.values())  # dummy loss
        loss.backward()
        optimizer.step()

    # Final forward pass: reflects the last optimizer step and also defines
    # `out` when num_epochs == 0 (previously an unbound-variable error).
    model.eval()
    with torch.no_grad():
        out = model(graph.x_dict, graph.edge_index_dict, edge_type_dict)

    # Mean-pool each node type, skipping types with no nodes: the mean over
    # zero rows is NaN and would turn the whole embedding into NaN.
    pooled_per_type = [x.mean(dim=0) for x in out.values() if x.size(0) > 0]
    if not pooled_per_type:
        return torch.zeros(feature_dim)

    # Average the per-type vectors into one graph-level embedding.
    return torch.stack(pooled_per_type, dim=0).mean(dim=0)

In [None]:
# Compute a graph embedding for every user row. The function adds a
# "graph_embedding" column to `data` in place and returns that same DataFrame,
# so `processed_data` and `data` refer to the same object afterwards.
processed_data = compute_all_user_graph_embeddings(data, num_epochs=10, feature_dim=384, lr=0.01)

In [None]:
import numpy as np
import faiss

# Stack the per-user vectors stored in the "graph_embedding" column into one
# float32 matrix with one row per DataFrame row.
graph_embeddings = np.stack(data["graph_embedding"].values, axis=0).astype('float32')

# Embedding width read from the matrix (expected to be 384, the feature_dim
# passed when the embeddings were computed; this is not enforced here).
dimension = graph_embeddings.shape[1]

# Create a FAISS IndexFlatL2 index (L2 distance) and add all embeddings to it
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(graph_embeddings)

# Save the FAISS index to a file
faiss.write_index(faiss_index, "raptor_graph_embeddings_faiss.index")