### Vector DB 구축

In [None]:
# Jupyter Notebook에서 실행할 코드

import pandas as pd
import numpy as np
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import os
from dotenv import load_dotenv
import time

# Environment setup

# Pull variables from a local .env file into the process environment, then
# read the OpenAI key that the embedding function needs.
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Fail fast: without a key every embedding request later on would fail.
if openai_api_key:
    print("OpenAI API Key loaded")
else:
    raise ValueError("OPENAI_API_KEY not found!")

# Load data

# Only the columns listed here are read from the CSV.
path = Path("../data/tmdb_cleaned.csv")
csv_columns = [
    'id', 'title', 'overview', 'genres', 'keywords',
    'vote_average', 'vote_count', 'release_date',
    'runtime', 'original_language', 'poster_path',
]
df = pd.read_csv(path, usecols=csv_columns)

print(f"Total movies: {len(df):,}")

# Missing free-text fields become empty strings so they can be concatenated
# later; rows without a title are dropped entirely.
for text_column in ('overview', 'genres', 'keywords'):
    df[text_column] = df[text_column].fillna('')
df = df.dropna(subset=['title'])

print(f" Cleaned movies: {len(df):,}")

# 텍스트 결합

def create_movie_text(row):
    """Return the single text string that represents one movie.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'title', 'overview', 'genres' and 'keywords'.

    Returns:
        The title (written twice), the overview, and the genres and keywords
        each introduced by a label, all separated by single spaces.
    """
    # The title is deliberately repeated; presumably this gives it more
    # weight in the resulting embedding -- TODO confirm intent.
    template = "{title} {title} {overview} Genres: {genres} Keywords: {keywords}"
    return template.format(
        title=row['title'],
        overview=row['overview'],
        genres=row['genres'],
        keywords=row['keywords'],
    )

print("\n Creating combined text...")
# Row-wise apply: every row is handed to create_movie_text and the returned
# strings are stored in a new 'combined_text' column.
combined_text = df.apply(create_movie_text, axis=1)
df['combined_text'] = combined_text
print("Combined text created")

# OpenAI embeddings

# Embedding function handed to the Chroma collection; it is configured with
# the API key loaded earlier and the ada-002 embedding model.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=openai_api_key,
)
print(" Embedding function created")

# ChromaDB setup (created under the project root)

# Vector DB directory; the path is relative to the current working directory.
db_path = Path("../vector_db/chroma_db")
db_path.parent.mkdir(parents=True, exist_ok=True)

print(f" Vector DB path: {db_path.absolute()}")

chroma_client = chromadb.PersistentClient(path=str(db_path))

# Remove any existing collection so the index is rebuilt from scratch.
# A failed delete (typically: the collection does not exist yet) is tolerated,
# but the reason is reported instead of being silently discarded, and
# KeyboardInterrupt / SystemExit are no longer swallowed by a bare except.
try:
    chroma_client.delete_collection(name="movies")
except Exception as exc:
    print(f"No existing collection removed ({type(exc).__name__}: {exc})")
else:
    print("Old collection deleted")

# Create the new collection
collection = chroma_client.create_collection(
    name="movies",
    embedding_function=embedding_function,
    metadata={"description": "TMDB movies with ada-002"}
)
print(f"Collection '{collection.name}' created\n")

# Batch processing


def _build_metadata(row):
    """Build the Chroma metadata dict for one movie row.

    Missing numeric values are stored as 0 / 0.0 and missing release date or
    poster path as an empty string. The overview is cut to its first 500
    characters (presumably to keep the stored metadata compact).
    """
    return {
        'title': str(row['title']),
        'overview': str(row['overview'])[:500],
        'genres': str(row['genres']),
        'vote_average': float(row['vote_average']) if pd.notna(row['vote_average']) else 0.0,
        'vote_count': int(row['vote_count']) if pd.notna(row['vote_count']) else 0,
        'runtime': int(row['runtime']) if pd.notna(row['runtime']) else 0,
        'release_date': str(row['release_date']) if pd.notna(row['release_date']) else '',
        'poster_path': str(row['poster_path']) if pd.notna(row['poster_path']) else '',
    }


BATCH_SIZE = 20
total_movies = len(df)
num_batches = (total_movies + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

print(f"{'='*60}")
print(f"Vector DB Construction")
print(f"{'='*60}")
print(f"Total movies: {total_movies:,}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Number of batches: {num_batches:,}\n")

# 1-based numbers of the batches that could not be stored, so that skipped
# data is visible at the end instead of only scrolling past in the log.
failed_batches = []

for start in tqdm(range(0, total_movies, BATCH_SIZE), desc="Processing"):
    batch_number = start // BATCH_SIZE + 1
    try:
        batch_df = df.iloc[start:start + BATCH_SIZE]

        # Single pass over the batch: collect ids and metadata together.
        ids = []
        metadatas = []
        for _, row in batch_df.iterrows():
            ids.append(f"movie_{int(row['id'])}")
            metadatas.append(_build_metadata(row))

        # No embeddings are passed; the collection's embedding function is
        # expected to embed the documents.
        collection.add(
            documents=batch_df['combined_text'].tolist(),
            metadatas=metadatas,
            ids=ids
        )

        # Short pause between successful batches (presumably to stay under
        # the embedding API's rate limits).
        time.sleep(0.15)

    except Exception as e:
        # Best effort: a failed batch is logged and skipped, except for a
        # 403 response, which aborts the whole run.
        failed_batches.append(batch_number)
        print(f"\n Batch {batch_number} failed: {str(e)[:100]}")
        if "403" in str(e):
            print("권한 오류! 중단합니다.")
            break

if failed_batches:
    print(f"\n {len(failed_batches)} batch(es) failed: {failed_batches}")

# Final summary: banner, number of items stored in the collection, and the
# absolute location of the database directory.
banner = '=' * 60
print("\n" + banner)
print(" Vector DB Construction Complete!")
print(f"Stored: {collection.count():,} movies")
print(f"Location: {db_path.absolute()}")
print(banner)

# 테스트

def search_similar_movies(query, n_results=5):
    """Query the module-level ``collection`` for movies similar to a text.

    Args:
        query: Free-text search string; sent as the only query text.
        n_results: Number of matches requested (default 5).

    Returns:
        The result of ``collection.query`` unchanged.
    """
    return collection.query(query_texts=[query], n_results=n_results)

# Smoke test: run one query against the freshly built collection and list
# the closest matches. The two status messages below were garbled
# ("스트", "ector DB") and have been restored.
print("\n 테스트: Space Sci-Fi")
results = search_similar_movies("space science fiction", n_results=3)

# Only one query text was sent, so index 0 selects its metadata/distance lists.
for i, (metadata, distance) in enumerate(zip(
    results['metadatas'][0],
    results['distances'][0]
), 1):
    print(f"{i}. {metadata['title']} (distance: {distance:.4f})")

print("\n Vector DB 구축 및 테스트 완료!")