In [20]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# recommend.py
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
# from config import GEMINI_API_KEY

from langchain_community.vectorstores import Chroma
# from config import VECTORSTORE_PATH_VIEW_1

import pandas as pd
import json

In [3]:
# 임베딩 모델 로드
embeddings = HuggingFaceEmbeddings(model_name='ibm-granite/granite-embedding-278m-multilingual')

  embeddings = HuggingFaceEmbeddings(model_name='ibm-granite/granite-embedding-278m-multilingual')


# 시청기록 리트리버 테스트

In [27]:
views_vectorstore = Chroma(persist_directory=VECTORSTORE_PATH_VIEW_1,
                               embedding_function=embeddings)

In [36]:
# LLM 모델 생성 (1. GEMINI 2. OpenAI)
def load_gemini():
    model = ChatGoogleGenerativeAI(
        model='gemini-1.5-flash',
        temperature=0.3,
        max_tokens=5000,
        api_key=GEMINI_API_KEY
    )
    print(">>>>>>> Gemini loaded from recommend chain...")
    return model
llm = load_gemini()

>>>>>>> Gemini loaded from recommend chain...


In [33]:
# 🎯 Define Response Schema
recommend_response_schemas = [
    ResponseSchema(name="candidates",
                   description="A list of user IDs who watched a VOD on the given date.")
]

output_parser = StructuredOutputParser.from_response_schemas(recommend_response_schemas)
recommend_chain_format_instructions = output_parser.get_format_instructions()

In [34]:
retriever = views_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10}  # Retrieve 10 most similar results
)

In [35]:
recommend_template = """
You are a system that retrieves user IDs who watched a VOD on a given date.
Use the provided retrieved user watch history to return the top 10 user IDs.

[Search Results]
{retrieved_documents}

Return a JSON list of user IDs.

{recommend_chain_format_instructions}
"""

recommend_chain_prompt = ChatPromptTemplate.from_template(recommend_template,
                                                          partial_variables={'recommend_chain_format_instructions': recommend_chain_format_instructions})

In [37]:
recommend_chain = (
    {"user_input": RunnablePassthrough(),
     "retrieved_documents": retriever}  # ✅ Correctly pass retriever output
    | recommend_chain_prompt
    | llm
    | output_parser  # ✅ Use structured output parser
)

In [38]:
response = recommend_chain.invoke("3월 17일에 영화를 본 사람 있어?")

In [39]:
print(response)

{'candidates': ['user001174', 'user000203', 'user000032', 'user003174']}


# 여기서부터는 movie_vectordb test

In [4]:
movies_vectorstore = Chroma(persist_directory="./db/movies_vectorstore_chroma_1630", embedding_function=embeddings)

  movies_vectorstore = Chroma(persist_directory="./db/movies_vectorstore_chroma_1630", embedding_function=embeddings)


In [5]:
# LLM 모델 생성 (1. GEMINI 2. OpenAI)
def load_gemini():
    model = ChatGoogleGenerativeAI(
        model='gemini-1.5-flash',
        temperature=0.3,
        max_tokens=5000,
        api_key=GEMINI_API_KEY
    )
    print(">>>>>>> Gemini loaded from recommend chain...")
    return model
llm = load_gemini()

>>>>>>> Gemini loaded from recommend chain...


In [6]:
# 🎯 Define Response Schema
recommend_response_schemas = [
    ResponseSchema(name="candidates",
                   description="A list of name of title.")
]

output_parser = StructuredOutputParser.from_response_schemas(recommend_response_schemas)
recommend_chain_format_instructions = output_parser.get_format_instructions()

In [7]:
retriever = movies_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10}  # Retrieve 10 most similar results
)

In [8]:
recommend_template = """
영화 제목을 반환하는 retriever 역할

[Search Results]
{retrieved_documents}

Return a JSON list of user IDs.

{recommend_chain_format_instructions}
"""

recommend_chain_prompt = ChatPromptTemplate.from_template(recommend_template,
                                                          partial_variables={'recommend_chain_format_instructions': recommend_chain_format_instructions})

In [9]:
recommend_chain = (
    {"user_input": RunnablePassthrough(),
     "retrieved_documents": retriever}  # ✅ Correctly pass retriever output
    | recommend_chain_prompt
    | llm
    | output_parser  # ✅ Use structured output parser
)

In [10]:
response = recommend_chain.invoke("제목에 숫자가 들어가는 영화들")
print(response)

{'candidates': ['명량-회오리 바다를 향하여', '누구나 제명에 죽고 싶다', '청년경찰', '프랑스 영화처럼', '소년들', '개들의 도시', '오싱', '천국으로 가는 이삿짐', '돌아간다', '사잇소리']}


<br><hr>

# 마트 데이터 추가 전처리
- release_date에서 release_month, release_day cols 추가
- "release_date", "movie_id" cols 삭제
- column 순서 변경

In [11]:
df = pd.read_csv("data/movies_mart_최종_forreal_forgood.csv", encoding='utf8')
df.shape

(14224, 20)

In [12]:
df.head(2)

Unnamed: 0,asset_id,movie_id,title,original_title,genre,adult,runtime,release_year,release_date,actors,director,orgnl_cntry,original_language,vote_average,vote_count,popularity,poster_path,backdrop_path,overview,분류
0,cjc|M4820686LSGE28658201,11,스타워즈 4-새로운 희망,Star Wars,"SF,모험,액션",False,121.0,1977.0,1977-05-25,"마크 해밀,캐리 피셔,해리슨 포드,알렉 기네스,피터 커싱",조지 루카스,미국,en,8.204,20724.0,118.411,/7XFfURIFCJxN1mfBg0SAjk5yGzg.jpg,/4qCqAdHcNKeAHcK8tJ8wNJZa9cx.jpg,"공화국이 붕괴하고 제국이 수립된 뒤 20년, 제다이 기사단은 전멸하고 강력한 제국군...",영화
1,cjc|M4774453LSGJ90713601,13,포레스트검프,Forrest Gump,"드라마,로맨스,코미디",False,142.0,1994.0,1994-06-23,"톰 행크스,로빈 라이트,샐리 필드,게리 시나이즈,미켈티 윌리암슨",로버트 저메키스,미국,en,8.47,27620.0,111.752,/xdJxoq0dtkchOkUz5UVKuxn7a2V.jpg,/mzfx54nfDPTUXZOG48u4LaEheDy.jpg,"불편한 다리, 남들보다 조금 떨어지는 지능을 가진 포레스트 검프는 헌신적인 어머니의...",영화


In [13]:
# release_date >> release_month & release_day로 변경
df["release_date"] = pd.to_datetime(df["release_date"])

df['release_month'] = df['release_date'].dt.month
df['release_day'] = df['release_date'].dt.day
df.head(2)

Unnamed: 0,asset_id,movie_id,title,original_title,genre,adult,runtime,release_year,release_date,actors,...,original_language,vote_average,vote_count,popularity,poster_path,backdrop_path,overview,분류,release_month,release_day
0,cjc|M4820686LSGE28658201,11,스타워즈 4-새로운 희망,Star Wars,"SF,모험,액션",False,121.0,1977.0,1977-05-25,"마크 해밀,캐리 피셔,해리슨 포드,알렉 기네스,피터 커싱",...,en,8.204,20724.0,118.411,/7XFfURIFCJxN1mfBg0SAjk5yGzg.jpg,/4qCqAdHcNKeAHcK8tJ8wNJZa9cx.jpg,"공화국이 붕괴하고 제국이 수립된 뒤 20년, 제다이 기사단은 전멸하고 강력한 제국군...",영화,5.0,25.0
1,cjc|M4774453LSGJ90713601,13,포레스트검프,Forrest Gump,"드라마,로맨스,코미디",False,142.0,1994.0,1994-06-23,"톰 행크스,로빈 라이트,샐리 필드,게리 시나이즈,미켈티 윌리암슨",...,en,8.47,27620.0,111.752,/xdJxoq0dtkchOkUz5UVKuxn7a2V.jpg,/mzfx54nfDPTUXZOG48u4LaEheDy.jpg,"불편한 다리, 남들보다 조금 떨어지는 지능을 가진 포레스트 검프는 헌신적인 어머니의...",영화,6.0,23.0


In [14]:
# release_date과 movie_id 삭제
df.drop(["release_date", "movie_id"], axis=1, inplace=True)
df.head(1)

Unnamed: 0,asset_id,title,original_title,genre,adult,runtime,release_year,actors,director,orgnl_cntry,original_language,vote_average,vote_count,popularity,poster_path,backdrop_path,overview,분류,release_month,release_day
0,cjc|M4820686LSGE28658201,스타워즈 4-새로운 희망,Star Wars,"SF,모험,액션",False,121.0,1977.0,"마크 해밀,캐리 피셔,해리슨 포드,알렉 기네스,피터 커싱",조지 루카스,미국,en,8.204,20724.0,118.411,/7XFfURIFCJxN1mfBg0SAjk5yGzg.jpg,/4qCqAdHcNKeAHcK8tJ8wNJZa9cx.jpg,"공화국이 붕괴하고 제국이 수립된 뒤 20년, 제다이 기사단은 전멸하고 강력한 제국군...",영화,5.0,25.0


In [17]:
# column 순서 변경
new_order = [
    'asset_id', 'title', 'original_title', 'genre', 'adult',
    'runtime', 'release_year', 'release_month', 'release_day', 'actors', 'director', 'orgnl_cntry',
    'original_language', 'vote_average', 'vote_count', 'popularity',
    'poster_path', 'backdrop_path', 'overview', '분류'
]

In [18]:
# 열 순서 변경
df = df[new_order]
df.head(1)

Unnamed: 0,asset_id,title,original_title,genre,adult,runtime,release_year,release_month,release_day,actors,director,orgnl_cntry,original_language,vote_average,vote_count,popularity,poster_path,backdrop_path,overview,분류
0,cjc|M4820686LSGE28658201,스타워즈 4-새로운 희망,Star Wars,"SF,모험,액션",False,121.0,1977.0,5.0,25.0,"마크 해밀,캐리 피셔,해리슨 포드,알렉 기네스,피터 커싱",조지 루카스,미국,en,8.204,20724.0,118.411,/7XFfURIFCJxN1mfBg0SAjk5yGzg.jpg,/4qCqAdHcNKeAHcK8tJ8wNJZa9cx.jpg,"공화국이 붕괴하고 제국이 수립된 뒤 20년, 제다이 기사단은 전멸하고 강력한 제국군...",영화


In [19]:
# date 열 + runtime + vote_count 정수값으로 변경
# 실수(float) → 정수(Int64) 변환 (NaN 허용)
df['runtime'] = df['runtime'].astype(pd.Int64Dtype())
df['release_year'] = df['release_year'].astype(pd.Int64Dtype())
df['release_month'] = df['release_month'].astype(pd.Int64Dtype())
df['release_day'] = df['release_day'].astype(pd.Int64Dtype())
df['vote_count'] = df['vote_count'].astype(pd.Int64Dtype())


In [20]:
df.head(1)

Unnamed: 0,asset_id,title,original_title,genre,adult,runtime,release_year,release_month,release_day,actors,director,orgnl_cntry,original_language,vote_average,vote_count,popularity,poster_path,backdrop_path,overview,분류
0,cjc|M4820686LSGE28658201,스타워즈 4-새로운 희망,Star Wars,"SF,모험,액션",False,121,1977,5,25,"마크 해밀,캐리 피셔,해리슨 포드,알렉 기네스,피터 커싱",조지 루카스,미국,en,8.204,20724,118.411,/7XFfURIFCJxN1mfBg0SAjk5yGzg.jpg,/4qCqAdHcNKeAHcK8tJ8wNJZa9cx.jpg,"공화국이 붕괴하고 제국이 수립된 뒤 20년, 제다이 기사단은 전멸하고 강력한 제국군...",영화


In [22]:
df.to_csv("data/movies_mart_14224.csv", encoding='utf8', index=False)

In [25]:
from chromadb import PersistentClient
import json

In [26]:
client = PersistentClient(path="./test_chromadb")


In [27]:
# 컬렉션 생성 또는 불러오기
collection = client.get_or_create_collection(name="movies")

# 샘플 데이터 삽입 (documents 포함)
collection.add(
    ids=["1", "2", "3"],
    embeddings=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]],  # 예제용 더미 벡터
    metadatas=[
        {"title": "Inception", "genres": "Sci-Fi, Thriller", "year": 2010},
        {"title": "Titanic", "genres": "Romance, Drama", "year": 1997},
        {"title": "Interstellar", "genres": "Sci-Fi, Adventure", "year": 2014}
    ],
    documents=[
        "A mind-bending thriller about dream manipulation and subconscious heists.",
        "A love story set against the backdrop of a tragic shipwreck.",
        "A space exploration journey to find a new habitable planet for humanity."
    ]
)

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


In [None]:
'[|김상현|남민주|]'
filter "|김상현|"

In [None]:
df [{
g : ["a","b"]

}]

page_content=  []

In [None]:
'['김상현',"남민주"]'
simiarity(filter(in) "'김상현'")


In [34]:
results = collection.query(
    query_embeddings=[[0.1, 0.2, 0.3]],
    n_results=1,
    # where={"genres": {"$in": ["Sci-Fi"]}},  # Sci-Fi 장르만 필터링
    where_document={"$contains": "thriller"}  # "space" 단어가 포함된 문서만 검색
)

print(results)


{'ids': [[]], 'distances': [[]], 'metadatas': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'data': None}
