In [25]:
import gradio as gr
import lancedb
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
import os
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import pandas as pd
from llama_index.core import ServiceContext

import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [3]:
# Settings를 사용하여 임베딩 및 LLM 설정
#Settings.embed_model = HuggingFaceEmbedding(model_name="bert-base-uncased")  # 768 차원의 벡터를 생성하는 모델
#Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

#all-mpnet-base-v2 (768차원, 정확도 높음, 속도 중간)
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")


#Settings.embed_model = HuggingFaceEmbedding(model_name="gemma2")

# 📌 Ollama 모델 설정 (로컬 모델 사용) 
#llm = OllamaLLM(model_name="gemma2")  # Ollama 모델 설정
#Settings.llm = llm


llm = Ollama(model="gemma2",base_url="http://localhost:11434/v1")
Settings.llm = llm

In [29]:
# 📌 GraphRAG에서 생성한 LanceDB 불러오기
#path = "/Users/jun/GitStudy/Data_4/Data/project5/model/openaitest_0209/output/20250206-001434/artifacts"
path = "/Users/jun/GitStudy/Data_4/Data/project5/model/graphrag_t_2/output/20250204-200327/artifacts"
os.chdir(path)

In [31]:
INPUT_DIR = path
LANCEDB_URI = f"/Users/jun/GitStudy/Data_4/Data/project5/model/graphrag_t_2/output/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [37]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 19


Unnamed: 0,id,human_readable_id,title,community,level,degree,x,y
0,a73a3176-fcbc-4164-b627-d7fcab2f4208,0,UGWORT TIGER GRASS COLOR CORRECTING TREATMENT ...,-1,0,0,,
1,b6090f14-116c-4e03-acdc-8a09a7601d99,1,GOOD FOR OILY SKIN,-1,0,0,,
2,88872b66-0c1d-4f36-a13f-d02a78185d5b,2,REDNESS REDUCING,0,0,1,0.0,0.0
3,3c6d86ba-46cb-4d67-aa31-3bdd75ee74e5,3,REDUCES IRRITATION,-1,0,0,,
4,9aa4edf8-1c7a-4f41-b9b7-47fa8c3ebd11,4,ACNE FIGHTING,-1,0,0,,


In [35]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 1


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,16ebc622-875d-4831-a9f4-35ae87c99f84,0,REDNESS REDUCING,ADVANCED SNAIL 96 MUCIN POWER ESSENCE,The product description states that it reduces...,2.0,2,[e60c5576771b6969e5cc28927826c697ab796cca65489...


In [36]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jun/GitStudy/Data_4/Data/project5/model/graphrag_t_2/output/20250204-200327/artifacts/create_final_covariates.parquet'

In [8]:
indexes = {name: VectorStoreIndex.from_vector_store(store, storage_context=storage_contexts[name]) for name, store in vector_stores.items()}
indexes

{'default-community-full_content': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x34410d990>,
 'default-entity-description': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x3440cee50>,
 'default-text_unit-text': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x3440cd310>}

In [9]:
# ✅ 여러 테이블을 사용할 수 있도록 Query 엔진을 딕셔너리 형태로 저장
query_engines = {name: index.as_query_engine() for name, index in indexes.items()}
query_engines

{'default-community-full_content': <llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x341f87210>,
 'default-entity-description': <llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x3440c5890>,
 'default-text_unit-text': <llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x3440c56d0>}

In [14]:
# 📌 파일 업로드 후 문서 처리
def process_uploaded_files(files):
    """사용자가 업로드한 파일을 처리하여 LlamaIndex에 추가"""
    if not files:
        return None  # 파일이 없으면 무시

    # 파일 저장 경로
    upload_dir = "uploaded_files"
    os.makedirs(upload_dir, exist_ok=True)

    # 업로드된 파일 저장
    file_paths = []
    for file in files:
        file_path = os.path.join(upload_dir, file.name)
        file_paths.append(file_path)

        # Parquet 파일 처리
        if file.name.endswith(".parquet"):
            try:
                df = pd.read_parquet(file)  # ✅ Parquet 파일을 DataFrame으로 변환
                text_data = df.to_string(index=False)  # ✅ DataFrame을 문자열로 변환
                text_file_path = file_path.replace(".parquet", ".txt")  # 변환된 파일명
                with open(text_file_path, "w", encoding="utf-8") as text_file:
                    text_file.write(text_data)  # ✅ TXT 파일로 저장하여 LlamaIndex가 읽을 수 있도록 변환
                file_paths.append(text_file_path)  # 변환된 파일을 추가
            except Exception as e:
                print(f"❌ Parquet 파일 변환 실패: {e}")
                return None  # 변환 실패 시 무시

        else:
            # 기존 방식대로 파일 저장
            with open(file_path, "wb") as f:
                f.write(file.read())

    # 📌 새 문서 인덱싱
    documents = SimpleDirectoryReader(input_files=file_paths).load_data()
    new_index = VectorStoreIndex.from_documents(documents)
    return new_index.as_query_engine()

# 📌 사용자 메시지 처리
def answer(message, history, files):
    
    global query_engines
    """사용자의 질문을 받고, 기존 GraphRAG 데이터 + 업로드된 문서 데이터로 답변"""
    
    # 기존 GraphRAG 데이터 (여러 테이블에서 처리)
    query_engine_list = list(query_engines.values())  
    
    # 업로드된 파일이 있을 경우 새롭게 인덱싱하여 추가
    new_query_engine = process_uploaded_files(files)
    if new_query_engine:
        query_engine_list.append(new_query_engine)
        query_engines["uploaded_files"] = new_query_engine # ✅ 업로드한 문서도 추가
    
    # 모든 쿼리 엔진에서 질의 수행
    responses = []
    for qe in query_engine_list:
        print(qe)
        if hasattr(qe, 'query'):  # query 엔진이 query 메서드를 가지고 있는지 확인
            responses.append(qe.query(message["text"]))
        else:
            print(f"❌ {qe}는 query 메서드를 가지고 있지 않습니다.")
    
    # 📌 응답을 종합하여 반환
    return "\n\n---\n\n".join([str(resp) for resp in responses])

In [25]:
# 📌 Gradio 인터페이스 설정
demo = gr.ChatInterface(
    answer,
    type="messages",
    title="GraphRAG + Ollama RAG Chatbot",
    description="GraphRAG에서 생성한 LanceDB 데이터와 사용자가 업로드한 문서를 활용한 Ollama 기반 RAG Chatbot!",
    textbox=gr.MultimodalTextbox(file_types=[".pdf", ".txt"]),
    multimodal=True  # 파일 업로드 허용
)

# 📌 실행
demo.launch()

* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.






<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x341f87210>
relation :  {}


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/leo4study/lib/python3.11/site-packages/llama_index/vector_stores/lancedb/base.py", line 529, in query
    print(f"item_json : {item_json}")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Caskroom/miniconda/base/envs/leo4study/lib/python3.11/site-packages/llama_index/core/vector_stores/utils.py", line 70, in metadata_dict_to_node
    raise ValueError("Node content not found in metadata dict.")
ValueError: Node content not found in metadata dict.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/leo4study/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._lib