In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("open_api_key")

## 그래프 DB

### 환경 세팅

In [1]:
from pathlib import Path
working_dir = Path('working_directory')
working_dir.mkdir(parents=True, exist_ok=True)

In [2]:
!graphrag init --root ./working_directory

2025-07-29 03:26:20.0022 - INFO - graphrag.cli.initialize - Initializing project at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory


### 그래프 DB 구축

In [3]:
input_dir = working_dir / 'input'
input_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import shutil
import os
source_path = r"data\How_to_invest_money.txt"
destination_path = r"working_directory\input\How_to_invest_money.txt"

shutil.copy(source_path, destination_path)

if os.path.exists(destination_path):
    print(f"파일이 {destination_path}에 성공적으로 복사됨")
else:
    print("복사 실패")

In [4]:
!graphrag index --root ./working_directory

2025-07-29 03:28:05.0365 - INFO - graphrag.cli.index - Logging enabled at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\logs\logs.txt
2025-07-29 03:28:06.0302 - INFO - graphrag.index.validate_config - LLM Config Params Validated
2025-07-29 03:28:06.0761 - INFO - graphrag.index.validate_config - Embedding LLM Config Params Validated
2025-07-29 03:28:06.0761 - INFO - graphrag.cli.index - Starting pipeline run. False
2025-07-29 03:28:06.0762 - INFO - graphrag.cli.index - Using default configuration: {
    "root_dir": "C:\\Users\\kimji\\Desktop\\ProgramFile\\Study\\Study_LLM\\working_directory",
    "models": {
        "default_chat_model": {
            "api_key": "==== REDACTED ====",
            "auth_type": "api_key",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "encoding_model": "cl100k_base",
            "api_base": null,
            "api_version": null,
            "deployment_name": null,
            "proxy": null

[2025-07-28T18:32:08Z WARN  lance::dataset::write::insert] No existing dataset at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output\lancedb\default-entity-description.lance, it will be created
[2025-07-28T18:32:09Z WARN  lance::dataset::write::insert] No existing dataset at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output\lancedb\default-community-full_content.lance, it will be created
[2025-07-28T18:32:10Z WARN  lance::dataset::write::insert] No existing dataset at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output\lancedb\default-text_unit-text.lance, it will be created


In [None]:
!graphrag query --query "돈을 투자하는 방법은?" --method local --root ./working_directory

## Graph RAG 질의

In [None]:
import os
os.chdir(r'C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory')
print(os.getcwd()) 

In [None]:
import pandas as pd
pd.read_parquet("output/entities.parquet").head()


In [None]:
import pandas as pd
pd.read_parquet("output/communities.parquet").head()

In [None]:
!graphrag query\
--root ./ \
--method global \
--query "부동산 담보 대출(real-estate mortgages)의 위험 요소는 무엇인가?"


In [None]:
!graphrag query\
--root ./ \
--method local \
--query "부동산 담보 대출(real-estate mortgages)의 위험 요소는 무엇인가?"

In [None]:
!graphrag query\
--root ./ \
--method global \
--query "산업채권 평가  시 순유동자산(net quick assets)을 어떻게 분석해야 하나요"


## GraphRAG(Neo4j + 랭체인) 구현

### 지식그래프 구축

In [5]:
import os

os.chdir(r'C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory')
GRAPHRAG_FOLDER = r"C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output"
print(os.getcwd()) 

C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory


In [14]:
df_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/community_reports.parquet")
print(df_check.columns.tolist())


['id', 'human_readable_id', 'community', 'level', 'parent', 'children', 'title', 'summary', 'full_content', 'rank', 'rating_explanation', 'findings', 'full_content_json', 'period', 'size']


In [None]:
from neo4j import GraphDatabase
import pandas as pd
import time

NEO4J_URI= r"neo4j+s://64ba1d93.databases.neo4j.io"
NEO4J_USERNAME = r"neo4j"
NEO4J_PASSWORD = r"aii9OqmpymY37RWOR-sYTkK0ZxsFzN1Dkrsn19GA4YE"
NEO4J_DATABASE = r"neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a approach

    Parameters : statement is the Cypher query to execute, df is the dataframe to
    import, and batch_size is the number of rows to import in each batch.
    """

    total = len(df)
    start_s = time.time()
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start+batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value" + statement,
            rows = batch.to_dict("records"),
            database_ = NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.time() - start_s} s.")
    return total


statements = [
    "\ncreate constraint chunk_id if not  exists for (c: __Chunk__) require c.id is unique",
    "\ncreate constraint document_id if not exists for (d:__Document__) require d.id is unique",
    "\ncreate constraint community_id if not exists for (c:__Community__) require c.community is unique",
    "\ncreate constraint entity_title if not exists for (e:__Entity__) require e.name is unique",
    "\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique",
    "\ncreate constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique",
    "\ncreate constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique",
    "\n",
]

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


doc_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/documents.parquet", columns=["id","title"])
doc_statement = """
MERGE (d : __Document__ {id: value.id})
    SET d += value {.title}
"""
batched_import(doc_statement, doc_df)

text_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/text_units.parquet", columns=["id","text","n_tokens","document_ids"])
text_statement ="""
    MERGE (c : __Chunk__ {id: value.id})
    SET c += value {.text, .n_tokens}
    WITH c, value
    UNWIND value.document_ids AS document
    MATCH (d:__Document__ {id: document})
    MERGE (c)-[:PART_OF]->(d)
"""
batched_import(text_statement, text_df)

entity_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/entities.parquet", columns=["title", "type","description", "human_readable_id", "id", "text_unit_ids"])
entity_statement ="""
    MERGE (e : __Entity__ {id: value.id})
    SET e.human_readable_id = value.human_readable_id, e.description = value.description, e.name = coalesce(replace(value.title, '"', ''), 'Unknown')
    WITH e, value
    CALL apoc.create.addLabels(e, CASE WHEN coalesce(value.type, "") = "" THEN [] ELSE [apoc.text.upperCamelCase(replace(value.type, '"', ''))] END) YIELD node
    UNWIND value.text_unit_ids AS text_unit
    MATCH (c : __Chunk__ {id: text_unit})
    MERGE (c)-[:HAS_ENTITY]->(e)
"""
batched_import(entity_statement, entity_df)


# 1. RELATIONSHIP
rel_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/relationships.parquet", columns=[
    "id", "source", "target", "combined_degree", "weight", "human_readable_id", "description", "text_unit_ids"
])
rel_df = rel_df.rename(columns={"combined_degree": "rank"})
rel_df = rel_df.dropna(subset=["id", "source", "target"])  # MERGE id null 방지

rel_statement = """
    WITH replace(value.source, '"', '') AS source_name,
         replace(value.target, '"', '') AS target_name,
         value AS value
    MATCH (source : __Entity__ {name: source_name})
    MATCH (target : __Entity__ {name: target_name})
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""
batched_import(rel_statement, rel_df)

# 2. COMMUNITY
community_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/communities.parquet", columns=[
    "id", "level", "title", "text_unit_ids", "relationship_ids"
])
community_df = community_df.dropna(subset=["title"])  # MERGE 키 null 방지

community_statement = """
    MERGE (c : __Community__ {community: value.title})
    SET c.title = value.title, c.level = value.level
    WITH c, value
    UNWIND value.text_unit_ids as text_unit_id
    MATCH (t : __Chunk__ {id: text_unit_id})
    MERGE (c)-[:HAS_CHUNK]-> (t)
    WITH distinct c, value
    UNWIND value.relationship_ids as rel_id
    MATCH (start : __Entity__)-[:RELATED {id: rel_id}]->(end: __Entity__)
    MERGE (start)-[:IN_COMMUNITY]->(c)
    MERGE (end)-[:IN_COMMUNITY]->(c)
    RETURN count(distinct c) as createdCommunities
"""
batched_import(community_statement, community_df)

# 3. COMMUNITY REPORT
community_report_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/community_reports.parquet", columns=['id', 'community', 'level', 'title', 'summary', 'full_content', 'rank', 'rating_explanation', 'findings'])
community_report_df["community"] = "Community" + community_report_df["community"].astype(str)

community_report_statement = """
    MERGE (c:__Community__ {community: value.community})
    SET c.level = value.level,
        c.name = value.title,
        c.rank = value.rank,
        c.rank_explanation = value.rank_explanation,
        c.full_content = value.full_content,
        c.summary = value.summary
    WITH c, value
    UNWIND range(0, size(value.findings)-1) AS finding_idx
    WITH c, value, finding_idx, value.findings[finding_idx] AS finding
    MERGE (f:Finding {id: value.community + "_" + finding_idx})
    MERGE (c)-[:HAS_FINDING]->(f)
    SET f += finding
"""
batched_import(community_report_statement, community_report_df)

'''
# 4. NODE
node_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/node.parquet", columns=[
    "id", "human_readable_id", "title", "community", "level", "degree", "x", "y"
])
node_df["community"] = "Community" + node_df["community"].astype(str)
node_df = node_df.dropna(subset=["title"])  # replace 사용 방지

node_statement = """
    WITH value, replace(value.title, '"', '') AS clean_title
    MATCH (e : __Entity__) WHERE e.name = clean_title
    MERGE (c:__Community__ {community: value.community})
    MERGE (e)-[:IN_COMMUNITY]->(c)
"""
batched_import(node_statement, node_df)
'''


create constraint chunk_id if not  exists for (c: __Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint community_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique
{'_contains_updates': True, 'properties_set': 1}
1 rows in 0.23665452003479004 s.
{'_contains_updates': True, 'properties_set': 58}
29 rows in 0.6263918876647949 s.
{'_contains_updates': True, 'properties_set': 51}
17 rows in 0.282318115234375 s.
{'_contains_updates': True, 'properties_set': 90}
18 rows in 0.2740023136138916 s.
{'_contains_updates': True, 'pro

'\n# 4. NODE\nnode_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/node.parquet", columns=[\n    "id", "human_readable_id", "title", "community", "level", "degree", "x", "y"\n])\nnode_df["community"] = "Community" + node_df["community"].astype(str)\nnode_df = node_df.dropna(subset=["title"])  # replace 사용 방지\n\nnode_statement = """\n    WITH value, replace(value.title, \'"\', \'\') AS clean_title\n    MATCH (e : __Entity__) WHERE e.name = clean_title\n    MERGE (c:__Community__ {community: value.community})\n    MERGE (e)-[:IN_COMMUNITY]->(c)\n"""\nbatched_import(node_statement, node_df)\n'

### 질의 실습

In [None]:
# 로컬 검색
from langchain_neo4j import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import Neo4jVector

NEO4J_URI= r"neo4j+s://64ba1d93.databases.neo4j.io"
NEO4J_USERNAME = r"neo4j"
NEO4J_PASSWORD = r"aii9OqmpymY37RWOR-sYTkK0ZxsFzN1Dkrsn19GA4YE"
NEO4J_DATABASE = r"neo4j"

embedding = OpenAIEmbeddings(api_key=api_key)
graph = Neo4jVector.from_existing_graph(
    embedding=embedding,
    node_label="__Entity__",
    text_node_properties=["description"],
    embedding_node_property="embedding",
    url=NEO4J_URI,                    # ✅ 수정
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

neo4j_graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)


def fetch_entity_context(entity_name):
    context = {"name": entity_name}
    try:
        chunk_query = """
            MATCH (e:__Entity__ {name: $entity_name})<-[:HAS_ENTITY]-(c:__Chunk__)
            RETURN c.text AS text
        """
        chunk_result = neo4j_graph.query(chunk_query, {"entity_name": entity_name})
        context["text_chunks"] = [r["text"] for r in chunk_result] if chunk_result else ["No text chunk available"]

        community_query = """
            MATCH (e:__Entity__ {name: $entity_name})-[:IN_COMMUNITY]->(com:__Community__)
            RETURN com.full_content AS report
        """
        community_result = neo4j_graph.query(community_query, {"entity_name": entity_name})
        context["community_reports"] = [r["report"] for r in community_result] if community_result else ["No community report available"]

        related_query = """
            MATCH (e:__Entity__ {name: $entity_name})-[:RELATED]->(related:__Entity__)
            RETURN related.name AS name, related.description AS description
        """
        related_result = neo4j_graph.query(related_query, {"entity_name": entity_name})
        context["related_entities"] = (
            [{"name": r["name"], "decription": r["description"]} for r in related_result]
            if related_result else []
        )
    except Exception as e:
        context["error"] = f"Error fetching context : {str(e)}"
    return context


def create_structured_context(all_contexts, query):
    context_str = "##질문과 관련된 엔티티 정보\n\n"
    context_str += "아래는 질문에 답변하는 데 유용한 엔티티들의 구조화된 정보입니다.\n\n"

    for i, ctx in enumerate(all_contexts, 1):
        context_str += f"### 엔티티 {i}: {ctx['name']}\n"
        context_str += f"- **설명**: {ctx['description']}\n"
        context_str += "- **텍스트 청크**:\n"
        for chunk in ctx['text_chunks']:
            context_str += f" - {chunk}\n"
        context_str += "- **커뮤니티 보고서**:\n"
        for report in ctx['community_reports']:
            context_str += f" - {report}\n"
        
        if ctx['related_entities']:
            context_str += "- **관련 엔티티**:\n"
            for rel in ctx['related_entities']:
                context_str += f" - {rel['name']} : {rel['decription']}\n"
        else:
            context_str += "- **관련 엔티티** : 없음\n"
        context_str += "\n"
    return context_str


llm = ChatOpenAI(model="gpt-4o", api_key=api_key)
retriever = graph.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # ✅ 오타 수정

query = "마일당 순수익(NET INCOME PER MILE)을 어떻게 분석해야 하나요?"
results = retriever.get_relevant_documents(query)

all_contexts = []
for result in results:
    entity_name = result.metadata.get("name", "Unknown")
    description = result.page_content
    context = fetch_entity_context(entity_name)
    context["name"] = entity_name
    context["description"] = description
    all_contexts.append(context)

context_str = create_structured_context(all_contexts, query)

prompt = f"아래 맥락에 기반해서, 주어진 질문에 한국어로 답하세요\n\n **질문** : {query}\n\n **맥락**:\n{context_str}"

response = llm.invoke(prompt)
print("Final Response")
print(response.content)


  results = retriever.get_relevant_documents(query)


Final Response
마일당 순수익(NET INCOME PER MILE)을 분석하기 위해서는, 단순히 수익을 마일로 나눈 값 그 자체를 평가하는 것뿐만 아니라, 이를 통해 다양한 경영 및 재무적인 시사점을 도출할 수 있어야 합니다. 다음과 같은 몇 가지 방법으로 분석을 진행할 수 있습니다:

1. **비교 분석**: 동일 산업 내의 다른 회사들과 비교하여 경쟁력을 평가합니다. 마일당 순수익이 높다면, 이는 운영 효율성이 높거나 비용 절감에 성공했음을 나타낼 수 있습니다.

2. **추세 분석**: 시간이 지남에 따라 마일당 순수익이 어떻게 변화하고 있는지를 살펴봅니다. 이는 사업의 개선 혹은 악화 추세를 파악할 수 있게 해주며, 장기적인 전략 수정의 필요성을 진단하는 데 유용합니다.

3. **원가-수익 분석**: 마일당 발생하는 수익과 비용을 상세히 분석하여 어느 부분에서 비용 절감이 가능한지를 파악합니다. 이는 비용 구조를 개선하기 위한 실질적인 방안을 마련하는 데 기여할 수 있습니다.

4. **운영 효율성 평가**: 마일당 순수익을 통해 전체적인 운영 효율성을 평가할 수 있습니다. 예를 들어, 수익성이 낮다면, 어떤 부분에서 비효율적인 운영이 발생하고 있는지를 검토할 필요가 있습니다.

5. **재무적 전략 수립**: 마일당 순수익은 기업의 전반적인 재무 전략, 특히 투자 및 예산 편성에 중요한 역할을 할 수 있습니다. 예를 들어, 특정 노선의 수익성이 낮다면, 그 노선에 대한 투자를 줄이거나 운영 방식을 변경하는 등의 의사결정을 내릴 수 있습니다.

마일당 순수익 분석은 기업의 재무 상태와 운영 전략을 개선하는 데 핵심적인 정보를 제공하며, 이를 통해 경쟁력을 유지하고 강화할 수 있는 방법을 모색할 수 있습니다.


In [24]:
# 글로벌 검색
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from tqdm import tqdm

llm = ChatOpenAI(model="gpt-4o", api_key=api_key)

MAP_SYSTEM_PROMPT = """
    --- 역할 ---
    제공된 컨텍스트를 활용하여 사용자의 질문에 답하는 어시스턴트입니다.

    --- 목표 ---
    주정진 컨텍스트가 질문을 답하기에 적절하다면 질문에 대한 답을 한 뒤, 
    답변의 중요도 점수를 입력하여 JSON 형식으로 생성하세요
    정보가 부족하다면 "모르겠습니다"라고 답하세요.

    각 포인트는 다음을 포함해야 합니다.
    - 답변 : 질문에 대한 답변
    - 중요도 점수 : 0~100점 사이의 정수

    데이터 참조 예:
    "예시 문장 [Data : Reports (2,7,64,46,34, +more)]"
    (한 참조에 5개 이상의 id는 "+more"를 사용)

    출력 예:
    {{"Answer" : "답변 [Data : Reports (보고서들 id들)]", "score":점수}}
"""

map_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", MAP_SYSTEM_PROMPT),
        ("human", "question : {question}\n\n context:{context}"),
    ]
)
map_chain = map_prompt|llm|StrOutputParser()


REDUCE_SYSTEM_PROMPT = """
    --- 역할 ---
    맵 단계에서 처리된 여러 결과를 종합하여 사용자의 지문에 답하는 어시스턴트입니다.

    --- 목표 ---
    제공된 맵 단계 결과를 바탕으로, 질문에 대한 종합적인 답변을 마크다운 형식으로 작성하세요
    중요도 점수를 고려하여 핵심적인 결과 위주로 반영하며, 불필요한 세부 사항은 제외하세요.
    핵심 포인트와 시사점을 포함하고, 정보가 부족한 경우 "모르겠습니다."라고 답하세요

    --- 맵 단계 결과 ---
    {report_data}
    데이터 참조 형식은 아래를 따르세요:
    " 예시 문장 [Data: Reports (2,7,34,46,64,+more)]"
    (참조 ID가 5개 이상일 경우 "+more" 사용)
    대상 응답 길이 및 형식 : {response_type}
"""

reduce_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", REDUCE_SYSTEM_PROMPT),
        ("human", "{question}"),
    ]
)

reduce_chain = reduce_prompt|llm|StrOutputParser()

response_type: str = "multiple paragraphs"
def global_retriever(query:str, level:int, response_type:str=response_type) -> str:
    community_data = graph.query(
        """
            MATCH (c:__Community__)
            WHERE c.level = $level
            RETURN c.full_context AS output
        """,
        params={"level":level},
    )

    intermediate_results = []
    for community in tqdm(community_data, desc="Processing communities"):
        intermediate_response = map_chain.invoke({"question":query, "context":community["output"]})
        intermediate_results.append(intermediate_response)
    final_response = reduce_chain.invoke(
        {
            "report_data" : intermediate_results,
            "question" : query,
            "response_type" : response_type,
        }    
    )
    return final_response

# print(global_retriever("마일당 순수익(NET INCOME PER MILE)을 어떻게 분석해야 하나요??",1))
print(global_retriever("이 책의 주제가 뭐야?",0))

Processing communities: 100%|██████████| 10/10 [00:10<00:00,  1.01s/it]


모르겠습니다. 제공된 데이터에서 책의 주제에 대한 정보는 포함되어 있지 않습니다. [Data: Reports (2,7,34,46,64,+more)]


In [31]:
print(global_retriever("레포트에서 다루는 핵심 주제는 무엇인가요?", 0))


Processing communities: 100%|██████████| 10/10 [00:12<00:00,  1.27s/it]


모르겠습니다. 제공된 맵 단계 결과에서는 레포트의 핵심 주제를 파악할 수 있는 정보가 부족합니다. 각 결과에서 동일하게 "모르겠습니다"라는 응답이 반복되고 있으며, 중요한 정보나 힌트를 제공하지 않고 있습니다. 추가적인 맥락이나 데이터가 필요하다면, 다른 질문을 시도하거나 추가 정보를 제공해 주실 수 있습니다.


In [36]:
import pandas as pd

GRAPHRAG_FOLDER = r"C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output"

community_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/communities.parquet")
entities_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/entities.parquet")
text_units_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/text_units.parquet")

In [34]:
community_check.head()

Unnamed: 0,id,human_readable_id,community,level,parent,children,title,entity_ids,relationship_ids,text_unit_ids,period,size
0,7c9800ce-59c4-4a80-b18f-0a5de5bfec39,0,0,0,-1,[],Community 0,"[3c1625ef-ff3b-4823-b813-1eeea516444c, a9e4156...","[04da4aa4-3a2f-4ab2-be1e-5c624c92c973, 0d3e1f8...",[3291681cefdb3df93a491259dd0c6ca86e3aed2809d8d...,2025-07-27,4
1,5ee1c6ca-c160-4231-b8be-0a000831e5f3,1,1,0,-1,[],Community 1,"[3053af77-d624-4184-b5c4-2df39616d8d2, f029798...","[168203ac-b7bb-4e5c-bfbe-6f33bf1421f4, 2eb5762...",[3291681cefdb3df93a491259dd0c6ca86e3aed2809d8d...,2025-07-27,3
2,3a0b18b7-1cda-4d34-80d1-91454119742e,2,2,0,-1,[],Community 2,"[39e7cf42-3ba6-42a4-9216-bf38d18ab448, d827bd7...","[63b2e9bb-4bb4-48b3-9933-0115d07dd1a7, 9ee7c94...",[7545e407be21a6c05c05035ab3d90113699580ec998fb...,2025-07-27,4


In [37]:
entities_check.head(10)

Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids,frequency,degree,x,y
0,a9e41568-4407-4f5e-ab61-1023a25f1b97,0,PROJECT GUTENBERG,ORGANIZATION,Project Gutenberg is a pioneering digital libr...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,4,5,0,0
1,f229545c-7b63-4238-8e34-0999dd285ed0,1,UNITED STATES,GEO,The United States is a country recognized for ...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,2,2,0,0
2,afb5b14c-9ad6-404d-a88e-ac8ac72635dc,2,GEORGE GARR HENRY,PERSON,"Author of ""How to Invest Money"", Vice-Presiden...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,3,0,0
3,0ddf73ab-939e-4de9-bd0c-e10faceffd2b,3,JULIA NEUFELD,PERSON,"Contributor to the production of the eBook ""Ho...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,0,0
4,2a45468c-8b01-4f93-8d90-88d9d8606556,4,ONLINE DISTRIBUTED PROOFREADING TEAM,ORGANIZATION,Group involved in the production of the eBook ...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,2,0,0
5,0aee33e0-7f89-49cb-818c-980079aaf8b9,5,INTERNET ARCHIVE/AMERICAN LIBRARIES,ORGANIZATION,"Source of images for the eBook ""How to Invest ...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,0,0
6,cd6b4392-861e-4d34-96a1-cb2d598b4540,6,FUNK & WAGNALLS COMPANY,ORGANIZATION,"Publisher of ""How to Invest Money"", located in...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,0,0
7,a50e280e-a68a-4356-940a-0aacf399dfef,7,GUARANTY TRUST COMPANY OF NEW YORK,ORGANIZATION,Organization where George Garr Henry served as...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,0,0
8,d9a3c4c6-6668-4c2d-bb85-a1c5d2991383,8,SYSTEM MAGAZINE,,,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,0,0
9,d827bd75-813d-4a1f-8b5d-7ebecca70fff,9,PROJECT GUTENBERG LITERARY ARCHIVE FOUNDATION,ORGANIZATION,The Project Gutenberg Literary Archive Foundat...,[ad31c63c1ca6a1f679fbbfcad23e44035953fcb759fc4...,3,4,0,0


In [38]:
text_units_check.head(10)

Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,549303b4d8de9adc2722b88096c96047d333a0ae0b5369...,1,﻿The Project Gutenberg eBook of How to Invest ...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,"[a9e41568-4407-4f5e-ab61-1023a25f1b97, f229545...","[75f1b558-a2c5-45c7-b164-90d431c8bb23, 22f448a...",[]
1,552d40b335e06d33643e78da48f7499a526f8fa1f27b57...,2,way in which to dispose of it. It is obviousl...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
2,c024dc1d1b06f88ddd921300222ed0a569307e0b55a448...,3,\nmore than obedience to the old rule which fo...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
3,c071fb8607b6007d80519d742ec2a81ff896ef3e5e95f3...,4,"terms of a lease, by the railroads\nwhich use...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
4,4bd9417eeedfa8493f4699924f3ee9a4324d5cfebb3c60...,5,ust.\n\n\n\n\nII\n\nRAILROAD MORTGAGE BONDS\n\...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
5,81b68038f6f230e5da687de75906f0f89021a61caa5c55...,6,", to mortgage bonds upon the\ngeneral mileage ...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
6,47580f85d0dd81692165c417576f6c4cf7b015ec21838c...,7,", it does not always follow that its operating...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
7,30ba7af0e052a9f08830b2853c6777672a88fcb6ae3ec7...,8,"outstanding April 1st, 1908, at the market pr...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
8,a2a423b48b790e6d9511ccd340618421bf574e7e866d55...,9,to one of two standard forms: (1) The conditi...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
9,e7adb4654819efd85be45646189342bc909210a39f4c95...,10,both. Two of these railroads offered to the ho...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
