In [14]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("open_api_key")

## 그래프 DB

### 환경 세팅

In [2]:
from pathlib import Path
working_dir = Path('working_directory')
working_dir.mkdir(parents=True, exist_ok=True)

In [3]:
!graphrag init --root ./working_directory

2025-07-29 22:02:57.0769 - INFO - graphrag.cli.initialize - Initializing project at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory


### 그래프 DB 구축

In [4]:
input_dir = working_dir/'input'
input_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import shutil
import os
source_path = r"data\How_to_invest_money.txt"
destination_path = r"working_directory\input\How_to_invest_money.txt"

shutil.copy(source_path, destination_path)

if os.path.exists(destination_path):
    print(f"파일이 {destination_path}에 성공적으로 복사됨")
else:
    print("복사 실패")

In [5]:
!graphrag index --root ./working_directory

2025-07-29 22:03:54.0925 - INFO - graphrag.cli.index - Logging enabled at C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\logs\logs.txt
2025-07-29 22:03:55.0988 - INFO - graphrag.index.validate_config - LLM Config Params Validated
2025-07-29 22:03:57.0124 - INFO - graphrag.index.validate_config - Embedding LLM Config Params Validated
2025-07-29 22:03:57.0125 - INFO - graphrag.cli.index - Starting pipeline run. False
2025-07-29 22:03:57.0126 - INFO - graphrag.cli.index - Using default configuration: {
    "root_dir": "C:\\Users\\kimji\\Desktop\\ProgramFile\\Study\\Study_LLM\\working_directory",
    "models": {
        "default_chat_model": {
            "api_key": "==== REDACTED ====",
            "auth_type": "api_key",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "encoding_model": "cl100k_base",
            "api_base": null,
            "api_version": null,
            "deployment_name": null,
            "proxy": null

  warn(
INFO:graphrag.index.workflows.finalize_graph:Workflow completed: finalize_graph
INFO:graphrag.api.index:Workflow finalize_graph completed successfully
INFO:graphrag.index.workflows.extract_covariates:Workflow started: extract_covariates
INFO:graphrag.index.workflows.extract_covariates:Workflow completed: extract_covariates
INFO:graphrag.api.index:Workflow extract_covariates completed successfully
INFO:graphrag.index.workflows.create_communities:Workflow started: create_communities
INFO:graphrag.utils.storage:reading table from storage: entities.parquet
INFO:graphrag.utils.storage:reading table from storage: relationships.parquet
INFO:graphrag.index.workflows.create_communities:Workflow completed: create_communities
INFO:graphrag.api.index:Workflow create_communities completed successfully
INFO:graphrag.index.workflows.create_final_text_units:Workflow started: create_final_text_units
INFO:graphrag.utils.storage:reading table from storage: text_units.parquet
INFO:graphrag.utils.s

In [None]:
!graphrag query --query "돈을 투자하는 방법은?" --method local --root ./working_directory

## Graph RAG 질의

In [None]:
import os
os.chdir(r'C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory')
print(os.getcwd()) 

In [None]:
import pandas as pd
pd.read_parquet("output/entities.parquet").head()


In [None]:
import pandas as pd
pd.read_parquet("output/communities.parquet").head()

In [None]:
!graphrag query\
--root ./ \
--method global \
--query "부동산 담보 대출(real-estate mortgages)의 위험 요소는 무엇인가?"


In [None]:
!graphrag query\
--root ./ \
--method local \
--query "부동산 담보 대출(real-estate mortgages)의 위험 요소는 무엇인가?"

In [None]:
!graphrag query\
--root ./ \
--method global \
--query "산업채권 평가  시 순유동자산(net quick assets)을 어떻게 분석해야 하나요"


## GraphRAG(Neo4j + 랭체인) 구현

### 지식그래프 구축

In [1]:
import os

os.chdir(r'C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory')
GRAPHRAG_FOLDER = r"C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output"
print(os.getcwd()) 

C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory


In [2]:
from neo4j import GraphDatabase

NEO4J_URI= r"neo4j+s://64ba1d93.databases.neo4j.io"
NEO4J_USERNAME = r"neo4j"
NEO4J_PASSWORD = r"aii9OqmpymY37RWOR-sYTkK0ZxsFzN1Dkrsn19GA4YE"
NEO4J_DATABASE = r"neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

constraints = [
    "DROP CONSTRAINT chunk_id IF EXISTS",
    "DROP CONSTRAINT document_id IF EXISTS",
    "DROP CONSTRAINT community_id IF EXISTS",
    "DROP CONSTRAINT entity_title IF EXISTS",
    "DROP CONSTRAINT entity_id IF EXISTS",
    "DROP CONSTRAINT covariate_title IF EXISTS",
    "DROP CONSTRAINT related_id IF EXISTS"
]

for stmt in constraints:
    driver.execute_query(stmt, database_=NEO4J_DATABASE)


In [3]:
# 모든 노드 및 관계 삭제
driver.execute_query("MATCH (n) DETACH DELETE n", database_=NEO4J_DATABASE)


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x000002AA1AA2F690>, keys=[])

In [4]:
from neo4j import GraphDatabase
import pandas as pd
import time

NEO4J_URI= r"neo4j+s://64ba1d93.databases.neo4j.io"
NEO4J_USERNAME = r"neo4j"
NEO4J_PASSWORD = r"aii9OqmpymY37RWOR-sYTkK0ZxsFzN1Dkrsn19GA4YE"
NEO4J_DATABASE = r"neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a approach

    Parameters : statement is the Cypher query to execute, df is the dataframe to
    import, and batch_size is the number of rows to import in each batch.
    """

    total = len(df)
    start_s = time.time()
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start+batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value" + statement,
            rows = batch.to_dict("records"),
            database_ = NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.time() - start_s} s.")
    return total


statements = [
    "\ncreate constraint chunk_id if not  exists for (c: __Chunk__) require c.id is unique",
    "\ncreate constraint document_id if not exists for (d:__Document__) require d.id is unique",
    "\ncreate constraint community_id if not exists for (c:__Community__) require c.community is unique",
    "\ncreate constraint entity_title if not exists for (e:__Entity__) require e.name is unique",
    "\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique",
    "\ncreate constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique",
    "\ncreate constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique",
    "\n",
]

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


doc_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/documents.parquet", columns=["id","title"] )
doc_statement = """
MERGE (d : __Document__ {id: value.id})
    SET d += value {.title}
"""
batched_import(doc_statement, doc_df)

text_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/text_units.parquet", columns=["id","text","n_tokens","document_ids"])
text_statement ="""
    MERGE (c : __Chunk__ {id: value.id})
    SET c += value {.text, .n_tokens}
    WITH c, value
    UNWIND value.document_ids AS document
    MATCH (d:__Document__ {id: document})
    MERGE (c)-[:PART_OF]->(d)
"""
batched_import(text_statement, text_df)

entity_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/entities.parquet", columns=["title", "type","description", "human_readable_id", "id", "text_unit_ids"])
entity_statement ="""
    MERGE (e : __Entity__ {id: value.id})
    SET e.human_readable_id = value.human_readable_id, e.description = value.description, e.name = coalesce(replace(value.title, '"', ''), 'Unknown')
    WITH e, value
    CALL apoc.create.addLabels(e, CASE WHEN coalesce(value.type, "") = "" THEN [] ELSE [apoc.text.upperCamelCase(replace(value.type, '"', ''))] END) YIELD node
    UNWIND value.text_unit_ids AS text_unit
    MATCH (c : __Chunk__ {id: text_unit})
    MERGE (c)-[:HAS_ENTITY]->(e)
"""
batched_import(entity_statement, entity_df)


# 1. RELATIONSHIP
rel_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/relationships.parquet", columns=[
    "id", "source", "target", "combined_degree", "weight", "human_readable_id", "description", "text_unit_ids"
])
rel_df = rel_df.rename(columns={"combined_degree": "rank"})
rel_df = rel_df.dropna(subset=["id", "source", "target"])  # MERGE id null 방지

rel_statement = """
    WITH replace(value.source, '"', '') AS source_name,
         replace(value.target, '"', '') AS target_name,
         value AS value
    MATCH (source : __Entity__ {name: source_name})
    MATCH (target : __Entity__ {name: target_name})
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""
batched_import(rel_statement, rel_df)

# 2. COMMUNITY
community_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/communities.parquet", columns=[
    "id", "level", "title", "text_unit_ids", "relationship_ids"
])
community_df = community_df.dropna(subset=["title"])  # MERGE 키 null 방지

community_statement = """
    MERGE (c : __Community__ {community: value.title})
    SET c.title = value.title, c.level = value.level
    WITH c, value
    UNWIND value.text_unit_ids as text_unit_id
    MATCH (t : __Chunk__ {id: text_unit_id})
    MERGE (c)-[:HAS_CHUNK]-> (t)
    WITH distinct c, value
    UNWIND value.relationship_ids as rel_id
    MATCH (start : __Entity__)-[:RELATED {id: rel_id}]->(end: __Entity__)
    MERGE (start)-[:IN_COMMUNITY]->(c)
    MERGE (end)-[:IN_COMMUNITY]->(c)
    RETURN count(distinct c) as createdCommunities
"""
batched_import(community_statement, community_df)

# 3. COMMUNITY REPORT
community_report_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/community_reports.parquet", columns=['id', 'community', 'level', 'title', 'summary', 'full_content', 'rank', 'rating_explanation', 'findings'])
community_report_df["community"] = "Community" + community_report_df["community"].astype(str)

community_report_statement = """
    MERGE (c:__Community__ {community: value.community})
    SET c.level = value.level,
        c.name = value.title,
        c.rank = value.rank,
        c.rank_explanation = value.rank_explanation,
        c.full_content = value.full_content,
        c.summary = value.summary
    WITH c, value
    UNWIND range(0, size(value.findings)-1) AS finding_idx
    WITH c, value, finding_idx, value.findings[finding_idx] AS finding
    MERGE (f:Finding {id: value.community + "_" + finding_idx})
    MERGE (c)-[:HAS_FINDING]->(f)
    SET f += finding
"""
batched_import(community_report_statement, community_report_df)

'''
# 4. NODE
node_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/node.parquet", columns=[
    "id", "human_readable_id", "title", "community", "level", "degree", "x", "y"
])
node_df["community"] = "Community" + node_df["community"].astype(str)
node_df = node_df.dropna(subset=["title"])  # replace 사용 방지

node_statement = """
    WITH value, replace(value.title, '"', '') AS clean_title
    MATCH (e : __Entity__) WHERE e.name = clean_title
    MERGE (c:__Community__ {community: value.community})
    MERGE (e)-[:IN_COMMUNITY]->(c)
"""
batched_import(node_statement, node_df)
'''


create constraint chunk_id if not  exists for (c: __Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint community_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique
{'_contains_updates': True, 'labels_added': 1, 'nodes_created': 1, 'properties_set': 2}
1 rows in 0.2690107822418213 s.
{'_contains_updates': True, 'labels_added': 29, 'relationships_created': 29, 'nodes_created': 29, 'properties_set': 87}
29 rows in 0.6376347541809082 s.
{'_contains_updates': True, 'labels_added': 16, 'relationships_created': 22, 'nodes_creat

'\n# 4. NODE\nnode_df = pd.read_parquet(f"{GRAPHRAG_FOLDER}/node.parquet", columns=[\n    "id", "human_readable_id", "title", "community", "level", "degree", "x", "y"\n])\nnode_df["community"] = "Community" + node_df["community"].astype(str)\nnode_df = node_df.dropna(subset=["title"])  # replace 사용 방지\n\nnode_statement = """\n    WITH value, replace(value.title, \'"\', \'\') AS clean_title\n    MATCH (e : __Entity__) WHERE e.name = clean_title\n    MERGE (c:__Community__ {community: value.community})\n    MERGE (e)-[:IN_COMMUNITY]->(c)\n"""\nbatched_import(node_statement, node_df)\n'

### 질의 실습

In [15]:
# 로컬 검색
from langchain_neo4j import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import Neo4jVector

NEO4J_URI= r"neo4j+s://64ba1d93.databases.neo4j.io"
NEO4J_USERNAME = r"neo4j"
NEO4J_PASSWORD = r"aii9OqmpymY37RWOR-sYTkK0ZxsFzN1Dkrsn19GA4YE"
NEO4J_DATABASE = r"neo4j"

embedding = OpenAIEmbeddings(api_key=api_key)
graph = Neo4jVector.from_existing_graph(
    embedding=embedding,
    node_label="__Entity__",
    text_node_properties=["description"],
    embedding_node_property="embedding",
    url=NEO4J_URI,                  
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

neo4j_graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)


def fetch_entity_context(entity_name):
    context = {"name": entity_name}
    try:
        chunk_query = """
            MATCH (e:__Entity__ {name: $entity_name})<-[:HAS_ENTITY]-(c:__Chunk__)
            RETURN c.text AS text
        """
        chunk_result = neo4j_graph.query(chunk_query, {"entity_name": entity_name})
        context["text_chunks"] = [r["text"] for r in chunk_result] if chunk_result else ["No text chunk available"]

        community_query = """
            MATCH (e:__Entity__ {name: $entity_name})-[:IN_COMMUNITY]->(com:__Community__)
            RETURN com.full_content AS report
        """
        community_result = neo4j_graph.query(community_query, {"entity_name": entity_name})
        context["community_reports"] = [r["report"] for r in community_result] if community_result else ["No community report available"]

        related_query = """
            MATCH (e:__Entity__ {name: $entity_name})-[:RELATED]->(related:__Entity__)
            RETURN related.name AS name, related.description AS description
        """
        related_result = neo4j_graph.query(related_query, {"entity_name": entity_name})
        context["related_entities"] = (
            [{"name": r["name"], "decription": r["description"]} for r in related_result]
            if related_result else []
        )
    except Exception as e:
        context["error"] = f"Error fetching context : {str(e)}"
    return context


def create_structured_context(all_contexts, query):
    context_str = "##질문과 관련된 엔티티 정보\n\n"
    context_str += "아래는 질문에 답변하는 데 유용한 엔티티들의 구조화된 정보입니다.\n\n"

    for i, ctx in enumerate(all_contexts, 1):
        context_str += f"### 엔티티 {i}: {ctx['name']}\n"
        context_str += f"- **설명**: {ctx['description']}\n"
        context_str += "- **텍스트 청크**:\n"
        for chunk in ctx['text_chunks']:
            context_str += f" - {chunk}\n"
        context_str += "- **커뮤니티 보고서**:\n"
        for report in ctx['community_reports']:
            context_str += f" - {report}\n"
        
        if ctx['related_entities']:
            context_str += "- **관련 엔티티**:\n"
            for rel in ctx['related_entities']:
                context_str += f" - {rel['name']} : {rel['decription']}\n"
        else:
            context_str += "- **관련 엔티티** : 없음\n"
        context_str += "\n"
    return context_str


llm = ChatOpenAI(model="gpt-4o", api_key=api_key)
retriever = graph.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # ✅ 오타 수정

query = "마일당 순수익(NET INCOME PER MILE)을 어떻게 분석해야 하나요?"
results = retriever.get_relevant_documents(query)

all_contexts = []
for result in results:
    entity_name = result.metadata.get("name", "Unknown")
    description = result.page_content
    context = fetch_entity_context(entity_name)
    context["name"] = entity_name
    context["description"] = description
    all_contexts.append(context)

context_str = create_structured_context(all_contexts, query)

prompt = f"아래 맥락에 기반해서, 주어진 질문에 한국어로 답하세요\n\n **질문** : {query}\n\n **맥락**:\n{context_str}"

response = llm.invoke(prompt)
print("Final Response")
print(response.content)


  results = retriever.get_relevant_documents(query)


Final Response
마일당 순수익을 분석하기 위해서는 다양한 요소를 고려해야 합니다. 먼저, 수익과 비용을 명확히 정의하고 계산해야 합니다. 수익은 주로 화물 운송이나 승객 운송에서 발생하는 수익을 포함하며, 비용은 연료비, 인건비, 유지보수비, 차량 감가상각 등을 포함합니다. 

이후 총 수익에서 총 비용을 차감하여 총 순수익을 계산한 다음, 이를 총 마일 수로 나누어 마일당 순수익을 산출합니다. 

또한, 이를 분석할 때는 시간에 따른 추세를 파악하기 위해 기간별로 데이터를 비교하거나, 경쟁사나 업계 평균과 비교하여 자사의 경쟁력을 평가하는 것도 중요합니다. 이러한 분석 과정에서 특정 마일에서 왜 수익이 증가하거나 감소했는지를 이해하고, 필요한 경우 비용 절감이나 수익 증대를 위한 전략을 수립하게 됩니다. 

마지막으로, 외부 경제 요인들, 예를 들어 연료 가격의 변동이나 정책 변화 등이 수익성에 어떤 영향을 미치는지 평가해야 합니다.


In [13]:
# 글로벌 검색
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from tqdm import tqdm

llm = ChatOpenAI(model="gpt-4o", api_key=api_key)

MAP_SYSTEM_PROMPT = """
    --- 역할 ---
    제공된 컨텍스트를 활용하여 사용자의 질문에 답하는 어시스턴트입니다.

    --- 목표 ---
    주정진 컨텍스트가 질문을 답하기에 적절하다면 질문에 대한 답을 한 뒤, 
    답변의 중요도 점수를 입력하여 JSON 형식으로 생성하세요
    정보가 부족하다면 "모르겠습니다"라고 답하세요.

    각 포인트는 다음을 포함해야 합니다.
    - 답변 : 질문에 대한 답변
    - 중요도 점수 : 0~100점 사이의 정수

    데이터 참조 예:
    "예시 문장 [Data : Reports (2,7,64,46,34, +more)]"
    (한 참조에 5개 이상의 id는 "+more"를 사용)

    출력 예:
    {{"Answer" : "답변 [Data : Reports (보고서들 id들)]", "score":점수}}
"""

map_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", MAP_SYSTEM_PROMPT),
        ("human", "question : {question}\n\n context:{context}"),
    ]
)
map_chain = map_prompt|llm|StrOutputParser()


REDUCE_SYSTEM_PROMPT = """
    --- 역할 ---
    맵 단계에서 처리된 여러 결과를 종합하여 사용자의 지문에 답하는 어시스턴트입니다.

    --- 목표 ---
    제공된 맵 단계 결과를 바탕으로, 질문에 대한 종합적인 답변을 마크다운 형식으로 작성하세요
    중요도 점수를 고려하여 핵심적인 결과 위주로 반영하며, 불필요한 세부 사항은 제외하세요.
    핵심 포인트와 시사점을 포함하고, 정보가 부족한 경우 "모르겠습니다."라고 답하세요

    --- 맵 단계 결과 ---
    {report_data}
    데이터 참조 형식은 아래를 따르세요:
    " 예시 문장 [Data: Reports (2,7,34,46,64,+more)]"
    (참조 ID가 5개 이상일 경우 "+more" 사용)
    대상 응답 길이 및 형식 : {response_type}
"""

reduce_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", REDUCE_SYSTEM_PROMPT),
        ("human", "{question}"),
    ]
)

reduce_chain = reduce_prompt|llm|StrOutputParser()

response_type: str = "multiple paragraphs"
def global_retriever(query:str, level:int, response_type:str=response_type) -> str:
    community_data = graph.query(
        """
            MATCH (c:__Community__)
            WHERE c.level = $level
            RETURN c.full_context AS output
        """,
        params={"level":level},
    )

    intermediate_results = []
    for community in tqdm(community_data, desc="Processing communities"):
        intermediate_response = map_chain.invoke({"question":query, "context":community["output"]})
        intermediate_results.append(intermediate_response)
    final_response = reduce_chain.invoke(
        {
            "report_data" : intermediate_results,
            "question" : query,
            "response_type" : response_type,
        }    
    )
    return final_response

# print(global_retriever("마일당 순수익(NET INCOME PER MILE)을 어떻게 분석해야 하나요??",1))
print(global_retriever("이 책의 주제가 뭐야?",1))

Processing communities: 0it [00:00, ?it/s]


모르겠습니다. 제공된 데이터를 바탕으로 이 책의 주제를 파악할 수 있는 정보가 없습니다. 추가적인 자료가 있으면 기꺼이 도와드리겠습니다.


In [14]:
print(global_retriever("레포트에서 다루는 핵심 주제는 무엇인가요?", 0))

Processing communities: 100%|██████████| 6/6 [00:04<00:00,  1.27it/s]


모르겠습니다. 제공된 정보를 바탕으로 레포트에서 다루는 핵심 주제를 구체적으로 파악할 수 없습니다. 추가적인 정보나 구체적인 데이터가 필요합니다.


In [5]:
import pandas as pd

GRAPHRAG_FOLDER = r"C:\Users\kimji\Desktop\ProgramFile\Study\Study_LLM\working_directory\output"

community_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/communities.parquet")
entities_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/entities.parquet")
text_units_check = pd.read_parquet(f"{GRAPHRAG_FOLDER}/text_units.parquet")

In [8]:
community_check.head()

Unnamed: 0,id,human_readable_id,community,level,parent,children,title,entity_ids,relationship_ids,text_unit_ids,period,size
0,7721efc7-864b-47e5-bfb7-6e242633cb90,0,0,0,-1,[],Community 0,"[4c1181d5-9fa6-4690-986d-4010e1ed661e, 1fa7b4f...","[30c09862-3701-41b3-9695-f146ffc61bda, e0ec4a7...",[3291681cefdb3df93a491259dd0c6ca86e3aed2809d8d...,2025-07-29,3
1,8577641d-1ff9-49b8-b343-16f2d9f26a27,1,1,0,-1,[],Community 1,"[896a6881-1d73-4cac-87e2-bc9db1e5de48, e0cd2bc...","[79b648fa-b1e6-4b31-863f-5ed6bedd26dd, cb7497f...",[7545e407be21a6c05c05035ab3d90113699580ec998fb...,2025-07-29,4
2,7dfe7b68-e087-445d-9f01-e536cbeb2b60,2,2,0,-1,[],Community 2,"[44664a45-6d60-49b0-aaed-54ee1ffa57d6, bf9509f...","[18f7f83e-d79c-4769-8dcf-7d9fd8c9d086, 72e5df5...",[3291681cefdb3df93a491259dd0c6ca86e3aed2809d8d...,2025-07-29,3


In [10]:
entities_check.head(10)

Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids,frequency,degree,x,y
0,bf9509fe-2d8b-429c-9a43-79c0d497b785,0,PROJECT GUTENBERG,ORGANIZATION,Project Gutenberg is a pioneering digital libr...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,4,4,17.980831,6.132166
1,abcd315d-a2da-45f6-ac01-baf8ac5dcdda,1,GEORGE GARR HENRY,PERSON,"Author of ""How to Invest Money"", Vice-Presiden...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,2,,
2,6d15953a-53ae-4c41-8fde-a085c6fd3f4d,2,UNITED STATES,GEO,The United States is a country where Project G...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,2,2,17.496731,4.252052
3,688c8c71-fbd9-491c-9111-2366f7192ede,3,JULIA NEUFELD,PERSON,"Contributor to the production of the eBook ""Ho...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,,
4,d9d38309-ac41-4972-b5d1-5a62b039c382,4,ONLINE DISTRIBUTED PROOFREADING TEAM,ORGANIZATION,"Group involved in the production of ""How to In...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,2,,
5,e24dd398-2718-441a-b833-68486bb1f373,5,INTERNET ARCHIVE/AMERICAN LIBRARIES,ORGANIZATION,Source of images used for producing the eBook ...,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,,
6,6ca7f6fe-ad6b-45e5-b13e-1d96e86b45e1,6,FUNK & WAGNALLS COMPANY,ORGANIZATION,"Publisher of ""How to Invest Money"", located in...",[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,,
7,aecea348-7f67-4907-89f6-bb554dd617c1,7,GUARANTY TRUST COMPANY OF NEW YORK,,,[549303b4d8de9adc2722b88096c96047d333a0ae0b536...,1,1,,
8,e0cd2bc9-c5bf-4c3c-9ad7-15229ecce2c8,8,PROJECT GUTENBERG LITERARY ARCHIVE FOUNDATION,ORGANIZATION,The Project Gutenberg Literary Archive Foundat...,[ad31c63c1ca6a1f679fbbfcad23e44035953fcb759fc4...,3,4,16.869572,6.582345
9,896a6881-1d73-4cac-87e2-bc9db1e5de48,9,INTERNAL REVENUE SERVICE (IRS),ORGANIZATION,The U.S. federal agency responsible for tax co...,[7545e407be21a6c05c05035ab3d90113699580ec998fb...,1,1,18.738295,5.064562


In [12]:
text_units_check.head(50)

Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,549303b4d8de9adc2722b88096c96047d333a0ae0b5369...,1,﻿The Project Gutenberg eBook of How to Invest ...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,"[bf9509fe-2d8b-429c-9a43-79c0d497b785, abcd315...","[4931c2ec-2b8c-4c38-97b2-d5ccd2647e6b, 33d0a3e...",[]
1,552d40b335e06d33643e78da48f7499a526f8fa1f27b57...,2,way in which to dispose of it. It is obviousl...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
2,c024dc1d1b06f88ddd921300222ed0a569307e0b55a448...,3,\nmore than obedience to the old rule which fo...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
3,c071fb8607b6007d80519d742ec2a81ff896ef3e5e95f3...,4,"terms of a lease, by the railroads\nwhich use...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
4,4bd9417eeedfa8493f4699924f3ee9a4324d5cfebb3c60...,5,ust.\n\n\n\n\nII\n\nRAILROAD MORTGAGE BONDS\n\...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
5,81b68038f6f230e5da687de75906f0f89021a61caa5c55...,6,", to mortgage bonds upon the\ngeneral mileage ...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
6,47580f85d0dd81692165c417576f6c4cf7b015ec21838c...,7,", it does not always follow that its operating...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
7,30ba7af0e052a9f08830b2853c6777672a88fcb6ae3ec7...,8,"outstanding April 1st, 1908, at the market pr...",1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
8,a2a423b48b790e6d9511ccd340618421bf574e7e866d55...,9,to one of two standard forms: (1) The conditi...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
9,e7adb4654819efd85be45646189342bc909210a39f4c95...,10,both. Two of these railroads offered to the ho...,1200,[76b874f8ddf312c5c5d34517a4587852e8ef79438600b...,,,[]
