In [1]:
cd ../

/Users/linafaik/Documents/projects/knowledge_graph_llm


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Export the OpenAI API key (kept in the local `config` module rather than in the
# notebook) as an environment variable; presumably this is where the OpenAI
# clients created further down pick it up -- confirm against the client docs.
import os
from config import OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# NOTE(review): nest_asyncio is presumably applied so that asynchronous calls made
# during indexing can run inside the notebook's already-running event loop -- confirm.
import nest_asyncio
nest_asyncio.apply()

# Reload edited project modules automatically so changes are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# --- Model settings shared by every extraction cell below ---
LLM_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
TEMPERATURE = 0.1

# --- Locations, relative to the current working directory ---
path_input_text = "data/panama_papers"  # folder read by the document loader
path_output_storage = "storage"  # root folder for persisted indexes
path_output = "outputs"  # root folder for the exported graph HTML files

In [4]:
# Make sure both output folders exist before anything is written to them.
# `exist_ok=True` turns the call into a no-op when the folder is already present,
# which removes the separate (and racy) exists-then-create check.
os.makedirs(path_output_storage, exist_ok=True)
os.makedirs(path_output, exist_ok=True)

## 0. Data loading

In [5]:
from llama_index.core import SimpleDirectoryReader

# Point a reader at the input folder, then load its contents into `documents`,
# which every extraction cell below indexes.
directory_reader = SimpleDirectoryReader(path_input_text)
documents = directory_reader.load_data()

## 1. Schema-based extraction

In [6]:
from typing import Literal
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Label for this extraction strategy; it is used to build the storage folder
# and the name of the exported HTML file.
kw_extractor_name = "schema_llm"

# Entity types the extractor may produce
entities = Literal["PERSON", "COMPANY", "COUNTRY", "BANK", "SCANDAL"]

# Relation types the extractor may produce
relations = Literal["OWNS", "LOCATED_IN", "INVOLVED_IN"]

# Validation schema: for each entity type, the relations it is allowed to have
schema = {
    "PERSON": ["OWNS", "LOCATED_IN", "INVOLVED_IN"],
    "COMPANY": ["OWNS", "LOCATED_IN", "INVOLVED_IN"],
    "COUNTRY": ["LOCATED_IN"],
    "BANK": ["LOCATED_IN", "INVOLVED_IN"],
    "SCANDAL": ["INVOLVED_IN"],
}

# Schema-guided extractor: the LLM extracts paths constrained by the types above
kg_extractor = SchemaLLMPathExtractor(
    llm=OpenAI(model=LLM_MODEL, temperature=TEMPERATURE),
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=schema,
    strict=True,  # only entities and relations defined in the schema are allowed
)

# Build the property graph index from the loaded documents with this extractor
index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name=EMBEDDING_MODEL),
    show_progress=True,  # display progress bars while indexing
    kg_extractors=[kg_extractor],  # the SchemaLLMPathExtractor defined above
)

# Storage folder dedicated to this extraction strategy
path_output_storage_kg_extractor = f"{path_output_storage}/{kw_extractor_name}/"

# Create the folder if needed; `exist_ok=True` makes this safe to re-run and
# avoids the separate exists-then-create check.
os.makedirs(path_output_storage_kg_extractor, exist_ok=True)

# Persist the index's storage context to that folder
index.storage_context.persist(persist_dir=path_output_storage_kg_extractor)

# Export the knowledge graph as a NetworkX graph rendered to an HTML file
index.property_graph_store.save_networkx_graph(name=f"{path_output}/kg_{kw_extractor_name}.html")

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 18/18 [00:00<00:00, 1557.36it/s]
Extracting paths from text with schema: 100%|██████████| 18/18 [00:25<00:00,  1.39s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  3.67it/s]


## 2. Free-form extraction

In [8]:
from typing import Literal
from llama_index.core.indices.property_graph import SimpleLLMPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Label for this extraction strategy; it is used to build the storage folder
# and the name of the exported HTML file.
kw_extractor_name = "free_form"

# Free-form extractor: only an LLM is supplied, with no entity or relation constraints
kg_extractor = SimpleLLMPathExtractor(
    llm=OpenAI(model=LLM_MODEL, temperature=TEMPERATURE)
)

# Build the property graph index from the loaded documents with this extractor
index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name=EMBEDDING_MODEL),
    show_progress=True,  # display progress bars while indexing
    kg_extractors=[kg_extractor],  # the SimpleLLMPathExtractor defined above
)

# Storage folder dedicated to this extraction strategy
path_output_storage_kg_extractor = f"{path_output_storage}/{kw_extractor_name}/"

# Create the folder if needed; `exist_ok=True` makes this safe to re-run and
# avoids the separate exists-then-create check.
os.makedirs(path_output_storage_kg_extractor, exist_ok=True)

# Persist the index's storage context to that folder
index.storage_context.persist(persist_dir=path_output_storage_kg_extractor)

# Export the knowledge graph as a NetworkX graph rendered to an HTML file
index.property_graph_store.save_networkx_graph(name=f"{path_output}/kg_{kw_extractor_name}.html")

Parsing nodes: 100%|██████████| 18/18 [00:00<00:00, 832.76it/s]
Extracting paths from text: 100%|██████████| 18/18 [00:14<00:00,  1.27it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  3.02it/s]


## 3. Dynamic extraction

In [14]:
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Label for this extraction strategy; it is used to build the storage folder
# and the name of the exported HTML file.
kw_extractor_name = "dynamic_llm"

# Entity types passed to the extractor as `allowed_entity_types`
entities = ["PERSON", "COMPANY", "COUNTRY", "BANK", "SCANDAL"]

# Relation types passed to the extractor as `allowed_relation_types`
relations = ["OWNS", "LOCATED_IN", "INVOLVED_IN"]

# Dynamic extractor: an LLM plus the entity/relation type lists above
kg_extractor = DynamicLLMPathExtractor(
    llm=OpenAI(model=LLM_MODEL, temperature=TEMPERATURE),
    allowed_entity_types=entities,
    allowed_relation_types=relations,
)

# Build the property graph index from the loaded documents with this extractor
index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name=EMBEDDING_MODEL),
    show_progress=True,  # display progress bars while indexing
    kg_extractors=[kg_extractor],  # the DynamicLLMPathExtractor defined above
)

# Storage folder dedicated to this extraction strategy
path_output_storage_kg_extractor = f"{path_output_storage}/{kw_extractor_name}/"

# Create the folder if needed; `exist_ok=True` makes this safe to re-run and
# avoids the separate exists-then-create check.
os.makedirs(path_output_storage_kg_extractor, exist_ok=True)

# Persist the index's storage context to that folder
index.storage_context.persist(persist_dir=path_output_storage_kg_extractor)

# Export the knowledge graph as a NetworkX graph rendered to an HTML file
index.property_graph_store.save_networkx_graph(name=f"{path_output}/kg_{kw_extractor_name}.html")

Parsing nodes: 100%|██████████| 18/18 [00:00<00:00, 1805.94it/s]
Extracting and inferring knowledge graph from text: 100%|██████████| 18/18 [00:33<00:00,  1.84s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  3.50it/s]


## 4. Implicit extraction

In [15]:
from llama_index.core.indices.property_graph import ImplicitPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Label for this extraction strategy; it is used to build the storage folder
# and the name of the exported HTML file.
kw_extractor_name = "implicit"

# Implicit extractor: constructed with no arguments, so no LLM is supplied here.
# NOTE(review): presumably it derives paths from relationships already present on
# the parsed nodes -- confirm against the llama-index documentation.
kg_extractor = ImplicitPathExtractor()

# Build the property graph index from the loaded documents with this extractor
index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name=EMBEDDING_MODEL),
    show_progress=True,  # display progress bars while indexing
    kg_extractors=[kg_extractor],  # the ImplicitPathExtractor defined above
)

# Storage folder dedicated to this extraction strategy
path_output_storage_kg_extractor = f"{path_output_storage}/{kw_extractor_name}/"

# Create the folder if needed; `exist_ok=True` makes this safe to re-run and
# avoids the separate exists-then-create check.
os.makedirs(path_output_storage_kg_extractor, exist_ok=True)

# Persist the index's storage context to that folder
index.storage_context.persist(persist_dir=path_output_storage_kg_extractor)

# Export the knowledge graph as a NetworkX graph rendered to an HTML file
index.property_graph_store.save_networkx_graph(name=f"{path_output}/kg_{kw_extractor_name}.html")

Parsing nodes: 100%|██████████| 18/18 [00:00<00:00, 1673.33it/s]
Extracting implicit paths: 100%|██████████| 18/18 [00:00<00:00, 60397.98it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
Generating embeddings: 0it [00:00, ?it/s]
