Using the REBEL model to extract entities and relationships instead of an LLM.

Follow along: https://colab.research.google.com/drive/1G6pcR0pXvSkdMQlAK_P-IrYgo-_staxd?usp=sharing#scrollTo=XX_GxhusPZR8

Install Nebula Graph: `curl -fsSL nebula-up.siwei.io/install.sh | bash`

Also follow along: https://colab.research.google.com/drive/1tLjOg2ZQuIClfuWrAC2LdiZHCov8oUbs#scrollTo=kkHpLzEuYo_9

In [14]:
import os
import re
from typing import Optional

import openai
import pandas as pd
from dotenv import load_dotenv
from transformers import pipeline
from llama_index.core import (
    Document,
    KnowledgeGraphIndex,
    StorageContext,
)
from llama_index.core.settings import Settings
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.graph_stores.nebula import NebulaGraphStore

# Load variables from a local .env file into the process environment.
load_dotenv()

# Direct indexing (not .get) so a missing OPENAI_API_KEY raises KeyError here
# rather than failing later inside an API call.
openai.api_key = os.environ["OPENAI_API_KEY"]

Set up NebulaGraph:

In [15]:
# Load the ngql IPython extension, then connect to the graph service at
# 127.0.0.1:9669 as user "root".
%load_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

The ngql extension is already loaded. To reload it, use:
  %reload_ext ngql
Connection Pool Created


Unnamed: 0,Name
0,llamaindex
1,policies
2,policies_2
3,policies_3
4,policies_aa


In [5]:
%%ngql
CREATE SPACE IF NOT EXISTS policies_aa(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
USE policies_aa;
CREATE TAG IF NOT EXISTS entity(name string);
CREATE EDGE IF NOT EXISTS relationship(name string);
CREATE TAG INDEX IF NOT EXISTS entity_index ON entity(name(256));

In [22]:
# NebulaGraph space to write to, plus the schema names handed to the store.
space_name = "policies_aa"
edge_types = ["relationship"]  # default; can be omitted when creating from an empty KG
rel_prop_names = ["name"]  # default; can be omitted when creating from an empty KG
tags = ["entity"]  # default; can be omitted when creating from an empty KG

# Wrap the store in a storage context so it can be passed to the index.
graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

Load policy documents:

In [33]:
def load_policies(
    file: str = "../../data/clean/encoded_policy.csv",
    start_index: int = 0,
    limit: int = 100,
    airline: Optional[str] = None,
    as_dataframe: bool = False,
):
    """Load airline policy rows from a CSV and turn them into documents.

    The CSV must provide the columns ``Airline``, ``Header 1``, ``Header 2``
    and ``Concat``. A ``Content`` column is added that joins the two headers
    and the body with newlines, collapsing any run of blank lines into a
    single line break.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
        start_index: Position of the first CSV row to keep.
        limit: Maximum number of CSV rows to keep, counted from
            ``start_index``. The window is taken BEFORE the airline filter,
            so fewer than ``limit`` rows may remain afterwards.
        airline: When given, keep only rows whose ``Airline`` equals it.
        as_dataframe: Return the prepared DataFrame instead of documents.

    Returns:
        The prepared ``pandas.DataFrame`` when ``as_dataframe`` is true,
        otherwise a list with one ``Document`` per row (text = ``Content``;
        metadata keys ``airline``, ``policy``, ``topic``). No sentence
        splitting is applied here. An empty window yields an empty
        DataFrame / empty list.
    """
    # Copy the window: adding a column to a bare slice of the full frame is
    # chained assignment (SettingWithCopyWarning, possibly a lost write).
    df = pd.read_csv(file).iloc[start_index:start_index + limit].copy()
    if airline is not None:
        df = df[df["Airline"] == airline].copy()

    # Built row by row from the columns rather than with df.apply(axis=1):
    # apply() returns a DataFrame for an empty frame, which cannot be
    # assigned to a single column.
    df["Content"] = [
        re.sub(r"\n{2,}", "\n", f"{policy}\n{topic}\n{body}")
        for policy, topic, body in zip(
            df["Header 1"], df["Header 2"], df["Concat"]
        )
    ]
    print(f"Loaded policies: {df.shape}")

    if as_dataframe:
        return df
    return [
        Document(
            text=row["Content"],
            metadata={
                "airline": row["Airline"],
                "policy": row["Header 1"],
                "topic": row["Header 2"],
            },
        )
        for _, row in df.iterrows()
    ]

# American Airlines rows within the default window (first 100 CSV rows);
# the window is applied before the airline filter.
policies = load_policies(airline="American Airlines")

Loaded policies: (29, 5)


In [34]:
# Same filter with a wider window (up to 1000 CSV rows); the result is left
# as the cell's last expression so the notebook displays the documents.
load_policies(limit=1000, airline="American Airlines")

Loaded policies: (29, 5)


[Document(id_='e5032c6e-9378-4e36-b248-cc337428cef4', embedding=None, metadata={'airline': 'American Airlines', 'policy': 'Restricted Items Policy', 'topic': '1 personal item and 1 carry-on'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Restricted Items Policy\n1 personal item and 1 carry-on\nPersonal item\nYour personal item like a purse or small handbag must fit under the seat in front of you. Dimensions should not exceed 18 x 14 x 8 inches (45 x 35 x 20 cm).  \nThese don’t count as your personal item or carry-on:  \nDiaper bags (1 per child) Breast pump Small, soft-sided cooler of breast milk Child safety seats, strollers and medical or mobility devices  \nCarry-on requirements\nCarry-on requirements\nShouldn’t exceed 22 x 14 x 9 inches / 56 x 36 x 23 centimeters (including handles and wheels) Must fit in the sizer at the airport  \nIf your items don’t fit in the overhead bin or under the seat, they may need to be checked. There may be add

Configure the LLM and node parser, then build the knowledge graph index:

In [35]:
# Global LlamaIndex settings used by the index build below.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
# NOTE(review): this chunk_size looks superseded by the explicit node_parser
# (chunk_size=200) assigned right after it — confirm which value is intended.
Settings.chunk_size = 256
Settings.node_parser = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=20,
    paragraph_separator="\n\n"
)
# Alternative settings kept for reference (currently disabled):
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
# Settings.num_output = 512
# Settings.context_window = 3900

# Build the knowledge graph index from the American Airlines policy documents,
# backed by the NebulaGraph storage context.
documents = load_policies(limit=1000, airline="American Airlines")
index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=10,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
    # NOTE(review): the custom extractor is disabled and `extract_triplets` is
    # not defined in the visible cells, so triplets presumably come from
    # Settings.llm rather than REBEL — confirm.
    # kg_triplet_extract_fn=extract_triplets,
)

Loaded policies: (29, 5)


In [37]:
# Persist the index's storage context under ./data/index/policies_aa.
index.storage_context.persist(persist_dir="./data/index/policies_aa")

Check that it was loaded into Nebula:

In [None]:
%ngql USE llamaindex;
%ngql MATCH ()-[e]->() RETURN e LIMIT 100;

In [None]:
# The index was built only from American Airlines policy documents, so the
# retrieval question targets that airline.
for r in index.as_retriever().retrieve("What does American Airlines offer?"):
    print(r.text)

In [None]:
# Query the airline the index actually contains (American Airlines policies).
response = index.as_query_engine().query("Tell me about American Airlines")
print(response)

In [None]:
# Repeat of the query above, kept as its own cell; it targets the airline the
# index actually contains (American Airlines policies).
response = index.as_query_engine().query("Tell me about American Airlines")
print(response)

Doesn't seem to be from the KG...

In [None]:
from llama_index.core.query_engine import KnowledgeGraphQueryEngine


# Natural-language-to-graph-query engine over the NebulaGraph storage context.
# The notebook configures its LLM through `Settings`; `service_context` and
# `llm` were never defined in the visible cells (NameError), so the LLM is
# taken from Settings and the service_context argument is dropped.
nl2kg_query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=Settings.llm,
)