Using the REBEL model to extract entities and relationships instead of an LLM.

Follow along: https://colab.research.google.com/drive/1G6pcR0pXvSkdMQlAK_P-IrYgo-_staxd?usp=sharing#scrollTo=XX_GxhusPZR8

Install Nebula Graph: `curl -fsSL nebula-up.siwei.io/install.sh | bash`

Also follow along: https://colab.research.google.com/drive/1tLjOg2ZQuIClfuWrAC2LdiZHCov8oUbs#scrollTo=kkHpLzEuYo_9

In [1]:
import os

import openai
import pandas as pd
from dotenv import load_dotenv
from transformers import pipeline
from llama_index.core import (
    Document,
    KnowledgeGraphIndex,
    StorageContext,
)
from llama_index.core.settings import Settings
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.graph_stores.nebula import NebulaGraphStore

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Indexing os.environ raises KeyError right away if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

Set up NebulaGraph:

In [2]:
# Load the ngql IPython extension, then connect to the graph service at
# 127.0.0.1:9669 with the user/password given on the magic line.
%load_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

Connection Pool Created


Unnamed: 0,Name
0,llamaindex
1,policies


In [3]:
%%ngql
CREATE SPACE IF NOT EXISTS policies_2(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
USE policies_2;
CREATE TAG IF NOT EXISTS entity(name string);
CREATE EDGE IF NOT EXISTS relationship(relationship string);
CREATE TAG INDEX IF NOT EXISTS entity_index ON entity(name(256));

In [4]:
# Graph space that holds the policy knowledge graph.
space_name = "policies_2"

# The three values below are the defaults; they can be omitted when the
# knowledge graph is created from an empty space.
edge_types = ["relationship"]
rel_prop_names = ["relationship"]
tags = ["entity"]

# Wrap the space in a graph store and hand it to a storage context so that
# an index built with this context writes into the graph store.
graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

Load policy documents:

In [5]:
def load_policies(
    file: str = "../../data/clean/policy_chunks.csv",
    start_index: int = 0,
    limit: int = 100,
) -> "list[Document]":
    """Load a window of policy chunks from a CSV file as ``Document`` objects.

    Each selected row becomes one ``Document``. Its text is the row's
    ``Header 1``, ``Header 2`` and ``Concat`` columns joined by newlines,
    with every run of consecutive newlines collapsed to a single newline.
    The ``Airline``, ``Header 1`` and ``Header 2`` columns are stored as the
    ``airline``, ``title`` and ``type`` metadata entries.

    Args:
        file: Path of the CSV file to read.
        start_index: Position of the first row to load.
        limit: Maximum number of rows to load, starting at ``start_index``.

    Returns:
        One ``Document`` per selected row, in file order.
    """
    df = pd.read_csv(file)[start_index:start_index + limit]
    print(f"Loaded policies: {df.shape}")

    documents = []
    for _, row in df.iterrows():
        # Build the text per row instead of assigning a new column on the
        # sliced frame, which can trigger pandas' SettingWithCopyWarning.
        content = f"{row['Header 1']}\n{row['Header 2']}\n{row['Concat']}"
        # Collapse blank lines of any length (two chained replaces only
        # handled runs of up to four newlines).
        while "\n\n" in content:
            content = content.replace("\n\n", "\n")
        documents.append(
            Document(
                text=content,
                metadata={
                    "airline": row["Airline"],
                    "title": row["Header 1"],
                    "type": row["Header 2"],
                },
            )
        )
    return documents

# Load policy chunks using the function's default path, start index and limit.
policies = load_policies()

Loaded policies: (100, 4)


Define the triplet-extraction function:

In [None]:
# REBEL text2text pipeline used by extract_triplets below.
# REBEL outputs a specific linearized format; the parsing code in
# extract_triplets is mostly copied from the model card.

triplet_extractor = pipeline(
    "text2text-generation",
    model="Babelscape/rebel-large",
    tokenizer="Babelscape/rebel-large",
    device=-1,  # -1 runs the pipeline on CPU
    temperature=0.3,
    top_p=0.2,
    # NOTE(review): do_sample is left disabled, so temperature/top_p are
    # presumably ignored during generation -- confirm, then enable or drop them.
    # do_sample=True,
) # GPU alternative: device="cuda:0"


def _parse_rebel_output(decoded, source_text):
    """Parse REBEL's linearized output into ``(subject, relation, object)`` tuples.

    The decoded text is a sequence of
    ``<triplet> subject <subj> object <obj> relation`` groups. A new
    ``<subj>`` without a preceding ``<triplet>`` starts another
    object/relation pair for the same subject.

    Args:
        decoded: Decoded model output, special tokens included.
        source_text: Text the model was run on; used to validate the last
            triplet (see the note at the end of the function).

    Returns:
        A list of ``(subject, relation, object)`` string tuples.
    """
    triplets = []
    subject, relation, object_ = "", "", ""
    state = None  # which field the following plain tokens belong to

    cleaned = decoded.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in cleaned.split():
        if token == "<triplet>":
            state = "subject"
            if relation:
                # A completed triplet is pending: flush it before starting over.
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
                relation = ""
            subject = ""
        elif token == "<subj>":
            state = "object"
            if relation:
                # Same subject, new object: flush the previous pair first.
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
            object_ = ""
        elif token == "<obj>":
            state = "relation"
            relation = ""
        elif state == "subject":
            subject += " " + token
        elif state == "object":
            object_ += " " + token
        elif state == "relation":
            relation += " " + token

    # Strip before the containment checks: the accumulated strings start with
    # a space, which made a match at the very beginning of the text fail.
    head, rel, tail = subject.strip(), relation.strip(), object_.strip()
    # NOTE(review): only this last triplet is required to appear verbatim in
    # the source text (relation label included); earlier triplets are kept
    # unconditionally. Kept as in the original -- confirm this is intended.
    if head and rel and tail and all(part in source_text for part in (head, rel, tail)):
        triplets.append((head, rel, tail))

    return triplets


def extract_triplets(input_text):
    """Run REBEL on ``input_text`` and return the extracted triplets.

    Every triplet is returned as a ``(subject, relation, object)`` tuple.
    Previously the triplets found inside the loop were dicts and only the
    last one was a tuple.

    Args:
        input_text: Text to extract relations from.

    Returns:
        A list of ``(subject, relation, object)`` string tuples.
    """
    outputs = triplet_extractor(input_text, return_tensors=True, return_text=False)
    decoded = triplet_extractor.tokenizer.batch_decode(
        [outputs[0]["generated_token_ids"]]
    )[0]
    return _parse_rebel_output(decoded, input_text)

In [None]:
# Split the loaded policy documents into nodes (chunk_size=200, no overlap)
# for the REBEL experiments in the following cells.
# NOTE(review): load_policies replaces "\n\n" with "\n" in each document, so
# the "\n\n" paragraph separator presumably rarely matches -- confirm.
splitter = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=0,
    paragraph_separator="\n\n"
)
 
nodes = splitter.get_nodes_from_documents(policies)

In [None]:
nodes[0].text 

In [None]:
print(nodes[0].text)

In [None]:
# Show the raw decoded REBEL output for the first node; this is the same
# expression extract_triplets evaluates before it strips <s>, <pad> and </s>.
triplet_extractor.tokenizer.batch_decode([triplet_extractor(nodes[0].text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])

In [None]:
extract_triplets(nodes[0].text)

In [None]:
# Alternative approach: run REBEL as a spaCy pipeline component.
import spacy
# NOTE(review): spacy_components is not referenced directly; presumably it is
# imported so that the "rebel" component used below gets registered -- confirm.
import spacy_components

nlp = spacy.load("en_core_web_sm")

# NOTE(review): this component is configured for GPU 0, while the
# triplet_extractor pipeline in this notebook uses device=-1 (CPU) -- confirm
# the target machine has a GPU.
nlp.add_pipe("rebel", after="senter", config={
    'device':0, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )
# input_sentence = "Gràcia is a district of the city of Barcelona, Spain. It comprises the neighborhoods of Vila de Gràcia, Vallcarca i els Penitents, El Coll, La Salut and Camp d'en Grassot i Gràcia Nova. Gràcia is bordered by the districts of Eixample to the south, Sarrià-Sant Gervasi to the west and Horta-Guinardó to the east. A vibrant and diverse enclave of Catalan life, Gràcia was an independent municipality for centuries before being formally annexed by Barcelona in 1897 as a part of the city's expansion."
input_sentence = nodes[0].text 

doc = nlp(input_sentence)
# NOTE(review): doc_list is not used anywhere in this cell.
doc_list = nlp.pipe([input_sentence])
# Print each entry of the relations stored on the doc by the "rebel" component.
for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

In [None]:
len([doc for doc in policies if doc.metadata["airline"] == "aa"])

Build the knowledge graph index:

In [None]:
# Configure the global LlamaIndex settings used while building the index.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
# NOTE(review): chunk_size is set to 256 here, but the node parser assigned
# right after uses chunk_size=200 -- confirm which value should take effect.
Settings.chunk_size = 256
Settings.node_parser = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=20,
    paragraph_separator="\n\n"
)
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
# Settings.num_output = 512
# Settings.context_window = 3900

documents = load_policies()
# index = KnowledgeGraphIndex.from_documents(documents, kg_triplet_extract_fn=extract_triplets)
# Build the knowledge graph index on top of the Nebula-backed storage context.
# NOTE(review): kg_triplet_extract_fn is commented out below, so the REBEL
# extract_triplets function is not passed to the index; presumably triplets
# are extracted with Settings.llm instead -- confirm this is intended.
index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=25,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
    # kg_triplet_extract_fn=extract_triplets,
)

Check it was loaded to Nebula:

In [None]:
%ngql USE llamaindex;
%ngql MATCH ()-[e]->() RETURN e LIMIT 100;

In [None]:
for r in index.as_retriever().retrieve("What does Singapore Airlines offer?"):
    print(r.text)

In [None]:
response = index.as_query_engine().query("Tell me about Singapore Airlines")
print(response)

In [None]:
response = index.as_query_engine().query("Tell me about Singapore Airlines")
print(response)

Doesn't seem to be from the KG...

In [None]:
from llama_index.core.query_engine import KnowledgeGraphQueryEngine


# Natural-language-to-graph-query engine over the Nebula-backed storage context.
# The LLM comes from the global Settings object configured in this notebook:
# neither `service_context` nor a bare `llm` variable is defined anywhere here,
# so passing them raised NameError.
nl2kg_query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=Settings.llm,
)

In [None]:
from llama_index.embeddings.langchain import LangchainEmbedding
