Follow along: https://python.langchain.com/docs/use_cases/graph/constructing/

In [1]:
import os
import time

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_fireworks import ChatFireworks
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff



# Load environment variables from a .env file (credentials). This has to run
# before the clients below are constructed.
load_dotenv()

# Graph database handle. No arguments are passed, so the connection settings
# are presumably read from the environment loaded above -- TODO confirm.
graph = Neo4jGraph()

# LLM used for the extraction. Alternative back ends are kept below,
# commented out, for reference.
#
# llm = ChatFireworks(
#     model="accounts/fireworks/models/firefunction-v1",
#     max_tokens=5_000,
#     # model="accounts/fireworks/models/llama-v3-70b-instruct",
#     # prompt_truncate_len=3_000,
#     # response_format={"type": "json_object"},
#     # context_length_exceeded_behavior="error",
# )

# llm = ChatOpenAI(
#     base_url="https://api.together.xyz/v1",
#     api_key=os.environ["TOGETHER_API_KEY"],
#     model="mistralai/Mixtral-8x7B-Instruct-v0.1",
# )

# llm = ChatVertexAI(model_name="gemini-pro")

# llm = ChatOpenAI(
#     temperature=0,
#     # model_name="gpt-4-0125-preview",
#     model_name="gpt-3.5-turbo-instruct",
# )

# Active configuration: temperature 0 and a cap of 2000 tokens per response.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
    max_tokens=2000,
    )

# Graph transformer that turns documents into nodes and relationships using
# the LLM configured above. No node/relationship restrictions are passed.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    # Optional restrictions (these values are just examples):
    # allowed_nodes=["Person", "Country", "Organization"],
    # allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
)

Read reviews, prepare reviews:

In [4]:
def load_reviews(
    file: str = "../../data/clean/cleaned_reviews.csv",
    start_index: int = 0,
    limit: int = 100,
) -> pd.DataFrame:
    """Load a window of reviews and build the text that is sent to the LLM.

    Args:
        file: Path of the reviews CSV. It must contain the columns
            "Review Title" and "Review Content".
        start_index: Position (0-based) of the first row to keep.
        limit: Maximum number of rows to keep.

    Returns:
        A new DataFrame with the rows at positions ``start_index`` up to (not
        including) ``start_index + limit``, keeping their original row labels,
        plus a "Content" column holding the review title and the review
        content joined by a newline. When ``start_index`` is past the end of
        the file the frame is empty but still has the "Content" column.
    """
    # .copy() detaches the window from the full frame, so adding a column
    # below is not a write to a slice (pandas' SettingWithCopyWarning).
    reviews_df = pd.read_csv(file).iloc[start_index:start_index + limit].copy()
    print(f"Loaded reviews: {reviews_df.shape}")
    # Same f-string formatting as a row-wise apply, but built from the two
    # columns directly: cheaper, and it also works for an empty window.
    reviews_df["Content"] = [
        f"{title}\n{body}"
        for title, body in zip(reviews_df["Review Title"], reviews_df["Review Content"])
    ]
    return reviews_df

In [5]:
# Load the default window (start_index=0, limit=100) and preview the first
# rows to check the columns and the generated "Content" text.
reviews_df = load_reviews()
reviews_df.head()

Loaded reviews: (100, 15)


Unnamed: 0,Review Title,Review Date,Airline,Verified,Review Content,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Content
0,Food was below par,2024-04-21,Singapore Airlines,True,Overall disappointing from Singapore Airlines....,Couple Leisure,2024-04-01,Melbourne to Singapore,Economy Class,4,5,4,4,3,5,Food was below par\nOverall disappointing from...
1,recent pricing and service changes,2024-04-21,Singapore Airlines,True,I usually fly Singapore Airlines international...,Business,2024-04-01,Zurich to Singapore,Premium Economy,2,4,1,1,1,6,recent pricing and service changes\nI usually ...
2,Crew was super nice,2024-04-14,Singapore Airlines,True,"Flight was great! Crew was super nice, Chief S...",Solo Leisure,2024-01-01,London to Singapore,Business Class,5,5,3,2,5,10,Crew was super nice\nFlight was great! Crew wa...
3,attendants are helpful and polite,2024-04-13,Singapore Airlines,True,The cabin is clean. Flight attendants are help...,Solo Leisure,2024-04-01,Taipei to Singapore,Economy Class,5,4,5,5,4,9,attendants are helpful and polite\nThe cabin i...
4,food were served last,2024-04-12,Singapore Airlines,False,From Kochi to Singapore and vice verse we trav...,Couple Leisure,2024-04-01,Kochi to Singapore,Economy Class,3,2,5,3,2,3,food were served last\nFrom Kochi to Singapore...


Total reviews: 28,950. Average time per 10 queries: 1m6s (about 6.6 s per review). Expected time for all 28.9k reviews: roughly 53–60 hours.

Transform reviews to nodes and relationships:

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def convert_to_graph_documents(df: pd.DataFrame, page_content_column: str = "Content") -> list:
    """Convert every row of ``df`` into a graph document with the LLM transformer.

    Rows are converted one document at a time so that a single failing
    conversion does not discard the rest of the chunk.

    Args:
        df: Reviews to convert.
        page_content_column: Column whose text becomes the document content
            handed to the transformer.

    Returns:
        The transformer's output for each document, in row order, with None
        standing in for a document whose conversion raised.

    Note:
        Conversion errors are handled inside the loop, so they never reach the
        ``@retry`` decorator; it only sees exceptions raised outside that
        ``try`` block (building and loading the documents).
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    documents = loader.load()
    graph_documents = []
    for position, doc in enumerate(documents):
        try:
            # The model does not always return valid JSON (seen with
            # Fireworks), in which case the conversion raises.
            graph_documents.extend(llm_transformer.convert_to_graph_documents([doc]))
        except Exception as exc:
            # Best effort: keep a None placeholder so the result stays aligned
            # with the rows of df, but report the failure instead of hiding
            # it. Catching Exception rather than using a bare except means
            # KeyboardInterrupt / SystemExit are no longer swallowed here.
            print(f"document {position}: conversion failed ({type(exc).__name__}: {exc}); storing None")
            graph_documents.append(None)
    return graph_documents

See example graphs:

In [None]:
# print(f"Nodes: {graph_documents[0].nodes}")
# print(f"Relationships: {graph_documents[0].relationships}")

Store dataframe with graph temporarily:

In [None]:
def store_graph_locally(
    df: pd.DataFrame,
    docs: list,
    path: str,
) -> None:
    """Write ``df`` plus a "graph" column of serialized graph documents to CSV.

    Args:
        df: Reviews that were converted. The frame is left unmodified.
        docs: One entry per row of ``df``, in row order: an object exposing
            ``.json()``, or None for a review without a graph document.
        path: Destination CSV file. Missing parent directories are created.

    Raises:
        ValueError: If ``docs`` and ``df`` differ in length (raised by pandas).
    """
    serialized = [gd.json() if gd is not None else None for gd in docs]
    # assign() returns a new frame, so the caller's DataFrame (possibly a
    # slice of a larger one) does not silently gain a "graph" column.
    with_graph = df.assign(graph=serialized)
    # Make sure the target folder exists so that already converted documents
    # are not lost to a missing directory.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with_graph.to_csv(path, index=False)

Storing to graph database:

In [None]:
# graph.add_graph_documents(graph_documents)

Iteratively load these in chunks because, well, there's... a lot.

In [None]:
import time

# Run configuration: which slice of the review file to process
# (start_index, limit) and how many reviews go into each chunk (chunk_size).
#
# Settings noted from earlier runs (a combination of Fireworks and other
# providers; together.ai with Mixtral):
# start_index = 300
# limit = 500
# chunk_size = 10

start_index = 2000
limit = 10000
chunk_size = 10


def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    offsets = range(0, len(seq), size)
    return (seq[begin:begin + size] for begin in offsets)


# Convert the selected reviews chunk by chunk. Every chunk is written to CSV
# and then to Neo4j before the next one starts, and the printed
# "move index to" value is the start_index to use when resuming.
graph_docs = []

reviews_df = load_reviews(start_index=start_index, limit=limit)
for idx, chunk_df in enumerate(chunker(reviews_df, chunk_size)):
    idx_start = start_index + (idx * chunk_size)
    # Use the real chunk length: the last chunk can be shorter than
    # chunk_size, and the CSV name / resume hint must not overshoot the data.
    idx_end = idx_start + len(chunk_df)
    print(f"chunk {idx}: {idx_start} -> {idx_end}. {chunk_df.shape}")

    chunk_graph_docs = convert_to_graph_documents(chunk_df)
    graph_docs += chunk_graph_docs
    print(f"chunk {idx}: converted")

    store_graph_locally(chunk_df, chunk_graph_docs, f"../data/transformed/openai/reviews-graph-{idx_start}-{idx_end}.csv")
    print(f"chunk {idx}: saved to csv")

    # Failed conversions are stored as None; keep them out of Neo4j.
    valid_graph_docs = [doc for doc in chunk_graph_docs if doc is not None]
    graph.add_graph_documents(valid_graph_docs)
    print(f"chunk {idx}: saved to neo4j")
    print(f"move index to: {idx_end}")

    # Short pause between chunks.
    time.sleep(1)

In [None]:
# 1180