In [36]:
#from langchain.chains import GraphCypherQAChain
#from langchain_openai import ChatOpenAI
import time
import os
from langchain_openai import AzureChatOpenAI
import pandas as pd
from langchain_neo4j import Neo4jGraph

### Passing the endpoint and environment variable

In [None]:
# GPT OMNI keys - load from environment variables
# Required: AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY", "")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT", "")
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-15-preview"
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "gpt-omni"

llm = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    temperature=0
)

In [None]:
# Neo4j credentials - load from environment variables
# Required: NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD
os.environ["NEO4J_URI"] = os.getenv("NEO4J_URI", "")
os.environ["NEO4J_USERNAME"] = os.getenv("NEO4J_USERNAME", "neo4j")
os.environ["NEO4J_PASSWORD"] = os.getenv("NEO4J_PASSWORD", "")

In [None]:
# Secondary Azure OpenAI instance - load from environment variables
# Required: AZURE_OPENAI_API_KEY_2, AZURE_OPENAI_ENDPOINT_2
llm2 = AzureChatOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY_2", ""),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_2", ""),
    azure_deployment="Gpt-4-CA",
    temperature=0
)

In [40]:
graph = Neo4jGraph(enhanced_schema=True)
graph.refresh_schema()
print(graph.schema)

Node properties:
- **Product**
  - `Product_ID`: STRING Available options: ['FG-1000', 'FG-2000', 'RM-1000', 'RM-2000', 'SFG-1000']
  - `Base_UOM`: STRING Available options: ['EA']
  - `Material_Type_ID`: STRING Available options: ['FERT', 'ROH', 'HALB']
  - `Product_Desc`: STRING Available options: ['FG 1000', 'FG 2000', 'RM 1000', 'RM 2000', 'SFG 1000']
  - `Product ID`: STRING Available options: ['FG-1000', 'FG-2000', 'RM-1000', 'RM-2000', 'SFG-1000']
  - `Base UOM`: STRING Available options: ['EA']
  - `Material Type ID`: STRING Available options: ['FERT', 'ROH', 'HALB']
  - `Product Desc`: STRING Available options: ['FG 1000', 'FG 2000', 'RM 1000', 'RM 2000', 'SFG 1000']
- **Resource**
  - `Resource_ID`: STRING Available options: ['RES1000_001', 'RES1000_002', 'RES2000_001', 'RES2000_002', 'RES3000_001', 'RES4000_001', 'RES4000_002']
  - `Resource ID`: STRING Available options: ['RES1000_001', 'RES1000_002', 'RES2000_001', 'RES2000_002', 'RES3000_001', 'RES4000_001', 'RES4000_002'

In [31]:
graph.get_structured_schema

{'node_props': {'Location Product': [{'property': 'Loc_Product_ID',
    'type': 'STRING',
    'values': ['DC1000_FG-1000',
     'DC1000_FG-2000',
     'DC1000_RM-1000',
     'DC1000_RM-2000',
     'DC1000_SFG-1000',
     'DC2000_FG-1000',
     'DC2000_FG-2000',
     'DC2000_RM-1000',
     'DC2000_RM-2000',
     'DC2000_SFG-1000'],
    'distinct_count': 27},
   {'property': 'Location ID',
    'type': 'STRING',
    'values': ['DC1000',
     'DC2000',
     'PL1000',
     'PL2000',
     'RDC1000',
     'VEN1000',
     'VEN2000',
     'VEN3000'],
    'distinct_count': 8},
   {'property': 'Product ID',
    'type': 'STRING',
    'values': ['FG-1000', 'FG-2000', 'RM-1000', 'RM-2000', 'SFG-1000'],
    'distinct_count': 5},
   {'property': 'Subnetwork ID',
    'type': 'STRING',
    'values': ['ELECTROLUX'],
    'distinct_count': 1}],
  'Location Resource': [{'property': 'Location ID',
    'type': 'STRING',
    'values': ['PL1000', 'PL2000'],
    'distinct_count': 2},
   {'property': 'Loc_Resourc

In [32]:
graph.get_schema

'Node properties:\n- **Location Product**\n  - `Loc_Product_ID`: STRING Example: "DC1000_FG-1000"\n  - `Location ID`: STRING Available options: [\'DC1000\', \'DC2000\', \'PL1000\', \'PL2000\', \'RDC1000\', \'VEN1000\', \'VEN2000\', \'VEN3000\']\n  - `Product ID`: STRING Available options: [\'FG-1000\', \'FG-2000\', \'RM-1000\', \'RM-2000\', \'SFG-1000\']\n  - `Subnetwork ID`: STRING Available options: [\'ELECTROLUX\']\n- **Location Resource**\n  - `Location ID`: STRING Available options: [\'PL1000\', \'PL2000\']\n  - `Loc_Resource_ID`: STRING Available options: [\'PL1000_RES1000_001\', \'PL1000_RES1000_002\', \'PL1000_RES3000_001\', \'PL1000_RES4000_001\', \'PL2000_RES2000_001\', \'PL2000_RES2000_002\', \'PL2000_RES3000_001\', \'PL2000_RES4000_002\']\n  - `Resource ID`: STRING Available options: [\'RES1000_001\', \'RES1000_002\', \'RES3000_001\', \'RES4000_001\', \'RES2000_001\', \'RES2000_002\', \'RES4000_002\']\n  - `Capacity Supply Expansion Time Series Indicator`: STRING Available 

In [33]:
query = "list down all the product and location ids"

In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent


# # Generate Cypher statement based on natural language input
# cypher_template = """Based on the Neo4j graph schema below, write a Cypher query that would answer the user's question:
# {schema}
# Question: {question}
# Cypher query:"""  # noqa: E501

# Generate Cypher statement based on natural language input
cypher_template = """
    Given the following question and Neo4j graph schemas information, generate a cypher query to provide all the data related to the question. 
    Do not answer the question just provide the cypher query that has all the relvant data.
    Only use the tables and columns that are actually present in the database.
    Return ONLY the executable cypher query itself, without any additional text, explanations, or formatting.

    Neo4j Graph Schema Information:
    {schema}

    Question: {question}

    Cypher Query:
    Before writing the query, consider:
    1. Which nodes and relationships contain the relevant information?
    2. How should these nodes and relationships be matched?
    3. What conditions need to be applied?
    
    Now, write the Cypher query:
    """  # noqa: E501

cypher_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given an input question, convert it to a Cypher query. No pre-amble.",
        ),
        ("human", cypher_template),
    ]
)

# cypher_response = (
#     RunnablePassthrough.assign(names=entity_chain)
#     | RunnablePassthrough.assign(
#         entities_list=lambda x: map_to_database(x["names"]["function"]),
#         schema=lambda _: graph.get_schema,
#     )
#     | cypher_prompt
#     | llm.bind(stop=["\nCypherResult:"])
#     | StrOutputParser()
# )

cypher_response = (
    RunnablePassthrough.assign()
    | RunnablePassthrough.assign(
        schema=lambda _: graph.get_schema,
    )
    | cypher_prompt
    | llm.bind(stop=["\nCypherResult:"])
    | StrOutputParser()
)

ImportError: cannot import name 'AGENT_DEPRECATION_WARNING' from 'langchain._api.deprecation' (C:\Users\laxmi.panch\AppData\Roaming\Python\Python312\site-packages\langchain\_api\deprecation.py)

In [22]:
cypher = cypher_response.invoke({"question": query})
cypher

'```cypher\nMATCH (w:Warehouse)\nRETURN DISTINCT w.City\n```'

In [28]:
results = graph.query(cypher)
results

[{'w.City': 'Martensville'},
 {'w.City': 'Stanstead'},
 {'w.City': 'Saint-Pamphile'},
 {'w.City': 'Estevan'}]

## Creating a single complete code

In [99]:
def graph_df_answer(query):
    cypher = cypher_response.invoke({"question": query})
    print("Cypher Query - ",cypher)
    
    try:
        # First attempt: removing the first and last 3 characters from the cypher
        results = graph.query(cypher[3:-3])
    except Exception as e1:  # Catch the exception if the first approach fails
        try:
            # Second attempt: using the cypher as it is
            results = graph.query(cypher)
        except Exception as e2:  # Catch the exception if the second approach fails
                # Third attempt: replacing "CH" with "cypher"
                new_cypher = cypher.replace("CH", "cypher", 1)
                results = graph.query(new_cypher)
            
            #     new_cypher = cypher.replace("CH", "cypher", 1)
            #     results = graph.query(new_cypher)
            # except Exception as e3:  # Catch the exception if the third approach fails
            #     # Handle the failure of all approaches (logging, raising an error, etc.)
            #     print(f"All attempts failed: {e3}")
            #     results = None  # Or raise a custom exception if necessary

    print(results)
    df = pd.DataFrame(results)
    print(df.head())
    df_query = f"""generate response for the user query {query} based on dataframe provided, without mentioning about the dataframe.The Dataframe is an extracted answer from Neo4j graph database
    If the dataframe contains only one row then it is the answer and directly return the value in dataframe"""
    df_query = df_query + " using tool python_repl_ast"
    # define agent
    pandas_agent = create_pandas_dataframe_agent(llm = llm2 , df = df, verbose = True, allow_dangerous_code=True,handle_parsing_errors=True)
    response1 = pandas_agent.invoke(df_query)
    response = response1['output']
    print(f"Input Query - {query}")
    print(f"Response - {response}")
    return response
    

In [27]:
from langchain.agents import load_tools, initialize_agent

In [31]:
import time

## Testing all four cases

#### Information Retrieval

In [34]:
# Start measuring time
start_time = time.time()
graph_df_answer("Give details for order number SO - 018901")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")



Cypher Query -  ```cypher
MATCH (o:Order {Order_Number: "SO - 018901"})
OPTIONAL MATCH (o)-[:Contains]->(p:Product)
OPTIONAL MATCH (o)-[:Supplied_by]->(w:Warehouse)
OPTIONAL MATCH (w)-[:Has_Inventory]->(iq:Inventory_Quantity)
RETURN o, p, w, iq
```
[{'o': {'Customer_Type': 'Wholesale', 'Unit_Sale_Price': 134.5, 'Order_Number': 'SO - 018901', 'Order_Quantity': 151.0, 'Revenue': 20310, 'Order_Date': '01-Jan-21'}, 'p': {'SKU_ID': '3079BA', 'SKU_Name': 'Product 247'}, 'w': {'Country': ' Canada', 'Latitude': 45.016666, 'City': 'Stanstead', 'Longitude': -72.099998, 'Warehouse_Code': 'AXW291', 'Province': 'Quebec'}, 'iq': {'Average_Lead_Time(days)': 90, 'Vendor_Name': 'Accord Ltd', 'Warehouse_ID': 'AXW291', 'Cost_per_SKU': ' 22.96 ', 'Units(Nos/Kg)': 'Kg', 'SKU_ID': '1116CA', 'Current_Inventory_Quantity': 33398.0, 'Total_Value': ' 7,66,818.08 ', 'Maximum_Lead_Time(days)': 140, 'Unit_Price': ' 32.80 '}}, {'o': {'Customer_Type': 'Wholesale', 'Unit_Sale_Price': 134.5, 'Order_Number': 'SO - 01890



[32;1m[1;3mThought: As per the data provided all the order details seems to be in column 'o'. 
    To answer the query, I need to filter rows where the order number matches with the provided order number which is 'SO - 018901' in this case.
Action: python_repl_ast
Action Input: df[df['o'].apply(lambda x: x['Order_Number']) == 'SO - 018901'][0m[36;1m[1;3m                                                    o  \
0   {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
1   {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
2   {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
3   {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
4   {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
..                                                ...   
64  {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
65  {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
66  {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
67  {'Customer_Type': 'Wholesale', 'Unit_Sale_Pric...   
68  {'Custom

In [97]:
# Start measuring time
start_time = time.time()
graph_df_answer("What is the total sale amount for August 2022?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")



Cypher Query -  ```cypher
MATCH (o:Order)-[:Contains]->(:Product)
WHERE o.Order_Date STARTS WITH "Aug-22"
RETURN SUM(o.Revenue) AS total_sale_amount
```
[{'total_sale_amount': 0}]
   total_sale_amount
0                  0


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: I need to access the value in the 'total_sale_amount' column of the dataframe.
Action: python_repl_ast
Action Input: df['total_sale_amount'].values[0][0m[36;1m[1;3m0[0m[32;1m[1;3mThe total sales amount retrieved from the dataframe is 0.
Final Answer: The total sale amount for August 2022 is 0.[0m

[1m> Finished chain.[0m
Input Query - What is the total sale amount for August 2022?
Response - The total sale amount for August 2022 is 0.
--- completion time 5.4399 seconds ---


In [37]:

# Start measuring time
start_time = time.time()
graph_df_answer("What city warehouse is supplying the order number SO - 018914?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")



Cypher Query -  ```cypher
MATCH (o:Order {Order_Number: "SO - 018914"})-[:Supplied_by]->(w:Warehouse)
RETURN w.City
```
[{'w.City': 'Stanstead'}]
      w.City
0  Stanstead


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: I need to extract the value from the dataframe and return it as the answer.
Action: python_repl_ast
Action Input: df['w.City'].values[0][0m[36;1m[1;3mStanstead[0m[32;1m[1;3mI now know the final answer.
Final Answer: The city warehouse supplying the order number SO - 018914 is Stanstead.[0m

[1m> Finished chain.[0m
Input Query - What city warehouse is supplying the order number SO - 018914?
Response - The city warehouse supplying the order number SO - 018914 is Stanstead.
--- completion time 8.0864 seconds ---


### Derived metrics


In [38]:
# Start measuring time
start_time = time.time()

graph_df_answer("give details of the highest Revenue order")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (o:Order)-[:Contains]->(p:Product), (o)-[:Supplied_by]->(w:Warehouse), (w)-[:Has_Inventory]->(i:Inventory_Quantity)
RETURN o.Order_Number, o.Order_Date, o.Customer_Type, o.Order_Quantity, o.Unit_Sale_Price, o.Revenue, 
       p.SKU_ID, p.SKU_Name, 
       w.Warehouse_Code, w.City, w.Province, w.Country, w.Latitude, w.Longitude, 
       i.Vendor_Name, i.Warehouse_ID, i.Current_Inventory_Quantity, i.Cost_per_SKU, i.Total_Value, i.`Units(Nos/Kg)`, i.`Average_Lead_Time(days)`, i.`Maximum_Lead_Time(days)`, i.Unit_Price
ORDER BY o.Revenue DESC LIMIT 1
```
[{'o.Order_Number': 'SO - 021924', 'o.Order_Date': '12-Mar-21', 'o.Customer_Type': 'Distributor', 'o.Order_Quantity': 25926.0, 'o.Unit_Sale_Price': 99.12769, 'o.Revenue': 2569984, 'p.SKU_ID': '1964BA', 'p.SKU_Name': 'Product 106', 'w.Warehouse_Code': 'AXW291', 'w.City': 'Stanstead', 'w.Province': 'Quebec', 'w.Country': ' Canada', 'w.Latitude': 45.016666, 'w.Longitude': -72.099998, 'i.Vendor_Name': 'Accord Ltd



[32;1m[1;3mThought: In order to find the order with the highest revenue, I need to sort the DataFrame based on the 'o.Revenue' column in descending order and then select the first row.
Action: python_repl_ast
Action Input: df.sort_values('o.Revenue', ascending=False).iloc[0][0m[36;1m[1;3mo.Order_Number                    SO - 021924
o.Order_Date                        12-Mar-21
o.Customer_Type                   Distributor
o.Order_Quantity                      25926.0
o.Unit_Sale_Price                    99.12769
o.Revenue                             2569984
p.SKU_ID                               1964BA
p.SKU_Name                        Product 106
w.Warehouse_Code                       AXW291
w.City                              Stanstead
w.Province                             Quebec
w.Country                              Canada
w.Latitude                          45.016666
w.Longitude                        -72.099998
i.Vendor_Name                      Accord Ltd
i.Warehouse_ID  

In [49]:
# Start measuring time
start_time = time.time()

graph_df_answer("what is the average order quantity for warehouse in the City Estevan ?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (w:Warehouse {City: 'Estevan'})-[:Has_Inventory]->(:Inventory_Quantity)<-[:Supplied_by]-(o:Order)
RETURN AVG(o.Order_Quantity) AS average_order_quantity
```
[{'average_order_quantity': None}]
  average_order_quantity
0                   None


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: As the dataframe contains only one row, it means the answer to the query is already extracted from the database and stored in the dataframe.
Action: python_repl_ast
Action Input: df['average_order_quantity'][0][0m[36;1m[1;3m[0m[32;1m[1;3mThe result I got is the value of average order quantity for warehouse in the city Estevan.
Final Answer: The average order quantity for warehouse in the city Estevan is the value extracted from the dataframe.[0m

[1m> Finished chain.[0m
Input Query - what is the average order quantity for warehouse in the City Estevan ?
Response - The average order quantity for warehouse in the city Estevan is the value extracted from the dataframe.
--- completion time 8.8936 seconds ---


In [52]:
# Start measuring time
start_time = time.time()

graph_df_answer("what is Order ID of thr lowest order quantity in the data?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (o:Order)
RETURN o.Order_Number
ORDER BY o.Order_Quantity ASC
LIMIT 1
```
[{'o.Order_Number': 'SO - 041048'}]
  o.Order_Number
0    SO - 041048


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mAction: python_repl_ast
Action Input: df['o.Order_Number'].values[0][0m[36;1m[1;3mSO - 041048[0m[32;1m[1;3mThe dataframe only contains one row, so this must be the order ID for the order with the lowest quantity.
Final Answer: The Order ID of the lowest order quantity is 'SO - 041048'.[0m

[1m> Finished chain.[0m
Input Query - what is Order ID of thr lowest order quantity in the data?
Response - The Order ID of the lowest order quantity is 'SO - 041048'.
--- completion time 8.4665 seconds ---


In [53]:
# Start measuring time
start_time = time.time()

graph_df_answer("Which vendor has the highest inventory quantity in the warehouse thats in Martensville ?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (w:Warehouse {City: 'Martensville'})-[:Has_Inventory]->(iq:Inventory_Quantity)
RETURN iq.Vendor_Name, SUM(iq.Current_Inventory_Quantity) AS Total_Inventory
ORDER BY Total_Inventory DESC
LIMIT 1
```
[{'iq.Vendor_Name': 'Kinray Ltd', 'Total_Inventory': 345329.5}]
  iq.Vendor_Name  Total_Inventory
0     Kinray Ltd         345329.5


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: The dataframe only contains one row, so the vendor in this row must be the one with the highest inventory quantity.
Action: python_repl_ast
Action Input: df.iq.Vendor_Name[0][0m[36;1m[1;3mAttributeError: 'DataFrame' object has no attribute 'iq'[0m[32;1m[1;3mIt seems I made an error in referencing the column 'iq.Vendor_Name'. The correct way to reference it is `df['iq.Vendor_Name']`.
Action: python_repl_ast
Action Input: df['iq.Vendor_Name'][0][0m[36;1m[1;3mKinray Ltd[0m[32;1m[1;3mI now know the final answer
Final Answer: The vendor with the highest inventory quantity in the warehouse that's in Martensville is Kinray Ltd.[0m

[1m> Finished chain.[0m
Input Query - Which vendor has the highest inventory quantity in the warehouse thats in Martensville ?
Response - The vendor with the highest inventory quantity in the warehouse that's in Martensville is Kinray Ltd.
--- completion time 12.1769 seconds ---


Cypher Query -  ```cypher
MATCH (o:Order)
WHERE o.Customer_Type = 'Wholesale'
RETURN count(o) AS WholesaleCustomerCount
```
[{'WholesaleCustomerCount': 18193}]
   WholesaleCustomerCount
0                   18193


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: The dataframe `df` contains only one row which is the answer to the question. We need to extract this value.
Action: python_repl_ast
Action Input: df['WholesaleCustomerCount'].values[0][0m[36;1m[1;3m18193[0m[32;1m[1;3mI now know the final answer
Final Answer: There are 18193 Wholesale Customers.[0m

[1m> Finished chain.[0m
Input Query - How many Customer type are Wholesale?
Response - There are 18193 Wholesale Customers.
--- completion time 6.9747 seconds ---


## Smart Insights

In [101]:

# Start measuring time
start_time = time.time()

graph_df_answer("Which customer type contributed the most to sales in january 2021")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (o:Order)-[:Contains]->(:Product)
WHERE o.Order_Date STARTS WITH '01-Jan-21'
RETURN o.Customer_Type, SUM(o.Revenue) AS Total_Sales
ORDER BY Total_Sales DESC
LIMIT 1
```
[{'o.Customer_Type': 'Distributor', 'Total_Sales': 180120}]
  o.Customer_Type  Total_Sales
0     Distributor       180120


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: Given that the dataframe only contains one row, the customer type in that row is the one that contributed the most to sales in January 2021.
Action: python_repl_ast
Action Input: df.loc[0, 'o.Customer_Type'][0m[36;1m[1;3mDistributor[0m[32;1m[1;3mThe customer type that contributed the most to sales in January 2021 is 'Distributor'.
Final Answer: The customer type 'Distributor' contributed the most to sales in January 2021.[0m

[1m> Finished chain.[0m
Input Query - Which customer type contributed the most to sales in january 2021
Response - The customer type 'Distributor' contributed the most to sales in January 2021.
--- completion time 7.0643 seconds ---


In [70]:

# Start measuring time
start_time = time.time()

graph_df_answer("Which are top 5 SKUs which have highest inventory quantity?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (w:Warehouse)-[:Has_Inventory]->(iq:Inventory_Quantity)
RETURN iq.SKU_ID, SUM(iq.Current_Inventory_Quantity) AS Total_Inventory
ORDER BY Total_Inventory DESC
LIMIT 5
```
[{'iq.SKU_ID': '2429BA', 'Total_Inventory': 342553.0}, {'iq.SKU_ID': '2396CA', 'Total_Inventory': 124908.0}, {'iq.SKU_ID': '2418CA', 'Total_Inventory': 120860.0}, {'iq.SKU_ID': '2483BA', 'Total_Inventory': 107097.0}, {'iq.SKU_ID': '2471CA', 'Total_Inventory': 96647.0}]
  iq.SKU_ID  Total_Inventory
0    2429BA         342553.0
1    2396CA         124908.0
2    2418CA         120860.0
3    2483BA         107097.0
4    2471CA          96647.0


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: To answer this question, I need to sort the DataFrame `df` by the column `Total_Inventory` in descending order and then get the top 5 rows. 
Action: python_repl_ast
Action Input: df.sort_values(by='Total_Inventory', ascending=False).head(5)[0m[36;1m[1;3m  iq.SKU_ID  Total_Inventory
0    2429BA         342553.0
1    2396CA         124908.0
2    2418CA         120860.0
3    2483BA         107097.0
4    2471CA          96647.0[0m[32;1m[1;3mI now know that the top 5 SKUs with the highest inventory quantity are '2429BA', '2396CA', '2418CA', '2483BA', and '2471CA'.
Final Answer: The top 5 SKUs with the highest inventory quantity are '2429BA', '2396CA', '2418CA', '2483BA', and '2471CA'.[0m

[1m> Finished chain.[0m
Input Query - Which are top 5 SKUs which have highest inventory quantity?
Response - The top 5 SKUs with the highest inventory quantity are '2429BA', '2396CA', '2418CA', '2483BA', and '2471CA'.
--- completion time 10.6161 seconds ---


In [86]:

# Start measuring time
start_time = time.time()

graph_df_answer("Which are top 5 product names which have highest inventory quantity?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (p:Product)<-[:Contains]-(o:Order)-[:Supplied_by]->(w:Warehouse)-[:Has_Inventory]->(iq:Inventory_Quantity)
RETURN p.SKU_Name AS Product_Name, SUM(iq.Current_Inventory_Quantity) AS Total_Inventory
ORDER BY Total_Inventory DESC
LIMIT 5
```
[{'Product_Name': 'Product 259', 'Total_Inventory': 200436345.2460006}, {'Product_Name': 'Product 262', 'Total_Inventory': 199440447.9680005}, {'Product_Name': 'Product 264', 'Total_Inventory': 192655790.64400038}, {'Product_Name': 'Product 267', 'Total_Inventory': 192334071.89200047}, {'Product_Name': 'Product 260', 'Total_Inventory': 192004281.54600042}]
  Product_Name  Total_Inventory
0  Product 259     2.004363e+08
1  Product 262     1.994404e+08
2  Product 264     1.926558e+08
3  Product 267     1.923341e+08
4  Product 260     1.920043e+08


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mThought: To answer the user query we need to sort the dataframe based on the Total_Inventory column in descending order and then select the top 5 rows. After that, we can select the 'Product_Name' column from those rows. 
Action: python_repl_ast
Action Input: df.sort_values(by='Total_Inventory', ascending=False).head(5)['Product_Name'][0m[36;1m[1;3m0    Product 259
1    Product 262
2    Product 264
3    Product 267
4    Product 260
Name: Product_Name, dtype: object[0m[32;1m[1;3mThe top 5 products with the highest inventory quantity have been retrieved successfully. Now I can provide the final answer.
Final Answer: The top 5 products with the highest inventory quantity are 'Product 259', 'Product 262', 'Product 264', 'Product 267' and 'Product 260'.[0m

[1m> Finished chain.[0m
Input Query - Which are top 5 product names which have highest inventory quantity?
Response - The top 5 products with the highest inventory quantity are 'Product 259', 'Product 262', 'Product

In [72]:

# Start measuring time
start_time = time.time()

graph_df_answer("Which are the products that are risk of running out of inventory?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (w:Warehouse)-[:Has_Inventory]->(iq:Inventory_Quantity)
WHERE iq.Current_Inventory_Quantity < 10.0
MATCH (p:Product {SKU_ID: iq.SKU_ID})
RETURN p.SKU_ID, p.SKU_Name, iq.Current_Inventory_Quantity, w.Warehouse_Code
```
[{'p.SKU_ID': '1265CA', 'p.SKU_Name': 'Product 018', 'iq.Current_Inventory_Quantity': 6.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1283CA', 'p.SKU_Name': 'Product 021', 'iq.Current_Inventory_Quantity': 0.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1296CA', 'p.SKU_Name': 'Product 023', 'iq.Current_Inventory_Quantity': 4.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1325AA', 'p.SKU_Name': 'Product 029', 'iq.Current_Inventory_Quantity': 6.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1491CA', 'p.SKU_Name': 'Product 053', 'iq.Current_Inventory_Quantity': 0.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1842BA', 'p.SKU_Name': 'Product 090', 'iq.Current_Inventory_Quantity': 6.0, 'w.Warehouse_Code': 'GUT930'}, {'p.SKU_ID': '1857AA', 



[32;1m[1;3mThought: Since the question is asking about the products that are at risk of running out of inventory, we need to find the products with low inventory quantity. Here, we are not given a specific threshold for what constitutes "low inventory". So, I'll consider products with inventory quantity 0 as the ones at risk of running out.
Action: python_repl_ast
Action Input: df[df['iq.Current_Inventory_Quantity'] == 0]['p.SKU_Name'][0m[36;1m[1;3m1     Product 021
4     Product 053
9     Product 269
10    Product 273
17    Product 268
29    Product 271
30    Product 272
31    Product 274
39    Product 270
40    Product 276
Name: p.SKU_Name, dtype: object[0m[32;1m[1;3mI have now retrieved the names of the products that have an inventory quantity of 0. These are the products at risk of running out of inventory.
Final Answer: The products that are at risk of running out of inventory are Product 021, Product 053, Product 269, Product 273, Product 268, Product 271, Product 272, Pr

In [73]:

# Start measuring time
start_time = time.time()

graph_df_answer("what is the correlation between order quantity and revenue?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (o:Order)
RETURN o.Order_Quantity AS OrderQuantity, o.Revenue AS Revenue
```
[{'OrderQuantity': 105.0, 'Revenue': 742}, {'OrderQuantity': 151.0, 'Revenue': 20310}, {'OrderQuantity': 300.0, 'Revenue': 10426}, {'OrderQuantity': 50.0, 'Revenue': 6830}, {'OrderQuantity': 1000.0, 'Revenue': 103}, {'OrderQuantity': 6.0, 'Revenue': 8700}, {'OrderQuantity': 50.0, 'Revenue': 205}, {'OrderQuantity': 25.0, 'Revenue': 2877}, {'OrderQuantity': 98.0, 'Revenue': 693}, {'OrderQuantity': 3.0, 'Revenue': 1157}, {'OrderQuantity': 4.0, 'Revenue': 7575}, {'OrderQuantity': 105.0, 'Revenue': 52290}, {'OrderQuantity': 4.0, 'Revenue': 1771}, {'OrderQuantity': 300.0, 'Revenue': 224}, {'OrderQuantity': 63.0, 'Revenue': 9316}, {'OrderQuantity': 1.0, 'Revenue': 1018}, {'OrderQuantity': 1.0, 'Revenue': 29}, {'OrderQuantity': 20.0, 'Revenue': 702}, {'OrderQuantity': 2.0, 'Revenue': 124}, {'OrderQuantity': 4.0, 'Revenue': 1221}, {'OrderQuantity': 5.0, 'Revenue': 33000}, {'OrderQuantity



[32;1m[1;3mThought: To find the correlation between 'OrderQuantity' and 'Revenue', I need to use the corr() function on these two columns of the dataframe. Also, I need to check the number of rows in the dataframe. If it's only one row, then I need to return the values directly.
Action: python_repl_ast
Action Input: len(df)[0m[36;1m[1;3m33919[0m[32;1m[1;3mThe dataframe contains more than one row, so I can proceed to calculate the correlation between 'OrderQuantity' and 'Revenue'.
Action: python_repl_ast
Action Input: df['OrderQuantity'].corr(df['Revenue'])[0m[36;1m[1;3m0.2854755750224063[0m[32;1m[1;3mI now know the correlation between 'OrderQuantity' and 'Revenue'.
Final Answer: The correlation between 'OrderQuantity' and 'Revenue' is approximately 0.285.[0m

[1m> Finished chain.[0m
Input Query - what is the correlation between order quantity and revenue?
Response - The correlation between 'OrderQuantity' and 'Revenue' is approximately 0.285.
--- completion time 17.557

In [90]:

# Start measuring time
start_time = time.time()

graph_df_answer("what is next 3 value for revenue id predicted using linear regression?")

# End measuring time
end_time = time.time()

# Calculate latency
elapsed_time = end_time - start_time
print(f"--- completion time {elapsed_time:.4f} seconds ---")

Cypher Query -  ```cypher
MATCH (o:Order)
RETURN o.Order_Date AS date, o.Revenue AS revenue
ORDER BY o.Order_Date ASC
```
[{'date': '01-Apr-21', 'revenue': 722}, {'date': '01-Apr-21', 'revenue': 24986}, {'date': '01-Apr-21', 'revenue': 121}, {'date': '01-Apr-21', 'revenue': 162}, {'date': '01-Apr-21', 'revenue': 25635}, {'date': '01-Apr-21', 'revenue': 15822}, {'date': '01-Apr-21', 'revenue': 25686}, {'date': '01-Apr-21', 'revenue': 58}, {'date': '01-Apr-21', 'revenue': 6514}, {'date': '01-Apr-21', 'revenue': 2132}, {'date': '01-Apr-21', 'revenue': 1093}, {'date': '01-Apr-21', 'revenue': 1433}, {'date': '01-Apr-21', 'revenue': 4909}, {'date': '01-Apr-21', 'revenue': 224}, {'date': '01-Apr-21', 'revenue': 535}, {'date': '01-Apr-21', 'revenue': 915}, {'date': '01-Apr-21', 'revenue': 6328}, {'date': '01-Apr-21', 'revenue': 11634}, {'date': '01-Apr-21', 'revenue': 2584}, {'date': '01-Apr-21', 'revenue': 4167}, {'date': '01-Apr-21', 'revenue': 4805}, {'date': '01-Apr-21', 'revenue': 763}, {



[32;1m[1;3mThought: The question asks to predict the next three values of the 'revenue' column based on a linear regression model. However, the dataframe provided does not seem to have enough features to train a linear regression model. We only have 'date' and 'revenue', but 'date' is not a numerical value that can be used directly in a linear regression. 

Final Answer: It is not possible to predict the next three values of revenue using linear regression based on the given dataframe because there are not enough numerical features.[0m

[1m> Finished chain.[0m
Input Query - what is next 3 value for revenue id predicted using linear regression?
Response - It is not possible to predict the next three values of revenue using linear regression based on the given dataframe because there are not enough numerical features.
--- completion time 15.6659 seconds ---
