# Exp 4. This is a notebook using QAChain with trunks of graph stored in the Pinecone to communicate with the database

In [1]:
import pinecone
import openai
from dotenv import load_dotenv
import os
from neo4j import GraphDatabase
from langchain.vectorstores import Pinecone
import pinecone
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings

load_dotenv()
neo4j_url = os.getenv('NEO4J_URL')
neo4j_user = os.getenv('NEO4J_USER')
neo4j_password = os.getenv('NEO4J_PASSWORD')
openai_key = os.getenv('OPENAI_KEY')
os.environ['OPENAI_API_KEY'] = openai_key
pinecone_api_key = os.getenv('PINECONE_KEY')

embeddings = OpenAIEmbeddings()


  from tqdm.autonotebook import tqdm


# Divide graph into trunks

In [2]:
# Setting of Neo4j
driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

# Divide the graph into trunks
def get_pair_nodes (tx):
    pairs_of_nodes = []
    for record in tx.run("MATCH (a)-[r]->(b) RETURN labels(a), a.name, type(r), labels(b), b.name"):
        pair_node = {
            "start_node_name": record["a.name"],
            "start_node_label":record["labels(a)"][0],
            "end_node_name": record["b.name"],
            "end_node_label":record["labels(b)"][0],
            "edge": record["type(r)"]
        }
        pairs_of_nodes.append(pair_node)
        # print(record["labels(a)"][0], record["a.name"], record["type(r)"], record["labels(b)"][0], record["b.name"])
    return pairs_of_nodes

with driver.session() as session:
    pairs_of_nodes = session.execute_read(get_pair_nodes)
driver.close()
pairs_of_nodes

[{'start_node_name': 'Review_project_portfolio_balance_for_2020_Q1',
  'start_node_label': 'Areas_of_support',
  'end_node_name': 'Portfolio_management',
  'end_node_label': 'Our_Service_Offerings',
  'edge': 'deploy'},
 {'start_node_name': 'Conduct_project_assurance_readiness_for_Brexit_Programme',
  'start_node_label': 'Areas_of_support',
  'end_node_name': 'P3M_Assessment',
  'end_node_label': 'Our_Service_Offerings',
  'edge': 'deploy'},
 {'start_node_name': 'Identify_best_practice_regulatory_project_as_examplar',
  'start_node_label': 'Areas_of_support',
  'end_node_name': 'P3M_Assessment',
  'end_node_label': 'Our_Service_Offerings',
  'edge': 'deploy'},
 {'start_node_name': 'Identify_best_practice_regulatory_project_as_examplar',
  'start_node_label': 'Areas_of_support',
  'end_node_name': 'P3M_Support',
  'end_node_label': 'Our_Service_Offerings',
  'edge': 'deploy'},
 {'start_node_name': 'Cross-project_review_of_plastic_dependencies',
  'start_node_label': 'Areas_of_support',


In [3]:
node_vectors = []
for pair_node in pairs_of_nodes:
    start_node_name = pair_node["start_node_name"]
    edge = pair_node["edge"]
    end_node_name = pair_node["end_node_name"]
    start_node_label = pair_node["start_node_label"]
    end_node_label = pair_node["end_node_label"]
    pair_node["context"] = f"{start_node_name} is {edge} {end_node_name}. {start_node_name} is one type of {start_node_label}. {end_node_name} is one type of {end_node_label}."
    pair_node["source"] = "ps-graph"
    node_per_pair = {
        "value": embeddings.embed_documents([pair_node["context"]]),
        "meta_data": pair_node,
    } 
    node_vectors.append(node_per_pair)
node_vectors

[{'value': [[0.0004832131059630333,
    -0.029557246531250678,
    -0.0037083397389101693,
    -0.03144562597413471,
    -0.021045854184446145,
    0.026273108855620664,
    -0.029009891493742432,
    0.013896011661111865,
    -0.021319532634522836,
    -0.004933049871680792,
    0.012308677581989625,
    0.0013333260930218273,
    -0.0298035571363197,
    0.01130291067494164,
    -0.0020867963191896297,
    -0.007006162268366587,
    0.013615490784121971,
    -0.011439748968657418,
    -0.0024528410133365613,
    -0.0352771205499181,
    -0.02575311980047495,
    -0.00523067495330855,
    -0.019992193082973453,
    -0.01331444542036018,
    -0.01607859497216705,
    -0.007095107764641511,
    0.0189932676715161,
    -0.013711278241648812,
    0.0009219534637886043,
    0.004163329929332026,
    0.0013059582480141583,
    -0.0015163483236799722,
    -0.0028223066881094515,
    0.016242802042213067,
    -0.0077108838116527805,
    0.00033461439099019135,
    0.0034329511477664623,
    0

# Upload trunks of graph to Pinecone

In [4]:
# Setting of pinecone
env_name = "us-west4-gcp-free"
index_name = "test-chatbot-ran"
pinecone.init(api_key=pinecone_api_key,environment=env_name)
index = pinecone.Index(index_name)

# upload vectors 
my_namespace = 'graph_02'
index.delete(deleteAll='true', namespace=my_namespace)
pinecone_vectors = []
for idx, n_v in enumerate(node_vectors):
    pinecone_vectors.append((str(idx), n_v["value"], n_v["meta_data"]))
upsert_response = index.upsert(vectors=pinecone_vectors, namespace=my_namespace)

# Response with Pinecone API

In [5]:
pinecone.init(api_key=pinecone_api_key,environment=env_name)
index = pinecone.Index(index_name)

prompt = "what are the values of unilever create?" # vector created with OpenAI as well
query_vector = embeddings.embed_query(prompt)
response = index.query(
    top_k=10,
    vector=query_vector,
    include_metadata=True,
    namespace=my_namespace,
)
response

{'matches': [{'id': '46',
              'metadata': {'context': '1._Consumer_benefits is enabled_by '
                                      'Alignment_with_Product_lifecycle. '
                                      '1._Consumer_benefits is one type of '
                                      'Value_Unilever_create. '
                                      'Alignment_with_Product_lifecycle is one '
                                      'type of Business_Transformation_themes.',
                           'edge': 'enabled_by',
                           'end_node_label': 'Business_Transformation_themes',
                           'end_node_name': 'Alignment_with_Product_lifecycle',
                           'source': 'ps-graph',
                           'start_node_label': 'Value_Unilever_create',
                           'start_node_name': '1._Consumer_benefits'},
              'score': 0.819398165,
              'values': []},
             {'id': '49',
              'metadata': {'c

<font size=4>As we can see, the response return the similar vector from the prompt when using Pinecone API

# Use Langchain to search

# Option 1. Query using similarity search

In [6]:


# The environment should be the one specified next to the API key
# in your Pinecone console
pinecone.init(api_key=pinecone_api_key,environment=env_name)
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embeddings.embed_query, "context", namespace=my_namespace)


query = "what are the values of unilever create?"
docs = vectorstore.similarity_search(query)
docs

[Document(page_content='1._Consumer_benefits is enabled_by Alignment_with_Product_lifecycle. 1._Consumer_benefits is one type of Value_Unilever_create. Alignment_with_Product_lifecycle is one type of Business_Transformation_themes.', metadata={'edge': 'enabled_by', 'end_node_label': 'Business_Transformation_themes', 'end_node_name': 'Alignment_with_Product_lifecycle', 'source': 'ps-graph', 'start_node_label': 'Value_Unilever_create', 'start_node_name': '1._Consumer_benefits'}),
 Document(page_content='3._Improved_health_and_well_being is enabled_by Digitising_R&D. 3._Improved_health_and_well_being is one type of Value_Unilever_create. Digitising_R&D is one type of Business_Transformation_themes.', metadata={'edge': 'enabled_by', 'end_node_label': 'Business_Transformation_themes', 'end_node_name': 'Digitising_R&D', 'source': 'ps-graph', 'start_node_label': 'Value_Unilever_create', 'start_node_name': '3._Improved_health_and_well_being'}),
 Document(page_content='2._Top_&_bottom_line_grow

# Option 2. Use chain to qurey

In [7]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "what are the values of unilever create?"
docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' The values of Unilever Create are Consumer Benefits, Improved Health and Well-Being, Top and Bottom Line Growth, Innovation, Marketing, Sales, Consumer Insight, and Sourcing.\nSOURCES: ps-graph'}

In [8]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "how many types of migitigation associated with project risks and give their name please"
docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' There are 8 types of mitigation associated with project risks: Steering_group, Broad_stakeholder_representation, Appropriate_personnel, Link_with_digital_programme_team, Regular_progress_updates, Monitor_volume_of_change_programmes, Executive_sponsorship, and Identify_best_practice_regulatory_project_as_examplar.\nSOURCES: ps-graph'}

In [9]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "How many types of Strategy_themes and give the list of their names?"
docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' There are 8 types of Strategy_themes: Winning_with_brands_and_innovation, Winning_in_the_marketplace, Winning_with_people_, Winning_through_continuous_improvement, Agility_for_a_Changing_Market, Operating_with_Purpose, Partnerships_for_the_future, and Quality_and_Service_theme.\nSOURCES: ps-graph'}

In [10]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "What are the names of Mitigation associated with Poor_project_execution"
docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' The names of Mitigations associated with Poor_project_execution are Monitor_volume_of_change_programmes, Regular_progress_updates, Executive_sponsorship, Link_with_digital_programme_team, Broad_stakeholder_representation, Appropriate_personnel, and Steering_group.\nSOURCES: ps-graph'}

In [11]:
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "How many types of areas of support and give the list of their names?"
docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' There are 8 types of Areas of Support: Identify_best_practice_regulatory_project_as_examplar, Cross-project_review_of_plastic_dependencies, Review_project_portfolio_balance_for_2020_Q1, Rescope_large_project_which_has_failed_to_deliver_in_2020, Conduct_project_assurance_readiness_for_Brexit_Programme, Home_Care_as_growth_driver, P3M_Optimisation, and P3M_Assessment.\nSOURCES: ps-graph'}

# QA-Chain with Source returns reasonable output. However, it still depends on the top K search. BUT the result looks pretty good.