In [1]:
import json
import random
import os 
from dotenv import load_dotenv
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import json 
from langchain.embeddings.openai import OpenAIEmbeddings


# Load variables from a local .env file into the process environment.
load_dotenv()

openai_key = os.getenv('OPENAI_KEY')
if not openai_key:
    # Assigning None to os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with an actionable message.
    raise RuntimeError(
        "OPENAI_KEY is not set; define it in the environment or in the .env file."
    )
# Re-export the key under the OPENAI_API_KEY name before building the embedder.
os.environ['OPENAI_API_KEY'] = openai_key
embeddings = OpenAIEmbeddings()

# Pinecone settings are only read here; they are used by the Pinecone cells
# further down, so they are not validated at this point.
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')

  from tqdm.autonotebook import tqdm


# Load embeddings of PDF and graphs

In [2]:
# Read the PDF embedding dictionary from disk.
# The encoding is pinned to UTF-8 so the non-ASCII characters in the chunk text
# decode the same way on every platform (open() otherwise uses the locale default).
with open('pdf_embedding.json', 'r', encoding='utf-8') as f:
    pdf_embedding = json.load(f)

In [3]:
# Read the graph embedding dictionary from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open('graph_embedding.json', 'r', encoding='utf-8') as f:
    graph_embedding = json.load(f)

In [4]:
# Show the top-level keys of the loaded PDF embedding dictionary.
pdf_embedding.keys()

dict_keys(['text', 'vectors', 'categories'])

In [5]:
# Show the top-level keys of both embedding dictionaries side by side.
pdf_embedding.keys(), graph_embedding.keys()

(dict_keys(['text', 'vectors', 'categories']),
 dict_keys(['text', 'vectors', 'categories']))

# Choose Examples

In [6]:
# Pick a random chunk.
# random.randrange excludes the upper bound; random.randint(0, n) includes n,
# which is one past the last valid index and would raise IndexError below.
select_idx = random.randrange(len(pdf_embedding["text"]))
# The text entry is indexed with [0] to get the chunk string itself.
test_chunk = pdf_embedding["text"][select_idx][0]
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_chunk}")

The test chunk of PDF is the 1131 of embedding vectors and its content is: 
 Iran-related required disclosure Unilever operates in Iran through a non-US subsidiary. In 2019, sales in Iran were significantly less than one percent of Unilever’s worldwide turnover. During the year, this non-US subsidiary had approximately €1,334 in gross revenues and less than €547 in net profits attributable to the sale of food, personal care and home care products to the Hotel Homa Group, which is owned by the Social Security Organization of Iran, and IRR Mohammad Rasoullah Pharmacy & Kowsar ‘Veterans of IRGC’, which are affiliated with the Islamic Republic Revolutionary Guard Corps. Income, payroll and other taxes, duties and fees (including for utilities) were payable to the Government of Iran and affiliated entities in connection with our operations. Our non-US subsidiary maintains bank accounts in Iran with various banks to facilitate our business in the country and make any required payments to the

# Get the top-K neighbourhood list

# Option 1. Use Pinecone Vectorstore to perform similarity search

<font size=4 color=green> This option is suitable for deployment, but not for research, as the vector database has to be uploaded first and then queried through Pinecone's own API.</font>

In [7]:
# Initialise the Pinecone client with the API key and environment name read
# from the environment in the first cell, then get a handle to the index named
# by pinecone_index_name.
pinecone.init(api_key=pinecone_api_key,environment=pinecone_env_name)
index = pinecone.Index(pinecone_index_name)

In [8]:
# Wrap the Pinecone index as a LangChain vector store restricted to the
# "chatbot-testing" namespace; embeddings.embed_query is passed in as the query embedder.
# NOTE(review): "text" is presumably the metadata field holding each chunk's
# raw text — confirm against how the index was populated.
vectorstore = Pinecone(index,embeddings.embed_query, "text", namespace="chatbot-testing")

In [9]:
# Run a similarity search for the sampled chunk, asking for 5 results (k=5),
# and display the first 3 of them.
neighbouhood_docs = vectorstore.similarity_search(test_chunk, k=5)
neighbouhood_docs[:3]

[Document(page_content='Iran-related required disclosure Unilever operates in Iran through a non-US subsidiary. In 2019, sales in Iran were significantly less than one percent of Unilever’s worldwide turnover. During the year, this non-US subsidiary had approximately €1,334 in gross revenues and less than €547 in net profits attributable to the sale of food, personal care and home care products to the Hotel Homa Group, which is owned by the Social Security Organization of Iran, and IRR Mohammad Rasoullah Pharmacy & Kowsar ‘Veterans of IRGC’, which are affiliated with the Islamic Republic Revolutionary Guard Corps. Income, payroll and other taxes, duties and fees (including for utilities) were payable to the Government of Iran and affiliated entities in connection with our operations. Our non-US subsidiary maintains bank accounts in Iran with various banks to facilitate our business in the country and make any required payments to the Government of Iran and affiliated entities. While we

# Option 2. Use k-nearest-neighbour (KNN) search

<font size=4 color=green> This option is suitable for research at the development stage, but not for deployment, as it requires aligning the local data structure.</font>

In [10]:
import numpy as np
import heapq

def cosine_similarity(a, b):
    """Compute the cosine similarity between vector a and vector b.

    Args:
        a: Sequence or array of numbers (intended for 1-D vectors).
        b: Sequence or array of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero norm
        the ratio is undefined, and 0.0 is returned instead of NaN.
    """
    dot_product = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Dividing here would yield NaN (plus a runtime warning), and NaN
        # scores make any later ordering of similarities unreliable.
        return 0.0
    return dot_product / denominator

def find_most_similar_vectors(input_vector, vector_list, top_k):
    """Return the indices of the top_k entries of vector_list closest to input_vector.

    Every entry of ``vector_list`` is indexed with ``[0]`` to obtain the vector
    that is compared against ``input_vector`` using ``cosine_similarity``.
    Indices are returned in descending order of similarity.
    """
    # Lazily pair each similarity score with the position it came from.
    scored_positions = (
        (cosine_similarity(input_vector, candidate[0]), position)
        for position, candidate in enumerate(vector_list)
    )

    # Keep only the top_k highest-scoring (score, position) pairs.
    best_pairs = heapq.nlargest(top_k, scored_positions)

    return [position for _, position in best_pairs]

In [11]:
# Use the stored embedding of the sampled chunk as the query; the entry is
# indexed with [0] to reach the vector itself.
input_vector = pdf_embedding["vectors"][select_idx][0]
vector_list = pdf_embedding["vectors"]
# Rank every stored PDF vector against the query and keep the 5 best indices.
top_k_indices = find_most_similar_vectors(input_vector, vector_list, top_k=5)
# Map the indices back to their text entries and display the first 3.
similar_chunks = [pdf_embedding["text"][idx] for idx in top_k_indices]
similar_chunks [:3]

[['Iran-related required disclosure Unilever operates in Iran through a non-US subsidiary. In 2019, sales in Iran were significantly less than one percent of Unilever’s worldwide turnover. During the year, this non-US subsidiary had approximately €1,334 in gross revenues and less than €547 in net profits attributable to the sale of food, personal care and home care products to the Hotel Homa Group, which is owned by the Social Security Organization of Iran, and IRR Mohammad Rasoullah Pharmacy & Kowsar ‘Veterans of IRGC’, which are affiliated with the Islamic Republic Revolutionary Guard Corps. Income, payroll and other taxes, duties and fees (including for utilities) were payable to the Government of Iran and affiliated entities in connection with our operations. Our non-US subsidiary maintains bank accounts in Iran with various banks to facilitate our business in the country and make any required payments to the Government of Iran and affiliated entities. While we currently continue o

# <font color=blue>As we can see, Pinecone uses k-nearest-neighbour search to find similar documents, since the results are exactly the same. The similarity metric depends on the configuration of the Pinecone index.</font>

# Similarly for Graph Nodes and Edges

In [12]:
# Pick a random graph element.
# random.randrange excludes the upper bound; random.randint(0, n) includes n,
# which is one past the last valid index and would raise IndexError below.
select_idx = random.randrange(len(graph_embedding["text"]))
test_node = graph_embedding["text"][select_idx]
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_node}")

The test chunk of PDF is the 55 of embedding vectors and its content is: 
 Innovation_theme


In [13]:
# Use the stored embedding of the sampled graph element as the query; the entry
# is indexed with [0] to reach the vector itself.
input_vector = graph_embedding["vectors"][select_idx][0]
vector_list = graph_embedding["vectors"]
# Rank every stored graph vector against the query and keep the 5 best indices.
top_k_indices = find_most_similar_vectors(input_vector, vector_list, top_k=5)
# Map the indices back to the graph element names and display them.
similar_nodes = [graph_embedding["text"][idx] for idx in top_k_indices]
similar_nodes

['Innovation_theme',
 'Strategy_theme',
 'Quality_and_Service_theme',
 'Business_Transformation_themes',
 '1._Innovation']

# <font color=red>Get the neighbour graph elements and chunks

In [17]:
def get_closet_graph_elements(input_text, graph_embedding, top_k=5):
    """Return the graph elements whose embeddings are closest to ``input_text``.

    Args:
        input_text: Query string; it is embedded with the notebook-level
            ``embeddings`` object.
        graph_embedding: Mapping with parallel ``"vectors"`` and ``"text"``
            lists, in the layout ``find_most_similar_vectors`` expects.
        top_k: Number of elements to return (default 5, the value that was
            previously hard-coded).

    Returns:
        The ``graph_embedding["text"]`` entries at the indices returned by
        ``find_most_similar_vectors``, in that order.
    """
    query_vector = embeddings.embed_query(input_text)
    # Search with the freshly embedded query. The notebook-global
    # ``input_vector`` was used here before, which ignored ``input_text``.
    top_k_indices = find_most_similar_vectors(
        query_vector, graph_embedding["vectors"], top_k=top_k
    )

    return [graph_embedding["text"][idx] for idx in top_k_indices]

def get_closet_pdf_elements(input_text, pdf_embedding, top_k=5):
    """Return the PDF chunks whose embeddings are closest to ``input_text``.

    Args:
        input_text: Query string; it is embedded with the notebook-level
            ``embeddings`` object.
        pdf_embedding: Mapping with parallel ``"vectors"`` and ``"text"``
            lists, in the layout ``find_most_similar_vectors`` expects.
        top_k: Number of chunks to return (default 5, the value that was
            previously hard-coded).

    Returns:
        The ``pdf_embedding["text"]`` entries at the indices returned by
        ``find_most_similar_vectors``, in that order.
    """
    query_vector = embeddings.embed_query(input_text)
    # Search the PDF vectors with the freshly embedded query. The previous
    # version searched the global ``graph_embedding`` with the global
    # ``input_vector``, so both arguments of this function were ignored.
    top_k_indices = find_most_similar_vectors(
        query_vector, pdf_embedding["vectors"], top_k=top_k
    )
    return [pdf_embedding["text"][idx] for idx in top_k_indices]

["c\n\nc\n\no\n\nu\n\nn\n\nt\n\ns\n\n2\n\n0\n\n1\n\n9\n\nPurpose-led, future-fit\n\nUnilever Annual Report and Accounts 2019\n\nUnilever Annual Report and Accounts 2019 This document is made up of the Strategic Report, the Governance Report, the Financial Statements and Notes, and Additional Information for US Listing Purposes.\n\nIn this report\n\nStrategic Report How our strategy is delivering value for our stakeholders\n\nAt a glance\n\n2\n\nThe Unilever Group consists of Unilever N.V. (NV) and Unilever PLC (PLC) together with the companies they control. The terms “Unilever”, the “Group”, “we”, “our” and “us” refer to the Unilever Group.\n\nChairman's introduction\n\n4\n\nOur Board of Directors\n\n5"]

In [19]:
# Display a single PDF text entry as a worked example.
# NOTE(review): index 100 looks like an arbitrary fixed pick — confirm, or move
# it to a named constant.
example_pdf_chunk = pdf_embedding["text"][100]
example_pdf_chunk

['We’re making good progress at management level. Women held 51% of our managerial roles as of December 2019 and our efforts have been recognised – we were featured in the Bloomberg Gender Equality Index in 2019. Despite this, there is still work to be done to ensure a balanced representation of women at senior management level and above. Among the various initiatives to address this, we have two targeted programmes to develop our senior women and create a healthy pipeline of talent.\n\nOur aim is simple: to be a diverse and inclusive workplace where people with purpose thrive.\n\nFit for the future']