In [1]:
import json
import random
import os 
from dotenv import load_dotenv
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import json 
from langchain.embeddings.openai import OpenAIEmbeddings


# Read credentials and index settings from the environment (after load_dotenv()).
load_dotenv()
openai_key = os.getenv('OPENAI_KEY')
if openai_key is None:
    # os.environ only accepts str values; assigning None would fail with an
    # opaque TypeError, so fail early with an actionable message instead.
    raise RuntimeError(
        "OPENAI_KEY is not set; define it in the environment or in the .env file."
    )
# Re-export the key under OPENAI_API_KEY before the embeddings client is created
# (presumably the variable OpenAIEmbeddings() looks up -- confirm against its docs).
os.environ['OPENAI_API_KEY'] = openai_key
embeddings = OpenAIEmbeddings()
# Pinecone settings are only needed by the Pinecone cells (Option 1), so they are
# deliberately not validated here; they may be None if Option 1 is not used.
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')

  from tqdm.autonotebook import tqdm


# Load embeddings of PDF and graphs

In [2]:
# Load the precomputed PDF embeddings. JSON is UTF-8 by specification, and the
# chunk text contains non-ASCII characters, so the encoding is passed explicitly
# instead of relying on the platform default.
with open('pdf_embedding.json', 'r', encoding='utf-8') as f:
    # Parse the whole file into a Python object.
    pdf_embedding = json.load(f)

In [3]:
# Load the precomputed graph embeddings. JSON is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform default.
with open('graph_embedding.json', 'r', encoding='utf-8') as f:
    # Parse the whole file into a Python object.
    graph_embedding = json.load(f)

In [4]:
# Display the top-level keys of the loaded PDF embedding dictionary.
pdf_embedding.keys()

dict_keys(['text', 'vectors', 'categories'])

In [5]:
# Display the top-level keys of both embedding dictionaries side by side to
# check that they share the same layout.
pdf_embedding.keys(), graph_embedding.keys()

(dict_keys(['text', 'vectors', 'categories']),
 dict_keys(['text', 'vectors', 'categories']))

# Choose Examples

In [6]:
# Pick a random chunk.
# randrange(n) draws from 0..n-1, so the index is always valid; the previous
# randint(0, n) included n itself and could raise IndexError.
select_idx = random.randrange(len(pdf_embedding["text"]))
# Each entry of pdf_embedding["text"] wraps the chunk text in a one-element list.
test_chunk = pdf_embedding["text"][select_idx][0]
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_chunk}")

The test chunk of PDF is the 355 of embedding vectors and its content is: 
 Nationality Dutch Age 53, Male Appointed to ULE January 2016 Joined Unilever 1990 Previous Unilever posts include: Unilever East Africa and Emerging Markets (EVP); Chief Procurement Officer; Supply Chain, Spreads, Dressings and Olive Oil Europe (VP); Ice Cream Brazil (Managing Director); Ice Cream Brazil (VP); Corporate Strategy Group; Birds Eye Wall’s, Unilever UK (Operations Manager). Current external appointments: A. P. Møller Mærsk (Supervisory Board member).

Nationality Dutch Age 50, Female Appointed to ULE January 2018 Joined Unilever 2018 Previous posts include: Royal Ahold Delhaize (CEIO & EC member); Royal Ahold (CCO & EC member); P&G (VP & GM). Previous Unilever posts include: Europe (President). Current external appointments: Bayer AG (Supervisory Board member); Food Drink Europe (Board member); Leading Executives Advancing Diversity (LEAD) (Advisory Board member); Pepsi/ Lipton JV (Board member).


# Get the Top-K neighbourhood list

# Option 1. Use Pinecone Vectorstore to perform similarity search

<font size=4 color=green>This option is suitable for deployment, but not for research, because the vector database has to be uploaded to Pinecone first and then queried through Pinecone's own API.</font>

In [7]:
# Initialise the Pinecone client with the API key and environment name read
# from the environment, then get a handle on the configured index.
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_env_name,
)
index = pinecone.Index(pinecone_index_name)

In [8]:
# Wrap the Pinecone index as a LangChain vector store. Queries are embedded with
# embeddings.embed_query; "text" is presumably the metadata field that holds the
# chunk text -- confirm against how the index was populated.
vectorstore = Pinecone(
    index,
    embeddings.embed_query,
    "text",
    namespace="chatbot-testing",
)

In [9]:
# Ask the vector store for the 5 documents most similar to the test chunk,
# then display them as the cell output.
neighbouhood_docs = vectorstore.similarity_search(
    test_chunk,
    k=5,
)
neighbouhood_docs

[Document(page_content='Nationality Dutch Age 53, Male Appointed to ULE January 2016 Joined Unilever 1990 Previous Unilever posts include: Unilever East Africa and Emerging Markets (EVP); Chief Procurement Officer; Supply Chain, Spreads, Dressings and Olive Oil Europe (VP); Ice Cream Brazil (Managing Director); Ice Cream Brazil (VP); Corporate Strategy Group; Birds Eye Wall’s, Unilever UK (Operations Manager). Current external appointments: A. P. Møller Mærsk (Supervisory Board member).\n\nNationality Dutch Age 50, Female Appointed to ULE January 2018 Joined Unilever 2018 Previous posts include: Royal Ahold Delhaize (CEIO & EC member); Royal Ahold (CCO & EC member); P&G (VP & GM). Previous Unilever posts include: Europe (President). Current external appointments: Bayer AG (Supervisory Board member); Food Drink Europe (Board member); Leading Executives Advancing Diversity (LEAD) (Advisory Board member); Pepsi/ Lipton JV (Board member).', metadata={'source': 'docs/unilever-annual-report-

# Option 2. Use KNN nearest neighbour search

<font size=4 color=green>This option is suitable for research at the development stage, but not for deployment, because the search code has to be aligned with the local data structure.</font>

In [10]:
import numpy as np
import heapq

def cosine_similarity(a, b):
    """Compute the cosine similarity between vector a and vector b.

    Args:
        a: First vector (any 1-D array-like accepted by NumPy).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        direction is undefined and 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (plus a RuntimeWarning). A NaN score would also make
        # score-based ranking unreliable, since NaN compares false to everything.
        return 0.0
    return np.dot(a, b) / norm_product

def find_most_similar_vectors(input_vector, vector_list, top_k):
    """Return the indices of the top_k entries of vector_list closest to input_vector.

    Each entry of ``vector_list`` is indexed with ``[0]`` before comparison, i.e.
    entries are expected to wrap the actual vector in a one-element sequence.
    Closeness is measured with ``cosine_similarity``. The returned indices are
    ordered from most to least similar; equal scores are ordered by the larger
    index first, because (score, index) tuples are compared.
    """
    # Pair every candidate's score with its position in vector_list.
    scored_positions = [
        (cosine_similarity(input_vector, candidate[0]), position)
        for position, candidate in enumerate(vector_list)
    ]

    # Keep only the top_k best (score, position) pairs, best first.
    best_pairs = heapq.nlargest(top_k, scored_positions)

    # Drop the scores; callers only need the positions.
    return [position for _, position in best_pairs]

In [11]:
# Search the PDF embeddings for the 5 vectors closest to the selected chunk.
vector_list = pdf_embedding["vectors"]
# Stored vectors are wrapped in a one-element list, hence the trailing [0].
input_vector = vector_list[select_idx][0]
top_k_indices = find_most_similar_vectors(
    input_vector,
    vector_list,
    top_k=5,
)
# Map the winning indices back to their text entries and display them.
similar_chunks = [pdf_embedding["text"][hit] for hit in top_k_indices]
similar_chunks

[['Nationality Dutch Age 53, Male Appointed to ULE January 2016 Joined Unilever 1990 Previous Unilever posts include: Unilever East Africa and Emerging Markets (EVP); Chief Procurement Officer; Supply Chain, Spreads, Dressings and Olive Oil Europe (VP); Ice Cream Brazil (Managing Director); Ice Cream Brazil (VP); Corporate Strategy Group; Birds Eye Wall’s, Unilever UK (Operations Manager). Current external appointments: A. P. Møller Mærsk (Supervisory Board member).\n\nNationality Dutch Age 50, Female Appointed to ULE January 2018 Joined Unilever 2018 Previous posts include: Royal Ahold Delhaize (CEIO & EC member); Royal Ahold (CCO & EC member); P&G (VP & GM). Previous Unilever posts include: Europe (President). Current external appointments: Bayer AG (Supervisory Board member); Food Drink Europe (Board member); Leading Executives Advancing Diversity (LEAD) (Advisory Board member); Pepsi/ Lipton JV (Board member).'],
 ['Nationality Dutch Age 52, Male Appointed to ULE January 2016 Joine

# <font color=red>As we can see, Pinecone uses KNN (nearest-neighbour) search to find similar documents, since the results are exactly the same. The similarity metric depends on the settings of the Pinecone database.</font>

# Similarly for Graph Nodes and Edges

In [14]:
# Pick a random graph node.
# randrange(n) draws from 0..n-1, so the index is always valid; the previous
# randint(0, n) included n itself and could raise IndexError.
select_idx = random.randrange(len(graph_embedding["text"]))
test_node = graph_embedding["text"][select_idx]
# NOTE(review): the message below still says "test chunk of PDF" although a
# graph node is printed; the wording is kept unchanged here.
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_node}")

The test chunk of PDF is the 39 of embedding vectors and its content is: 
 Acquisition_Programme_1


In [16]:
# Search the graph embeddings for the 5 vectors closest to the selected node.
vector_list = graph_embedding["vectors"]
# Stored vectors are wrapped in a one-element list, hence the trailing [0].
input_vector = vector_list[select_idx][0]
top_k_indices = find_most_similar_vectors(
    input_vector,
    vector_list,
    top_k=5,
)
# Map the winning indices back to their node names and display them.
similar_nodes = [graph_embedding["text"][hit] for hit in top_k_indices]
similar_nodes

['Acquisition_Programme_1',
 'Acquisition_programme_2',
 'Disposal_programme_1',
 'Data_management_enhancement_programme_1',
 'Org._transformation_programme_1']