In [1]:
import json
import random
import os 
from dotenv import load_dotenv
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import json 
from langchain.embeddings.openai import OpenAIEmbeddings


load_dotenv()
openai_key = os.getenv('OPENAI_KEY')
os.environ['OPENAI_API_KEY'] = openai_key
embeddings = OpenAIEmbeddings()
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')

  from tqdm.autonotebook import tqdm


# Load embeddings of PDF and graphs

In [2]:
with open('pdf_embedding.json', 'r') as f:
    # Use json.load to load the data from the file.
    pdf_embedding = json.load(f)

In [3]:
with open('graph_embedding.json', 'r') as f:
    # Use json.load to load the data from the file.
    graph_embedding = json.load(f)

In [4]:
pdf_embedding.keys()

dict_keys(['text', 'vectors', 'categories'])

In [5]:
pdf_embedding.keys(), graph_embedding.keys()

(dict_keys(['text', 'vectors', 'categories']),
 dict_keys(['text', 'vectors', 'categories']))

# Choose Examples

In [6]:
# Pick up random chunk
select_idx = random.randint(0, len(pdf_embedding["text"]))
test_chunk = pdf_embedding["text"][select_idx][0]
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_chunk}")

The test chunk of PDF is the 938 of embedding vectors and its content is: 
 (288)

income tax rate of 25% (2018: 25%)

Tax effects of:

Income not subject to tax (primarily

Financial guarantees Where the Company enters into financial guarantee contracts to guarantee the indebtedness of other companies within its group, the Company considers these to be insurance arrangements and accounts for them as such. In this respect, the Company treats the guarantee contract as a contingent liability until such time as it becomes probable that the Company will be required to make a payment under the guarantee.

4,528

87

tax exempt dividends)

(35)

(27)

Non recoverable withholding tax

–

(10)

(Under)/over provided in prior years

79

(12)

Reduction in tax rate on deferred tax balances

Utilisation of prior year tax credit

78

85

(15)

29

Other

IFRS 16 - Leases Unilever N.V. does not have any lease arrangements on a standalone basis and so there is no impact of IFRS 16.

(135)

Total tax

# Get the TopK neighbourhod list

# Option 1. Use Pinecone Vectorstore to perform similarity search

<font size=4 color=green> This option is suitable for delopyment, but not suitable for research as it needs to upload the vector database first and then use their own API.

In [7]:
pinecone.init(api_key=pinecone_api_key,environment=pinecone_env_name)
index = pinecone.Index(pinecone_index_name)

In [8]:
vectorstore = Pinecone(index,embeddings.embed_query, "text", namespace="chatbot-testing")

In [9]:
neighbouhood_docs = vectorstore.similarity_search(test_chunk, k=5)
neighbouhood_docs[:3]

[Document(page_content='Financial guarantees Where the Company enters into financial guarantee contracts to guarantee the indebtedness of other companies within its group, the Company considers these to be insurance arrangements and accounts for them as such. In this respect, the Company treats the guarantee contract as a contingent liability until such time as it becomes probable that the Company will be required to make a payment under the guarantee.\n\n4,528\n\n87\n\ntax exempt dividends)\n\n(35)\n\n(27)\n\nNon recoverable withholding tax\n\n–\n\n(10)\n\n(Under)/over provided in prior years\n\n79\n\n(12)\n\nReduction in tax rate on deferred tax balances\n\nUtilisation of prior year tax credit\n\n78\n\n85\n\n(15)\n\n29\n\nOther\n\nIFRS 16 - Leases Unilever N.V. does not have any lease arrangements on a standalone basis and so there is no impact of IFRS 16.\n\n(135)\n\nTotal tax expense\n\n(136)\n\n5. Intangible assets\n\n€ million\n\n€ million Indefinite- life intangible assets\n\n€ 

# Option 2. Use KNN nearest neighbour search

<font size=4 color=green> This option is suitable for research at development stage, but not suitable for delopyment as it needs to align the local data structure

In [10]:
import numpy as np
import heapq

def cosine_similarity(a, b):
    """Compute the cosine similarity between vector a and vector b"""
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

def find_most_similar_vectors(input_vector, vector_list, top_k):
    """Find the topK most similar vectors to the input_vector in the vector_list"""
    similarity_scores = []

    for i, vector in enumerate(vector_list):
        similarity_score = cosine_similarity(input_vector, vector[0])
        similarity_scores.append((similarity_score, i))

    # Get the topK vectors
    top_k_vectors = heapq.nlargest(top_k, similarity_scores)

    # Indices of the topK vectors
    top_k_indices = [item[1] for item in top_k_vectors]

    return top_k_indices

In [11]:
input_vector = pdf_embedding["vectors"][select_idx][0]
vector_list = pdf_embedding["vectors"]
top_k_indices = find_most_similar_vectors(input_vector, vector_list, top_k=5)
similar_chunks = [pdf_embedding["text"][idx] for idx in top_k_indices]
similar_chunks [:3]

[['(288)\n\nincome tax rate of 25% (2018: 25%)\n\nTax effects of:\n\nIncome not subject to tax (primarily\n\nFinancial guarantees Where the Company enters into financial guarantee contracts to guarantee the indebtedness of other companies within its group, the Company considers these to be insurance arrangements and accounts for them as such. In this respect, the Company treats the guarantee contract as a contingent liability until such time as it becomes probable that the Company will be required to make a payment under the guarantee.\n\n4,528\n\n87\n\ntax exempt dividends)\n\n(35)\n\n(27)\n\nNon recoverable withholding tax\n\n–\n\n(10)\n\n(Under)/over provided in prior years\n\n79\n\n(12)\n\nReduction in tax rate on deferred tax balances\n\nUtilisation of prior year tax credit\n\n78\n\n85\n\n(15)\n\n29\n\nOther\n\nIFRS 16 - Leases Unilever N.V. does not have any lease arrangements on a standalone basis and so there is no impact of IFRS 16.\n\n(135)\n\nTotal tax expense\n\n(136)\n\n5.

# <font color =blue>As we can see, the pinecone use KNN nearest neigoubour search to find similar documents as the results are exactly same. The metric is depened on the setting of pinecone database.

# Similar for Graph Nodes and Edges

In [12]:
select_idx = random.randint(0, len(graph_embedding["text"]))
test_node = graph_embedding["text"][select_idx]
print (f"The test chunk of PDF is the {select_idx} of embedding vectors and its content is: \n {test_node}")

The test chunk of PDF is the 14 of embedding vectors and its content is: 
 1._Consumer_benefits


In [13]:
input_vector = graph_embedding["vectors"][select_idx][0]
vector_list = graph_embedding["vectors"]
top_k_indices = find_most_similar_vectors(input_vector, vector_list, top_k=5)
similar_nodes = [graph_embedding["text"][idx] for idx in top_k_indices]
similar_nodes

['1._Consumer_benefits',
 '7._Consumer_insight_',
 '6._Consumer_Use',
 '1._Innovation',
 '3._Improved_health_and_well_being']

# <font color=red>Get the neighbour graph elements and chunks

In [14]:
def get_closet_graph_elements(input_text, graph_embedding):
    query_vector = embeddings.embed_query(input_text)
    top_k_indices = find_most_similar_vectors(input_vector, graph_embedding["vectors"], top_k=5)
    
    return [graph_embedding["text"][idx] for idx in top_k_indices]

def get_closet_pdf_elements(input_text, pdf_embedding):
    query_vector = embeddings.embed_query(input_text)
    top_k_indices = find_most_similar_vectors(input_vector, pdf_embedding["vectors"], top_k=5)
    return [pdf_embedding["text"][idx] for idx in top_k_indices]

In [15]:
example_pdf_chunk = pdf_embedding["text"][100]
example_pdf_chunk

['We’re making good progress at management level. Women held 51% of our managerial roles as of December 2019 and our efforts have been recognised – we were featured in the Bloomberg Gender Equality Index in 2019. Despite this, there is still work to be done to ensure a balanced representation of women at senior management level and above. Among the various initiatives to address this, we have two targeted programmes to develop our senior women and create a healthy pipeline of talent.\n\nOur aim is simple: to be a diverse and inclusive workplace where people with purpose thrive.\n\nFit for the future']

In [16]:
get_closet_graph_elements(example_pdf_chunk[0], graph_embedding)

['1._Consumer_benefits',
 '7._Consumer_insight_',
 '6._Consumer_Use',
 '1._Innovation',
 '3._Improved_health_and_well_being']

In [17]:
get_closet_pdf_elements(example_pdf_chunk[0], pdf_embedding)

[['As the ultimate user of our products, consumers continue to look for quality products that are convenient and good value – and increasingly want more natural ingredients and less packaging and waste. We also know that brands that demonstrate a meaningful purpose create conversations and brand loyalty, particularly among younger generations.\n\nFor more on consumers see pages 14 to 15.\n\nOur people Without talented and committed employees, we could never deliver on our ambitions.\n\nInterests and concerns\n\nHow we engaged in 2019\n\nConsiderations and outcomes'],
 ['Fragmentation remains a principal driver of change, impacting consumer journeys, route-to-market channels and media, and brand spend. Consumers are taking different paths to purchase, often combining offline and online channels where influencers are a growing force. Younger consumers continue to prioritise meaning over materialism and are demanding more authenticity, transparency and natural ingredients. The talkability