In [1]:
import os 
from dotenv import load_dotenv
import pinecone
import numpy as np
from nomic import atlas
import nomic
from tqdm import tqdm 

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# The OpenAI key is stored under OPENAI_KEY and re-exported as OPENAI_API_KEY.
# os.environ only accepts strings, so a missing key would otherwise surface as an
# opaque "TypeError: str expected, not NoneType"; fail with a clear message instead.
openai_key = os.getenv('OPENAI_KEY')
if openai_key is None:
    raise RuntimeError(
        "OPENAI_KEY is not set; define it in the environment or in the .env file."
    )
os.environ['OPENAI_API_KEY'] = openai_key

# Credentials and identifiers for Pinecone and Nomic; each is None when unset.
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')
nomic_api_key = os.getenv('NOMIC_KEY')

# Pinecone settings bundled in one dict.
pinecone_config = {
    "api_key": pinecone_api_key,
    "env_name": pinecone_env_name,
    "index_name": pinecone_index_name,
}


  from tqdm.autonotebook import tqdm


# Load the Pinecone Vector Store

In [2]:
# Load vector store: initialise the Pinecone client and open a handle to the index.
# Namespace name for the PDF chunks; the visible later cells pass the same literal directly.
my_namespace = 'Unilever-2018-2019'
# Credentials and index name come from the environment variables read in the first cell.
pinecone.init(api_key=pinecone_api_key,environment=pinecone_env_name)
index = pinecone.Index(pinecone_index_name)

In [3]:
# Show the index configuration (the output below reports metric, dimension, pods, status).
pinecone.describe_index(pinecone_index_name)


IndexDescription(name='test-chatbot-ran', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [4]:
# Show per-namespace vector counts; these bound how many ids can be collected per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {'Unilever-2018-2019': {'vector_count': 1999},
                'graph': {'vector_count': 119},
                'graph_02': {'vector_count': 134}},
 'total_vector_count': 2252}

# Fetch Embedded Chunks

In [5]:
# The ids stored in the index cannot be listed directly, so they are collected by
# querying with random vectors and recording the ids of the matches.
# (A single Pinecone fetch returns at most 1000 vectors.)
def fetch_id_list(index, number_ids=1999, my_namespace="Unilever-2018-2019",
                  top_k=20, dimension=1536, max_stale_queries=100):
    """Collect up to ``number_ids`` distinct vector ids from a namespace.

    The namespace is probed repeatedly with random query vectors; every id
    that appears among the matches and has not been seen before is recorded.

    Args:
        index: Object exposing ``query(vector=..., top_k=..., include_values=...,
            namespace=...)`` that returns a mapping with a ``'matches'`` list of
            mappings carrying an ``'id'`` key.
        number_ids: Number of distinct ids to collect. The result never holds
            more than this many ids.
        my_namespace: Namespace to probe.
        top_k: Number of matches requested per probe.
        dimension: Length of the random probe vector; must equal the index
            dimension.
        max_stale_queries: Stop after this many consecutive probes that yield
            no new id (for example when the namespace holds fewer than
            ``number_ids`` vectors). ``None`` disables the guard.

    Returns:
        list: The collected ids in discovery order; shorter than ``number_ids``
        only when the stale-query guard fired.
    """
    import warnings

    id_list = []
    seen = set()  # O(1) membership test instead of scanning id_list
    stale_queries = 0

    with tqdm(total=number_ids) as pbar:
        # Strict '<': the previous '<=' demanded number_ids + 1 ids and therefore
        # never terminated when the namespace held exactly number_ids vectors.
        while len(id_list) < number_ids:
            # Flat list of floats. Normally distributed components give probes in
            # all directions; uniform [0, 1) components all lie in the positive
            # orthant and, under a cosine metric, keep hitting similar neighbours.
            probe = np.random.standard_normal(dimension).tolist()
            response = index.query(
                vector=probe,
                top_k=top_k,
                include_values=False,  # only the ids are used
                namespace=my_namespace)

            found_new = False
            for match in response['matches']:
                matched_id = match['id']
                if matched_id in seen:
                    continue
                seen.add(matched_id)
                id_list.append(matched_id)
                pbar.update(1)
                found_new = True
                if len(id_list) >= number_ids:
                    break

            stale_queries = 0 if found_new else stale_queries + 1
            if max_stale_queries is not None and stale_queries >= max_stale_queries:
                warnings.warn(
                    "fetch_id_list stopped after %d consecutive probes without a "
                    "new id; collected %d of %d requested ids"
                    % (stale_queries, len(id_list), number_ids)
                )
                break

    return id_list

In [6]:
# pdf_id_list = fetch_id_list(index, number_ids=1500, my_namespace="Unilever-2018-2019")
# pdf_id_list[:5]

# with open('pdf_ids.txt', 'w') as file:
#     for item in pdf_id_list:
#         file.write('%s\n' % item)

In [8]:
# graph_id_list = fetch_id_list(index, number_ids=110, my_namespace="graph")
# with open('graph_ids.txt', 'w') as file:
#     for item in graph_id_list:
#         file.write('%s\n' % item)

112it [00:53,  2.09it/s]                                                        


In [9]:
# Read the saved PDF-namespace ids, one per line, from pdf_ids.txt
# (presumably written by the commented-out harvesting cell above — confirm).
with open('pdf_ids.txt', 'r') as file:
    pdf_id_list = file.read().splitlines()

In [10]:
# Read the saved graph-namespace ids, one per line, from graph_ids.txt
# (presumably written by the commented-out harvesting cell above — confirm).
with open('graph_ids.txt', 'r') as file:
    graph_id_list = file.read().splitlines()

# <font color="red">Visualize the PDF embeddings</font>

In [11]:
def split_list(lst, sublist_size):
    """Cut ``lst`` into consecutive slices of at most ``sublist_size`` items.

    The final slice holds the remainder and may be shorter; an empty ``lst``
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(lst), sublist_size):
        stop = start + sublist_size
        chunks.append(lst[start:stop])
    return chunks

In [17]:
def get_embedding_and_ids(index, id_list, my_namespace, batch_size=1000):
    """Fetch the stored vectors for ``id_list`` and return their ids and values.

    The ids are fetched in batches because one fetch request is limited in how
    many vectors it can return (1000 per the notes in this notebook).

    Args:
        index: Object exposing ``fetch(ids, namespace=...)`` that returns a
            mapping whose ``'vectors'`` entry maps each id to a mapping with a
            ``'values'`` list.
        id_list: Ids to fetch. An empty list issues no request.
        my_namespace: Namespace the ids belong to.
        batch_size: Maximum number of ids per fetch request.

    Returns:
        tuple: ``(ids, embeddings)`` where ``ids`` lists the ids in the order
        the responses returned them and ``embeddings`` is a NumPy array with
        one row per returned id. Ids missing from a response are absent from
        both.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    ids = []
    embeddings = []

    # split_list yields no batches for an empty list, so no empty fetch is sent;
    # a list shorter than batch_size comes back as a single batch.
    for batch_ids in tqdm(split_list(list(id_list), batch_size)):
        response = index.fetch(batch_ids, namespace=my_namespace)
        for vector_id, vector in response['vectors'].items():
            ids.append(vector_id)
            embeddings.append(vector['values'])

    return ids, np.array(embeddings)

In [18]:
# Fetch the stored vectors for the saved PDF ids from the 'Unilever-2018-2019' namespace.
pdf_ids, pdf_embeddings = get_embedding_and_ids (index, pdf_id_list, "Unilever-2018-2019")

100%|█████████████████████████████████████████████| 2/2 [00:53<00:00, 26.58s/it]


In [19]:
# Authenticate with Nomic, then upload the PDF embeddings to Atlas as a map.
# Each point carries its Pinecone id as its only data field.
nomic.login(nomic_api_key)
project = atlas.map_embeddings(embeddings=pdf_embeddings, data=[{'id': id} for id in pdf_ids], id_field='id')

[32m2023-06-29 15:18:03.313[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `observant-agony` in organization `rancheng0918`[0m
[32m2023-06-29 15:18:05.710[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
2it [00:03,  1.72s/it]                                                          
[32m2023-06-29 15:18:09.197[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-06-29 15:18:09.200[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-06-29 15:18:10.677[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `observant-agony` in project `observant-agony`: https://atlas.nomic.ai/map/071122d2-63ae-4cbd-875d-73393b267e0e/0bf6f061-cbbd-4688-8a45-f76e34503cf2[0m
[3

# <font color="red">Visualize the graph embeddings</font>

In [20]:
# Fetch the vectors stored in the 'graph' namespace and upload them to Atlas as their own map.
# NOTE: `project` is rebound here, replacing the value assigned for the PDF map.
graph_ids, graph_embeddings = get_embedding_and_ids (index, graph_id_list, "graph")
project = atlas.map_embeddings(embeddings=graph_embeddings, data=[{'id': id} for id in graph_ids], id_field='id')

100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.58s/it]
[32m2023-06-29 15:18:16.080[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `threatening-flatboat` in organization `rancheng0918`[0m
[32m2023-06-29 15:18:18.079[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
1it [00:01,  1.58s/it]
[32m2023-06-29 15:18:19.671[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-06-29 15:18:19.673[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-06-29 15:18:21.121[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `threatening-flatboat` in project `threatening-flatboat`: https://atlas.nomic.ai/map/6a180834-b43e-4d27-8523-08e443726486/973a49

# <font color="red">Mapping both sets of embeddings on Atlas</font>

In [21]:
def create_data_category(cat_name, id_list):
    """Build one record per id, each tagged with the same category label.

    Returns a list of ``{'category': cat_name, 'id': <id>}`` dicts in the
    order of ``id_list``.
    """
    records = []
    for item_id in id_list:
        record = {'category': cat_name, 'id': item_id}
        records.append(record)
    return records
    

In [22]:
# Stack the PDF and graph vectors row-wise and build the matching records in the same
# order (PDF first, then graph) so that each row lines up with its category and id.
all_embeddings = np.concatenate((pdf_embeddings, graph_embeddings),axis=0)
all_data = create_data_category('pdf-embedding', pdf_ids) + create_data_category('graph-embedding', graph_ids)

# Upload the combined set as one Atlas map, with 'category' passed as a colorable field.
project = atlas.map_embeddings(embeddings=all_embeddings,
                                data=all_data,
                                id_field='id',
                                colorable_fields=['category']
                                )

[32m2023-06-29 15:18:41.743[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `godly-average` in organization `rancheng0918`[0m
[32m2023-06-29 15:18:44.109[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
2it [00:03,  1.59s/it]                                                          
[32m2023-06-29 15:18:47.337[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-06-29 15:18:47.342[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-06-29 15:18:48.801[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `godly-average` in project `godly-average`: https://atlas.nomic.ai/map/9f825921-bf5c-49df-b1d2-7c38bb945c68/68939303-f484-4ed3-b511-c2665d21fcf3[0m
[32m2023

 <font size="5" color="green">The Nomic package makes maps easy to share and has built-in settings for the plot. As the combined map shows, the graph embeddings lie closer to certain parts of the PDF embedding vectors.</font>

 <font size="5" color="orange">Limitations for implementation: 1) Pinecone does not provide a query that lists the ids in a vector store, so I wrote a function that queries with random vectors to collect the id list. 2) Pinecone can fetch only 1000 embedding vectors per request. Therefore, I recommend saving the ids and embeddings before uploading them to Pinecone for deployment.</font>