In [1]:
import os 
from dotenv import load_dotenv
import pinecone
import numpy as np
from nomic import atlas
import nomic
from tqdm import tqdm 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import DirectoryLoader
import json 

# Load settings from a local .env file into the process environment.
load_dotenv()

# Mirror the project's OPENAI_KEY setting into OPENAI_API_KEY.
# Only copy it when it is actually set: assigning None into os.environ raises
# TypeError, and the key is only needed by the (commented-out) embedding cells,
# so the cached-embedding workflow should still run without it.
openai_key = os.getenv('OPENAI_KEY')
if openai_key is not None:
    os.environ['OPENAI_API_KEY'] = openai_key

# Remaining service settings; each is None when the variable is not defined.
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')
nomic_api_key = os.getenv('NOMIC_KEY')

# Pinecone connection settings bundled into one dict.
pinecone_config = {
    "api_key": pinecone_api_key,
    "env_name": pinecone_env_name,
    "index_name": pinecone_index_name,
}


  from tqdm.autonotebook import tqdm


# Vectors Embedding

<font size=4 color=blue> This time I compute the embedding vectors locally rather than fetching them from Pinecone, because a cloud vector database limits how vectors can be fetched (bandwidth and key encryption). For visualization and experimentation purposes, it is suggested to embed and save the vectors locally before upserting them to the vector database.</font>

In [2]:
# Uncomment on first use: load every PDF found under ../docs and split the
# loaded documents into overlapping chunks. The resulting `docs` list is what
# the (commented-out) embedding loop iterates over.
# pdf_directory = "../docs"
# my_loader = DirectoryLoader(pdf_directory, glob='**/*.pdf')
# documents = my_loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
# docs = text_splitter.split_documents(documents)


In [3]:
# #### Uncomment for first time uses
# Embeds each chunk in `docs` with OpenAIEmbeddings and collects the text, the
# embedding result and a fixed category label in three parallel lists.
# Each call embeds a one-element list and each text is stored as [chunk_text];
# the cells that load this data index the text with [0] and squeeze axis 1 of
# the vectors (presumably because each result wraps a single vector — confirm).
# embeddings = OpenAIEmbeddings()
# chunk_dicts = {
#     "text":[],
#     "vectors":[],
#     "categories":[]
# }

# for chunk in tqdm(docs):
#     chunk_text = chunk.page_content
#     chunk_embedded_vector = embeddings.embed_documents([chunk_text])    

#     chunk_dicts["vectors"].append(chunk_embedded_vector)
#     chunk_dicts["categories"].append("pdf_chunk_embeddings")
#     chunk_dicts["text"].append([chunk_text])
    

In [4]:
# Uncomment on first use: write `chunk_dicts` to pdf_embedding.json, the file
# that the next cell reads back.
# with open('pdf_embedding.json', 'w') as f:
#     # Use json.dump to write the data to the file.
#     json.dump(chunk_dicts, f)

In [5]:
# Read the locally saved PDF chunk embeddings back from disk.
with open('pdf_embedding.json', 'r') as pdf_json_file:
    # Parse the whole file content as JSON.
    pdf_embedding = json.loads(pdf_json_file.read())

In [6]:
# Authenticate against Nomic before uploading anything to Atlas.
nomic.login(nomic_api_key)

# One string id per stored PDF text, in storage order.
id_list = range(len(pdf_embedding['text']))

# Drop the singleton axis 1 of the stored vectors, and index each stored text
# with [0] to get the plain string for the record.
project = atlas.map_embeddings(
    embeddings=np.squeeze(np.array(pdf_embedding["vectors"]), axis=1),
    data=[
        {'id': str(idx), 'text': text[0]}
        for idx, text in zip(id_list, pdf_embedding['text'])
    ],
    id_field='id',
)

[32m2023-07-03 16:58:05.494[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `inexpensive-numismatist` in organization `rancheng0918`[0m
[32m2023-07-03 16:58:09.338[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
2it [00:03,  1.75s/it]                                                          
[32m2023-07-03 16:58:12.880[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-07-03 16:58:12.881[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-07-03 16:58:14.144[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `inexpensive-numismatist` in project `inexpensive-numismatist`: https://atlas.nomic.ai/map/8edb59a4-c1e5-4c8e-afa4-73cb83fecd7e/53bdc26a-007a-43d7-b

# Graph Embeddings

In [7]:
from neo4j import GraphDatabase

# Neo4j connection settings, read from the environment in one pass;
# each value is None when its variable is not defined.
neo4j_url, neo4j_user, neo4j_password = (
    os.getenv(env_name)
    for env_name in ('NEO4J_URL', 'NEO4J_USER', 'NEO4J_PASSWORD')
)

In [8]:
# Uncomment on first use: query Neo4j for every directed relationship and
# record, per relationship, the start/end node names, the first label of each
# node and the relationship type. The unique node names, node labels and edge
# types are then collected into sets for the (commented-out) embedding cell.
# driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))
# # Divide the graph into trunks
# def get_pair_nodes (tx):
#     pairs_of_nodes = []
#     for record in tx.run("MATCH (a)-[r]->(b) RETURN labels(a), a.name, type(r), labels(b), b.name"):
#         pair_node = {
#             "start_node_name": record["a.name"],
#             "start_node_label":record["labels(a)"][0],
#             "end_node_name": record["b.name"],
#             "end_node_label":record["labels(b)"][0],
#             "edge": record["type(r)"]
#         }
#         pairs_of_nodes.append(pair_node)
#     return pairs_of_nodes

# with driver.session() as session:
#     pairs_of_nodes = session.execute_read(get_pair_nodes)
    
# driver.close()

# node_names = set()
# node_types = set()
# edge_types = set()
# for p_n in pairs_of_nodes:
#     node_names.add(p_n["start_node_name"])
#     node_names.add(p_n["end_node_name"])
    
#     node_types.add(p_n["start_node_label"])
#     node_types.add(p_n["end_node_label"])
#     edge_types.add(p_n["edge"])

In [9]:
# Uncomment on first use: embed every node name, node label and edge type and
# save them, with a category label per entry, to graph_embedding.json.
# NOTE(review): this relies on `embeddings`, which is only created in the
# commented-out PDF embedding cell — that cell must be run first.
# node_dicts = {
#     "text":[],
#     "vectors":[],
#     "categories":[]
    
# }

# for n in node_names:
#     node_dicts["text"].append(n)
#     node_dicts["vectors"].append(embeddings.embed_documents([n]))
#     node_dicts["categories"].append("node_name (graphs)")
    
# for n in node_types:
#     node_dicts["text"].append(n)
#     node_dicts["vectors"].append(embeddings.embed_documents([n]))
#     node_dicts["categories"].append("node_type (graphs)")
    
# for e in edge_types:
#     node_dicts["text"].append(e)
#     node_dicts["vectors"].append(embeddings.embed_documents([e]))
#     node_dicts["categories"].append("edge_type (graphs)")
    
# with open('graph_embedding.json', 'w') as f:
#     # Use json.dump to write the data to the file.
#     json.dump(node_dicts, f)  
    

In [10]:
# Read the locally saved graph (node/edge) embeddings back from disk.
with open('graph_embedding.json', 'r') as graph_json_file:
    # Parse the whole file content as JSON.
    graph_embedding = json.loads(graph_json_file.read())

In [11]:
# Authenticate against Nomic before uploading anything to Atlas.
nomic.login(nomic_api_key)

# One integer id per stored graph text, in storage order.
id_list = range(len(graph_embedding['text']))

# Drop the singleton axis 1 of the stored vectors; graph texts are used as-is.
project = atlas.map_embeddings(
    embeddings=np.squeeze(np.array(graph_embedding["vectors"]), axis=1),
    data=[
        {'id': idx, 'text': text}
        for idx, text in zip(id_list, graph_embedding['text'])
    ],
    id_field='id',
)

[32m2023-07-03 16:58:17.918[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `rabid-sheet` in organization `rancheng0918`[0m
[32m2023-07-03 16:58:20.082[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
1it [00:01,  1.97s/it]
[32m2023-07-03 16:58:22.050[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-07-03 16:58:22.051[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-07-03 16:58:23.366[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `rabid-sheet` in project `rabid-sheet`: https://atlas.nomic.ai/map/dca0f7ff-eff0-405b-9eef-0995f388d58a/2255d1f2-3e34-405a-b870-0542d6c5da07[0m
[32m2023-07-03 16:58:23.367[0m | [1mINFO    [0m | [36mnomic.atlas[0

# View PDF embedding and Graph embedding in the same plot

In [12]:
# stack pdf and graph embeddings
# Remove the singleton axis 1 from each stored vector collection first.
pdf_embedding_vectors = np.array(pdf_embedding["vectors"]).squeeze(axis=1)
graph_embedding_vectors = np.array(graph_embedding["vectors"]).squeeze(axis=1)

# PDF rows come first, graph rows after them.
all_embeddings_vectors = np.vstack([pdf_embedding_vectors, graph_embedding_vectors])
all_embeddings_vectors.shape

(2024, 1536)

In [14]:
# stack data
# Build one metadata record per embedding row: PDF chunks first, then graph
# entries, matching the row order of the stacked embedding matrix.
num_pdf = len(pdf_embedding['text'])
num_graph = len(graph_embedding['text'])

# Stored PDF texts are indexed with [0] to get the plain string.
id_list = range(num_pdf)
pdf_data = [
    {'id': idx, 'text': text[0], 'category': cat}
    for idx, text, cat in zip(id_list, pdf_embedding['text'], pdf_embedding['categories'])
]

# Graph ids continue after the PDF ids so every id stays unique.
# Each graph record must carry its category too: the combined map colours
# points by 'category', and records without the field would have no category.
id_list = range(num_pdf, num_pdf + num_graph)
graph_data = [
    {'id': idx, 'text': text, 'category': cat}
    for idx, text, cat in zip(id_list, graph_embedding['text'], graph_embedding['categories'])
]

all_data = pdf_data + graph_data

In [15]:
# Upload the combined PDF + graph embeddings as a single Atlas map and let
# the points be coloured by their 'category' field.
project = atlas.map_embeddings(
    data=all_data,
    embeddings=all_embeddings_vectors,
    id_field='id',
    colorable_fields=['category'],
)

[32m2023-07-03 16:59:07.971[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `foamy-armchair` in organization `rancheng0918`[0m
[32m2023-07-03 16:59:09.993[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
2it [00:03,  1.73s/it]                                                          
[32m2023-07-03 16:59:13.471[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-07-03 16:59:13.471[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-07-03 16:59:14.728[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `foamy-armchair` in project `foamy-armchair`: https://atlas.nomic.ai/map/1ce92be5-155a-4f0a-af32-111e574b8b19/188ec58d-4a59-42b4-9157-ef81abb4d198[0m
[32m2