In [1]:
import os
import tensorflow as tf
from tensorboard.plugins import projector
import json
import pandas as pd
import numpy as np


import numpy as np
from dotenv import load_dotenv

from tqdm import tqdm 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings

# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()
openai_key = os.getenv('OPENAI_KEY')
if openai_key is not None:
    # Re-export the key under the OPENAI_API_KEY variable name.
    os.environ['OPENAI_API_KEY'] = openai_key
else:
    # Assigning None to os.environ raises TypeError. The key is only needed to
    # (re)compute embeddings, so report the problem instead of aborting the cell.
    print("OPENAI_KEY is not set; OPENAI_API_KEY was left unchanged.")

# Save Embedding Vectors

In [2]:
def get_subfolders(directory):
    """Return the paths of all directories found beneath ``directory``.

    The tree is traversed with ``os.walk``, so nested directories at every
    depth are included. Each entry is the walk root joined with the
    directory name, listed in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, child_dirs, _files in os.walk(directory)
        for name in child_dirs
    ]

In [3]:
# Collect every directory under the documents root and display the list.
docs_root = "../docs"
subfolders = get_subfolders(docs_root)
subfolders

['../docs/OEDT', '../docs/Praxis', '../docs/Unilever']

In [4]:
# embeddings = OpenAIEmbeddings()

# chunk_dicts = {
#     "text":[],
#     "vectors":[],
#     "categories":[]
# }

# for subfolder in tqdm(subfolders):
#     my_loader = DirectoryLoader(subfolder, glob='**/*.pdf')
#     documents = my_loader.load()
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
#     docs = text_splitter.split_documents(documents)
    
#     for chunk in tqdm(docs):
#         chunk_text = chunk.page_content
#         chunk_embedded_vector = embeddings.embed_documents([chunk_text])    

#         chunk_dicts["vectors"].append(chunk_embedded_vector)
#         chunk_dicts["categories"].append(subfolder.split("/")[-1])
#         chunk_dicts["text"].append([chunk_text])

In [5]:
# with open('new_pdf_embedding.json', 'w') as f:
#     # Use json.dump to write the data to the file.
#     json.dump(chunk_dicts, f)  

# Load PDF embeddings

In [6]:
# Read the previously saved PDF embedding payload back from disk.
with open('new_pdf_embedding.json', 'r') as embedding_file:
    pdf_embedding = json.load(embedding_file)

In [7]:
# Show the top-level keys of the loaded payload.
pdf_embedding.keys()

dict_keys(['text', 'vectors', 'categories'])

In [8]:
# Sanity check: the text list and the vector list should have the same length.
tuple(len(pdf_embedding[key]) for key in ("text", "vectors"))

(8916, 8916)

# Write embedding TSV

In [9]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir = './logs'
# exist_ok=True makes the cell safe to re-run without a separate existence
# check, and still raises if a regular file is sitting at this path.
os.makedirs(log_dir, exist_ok=True)

In [10]:
# Each stored entry is a list; keep its first element as the chunk's vector.
# (Presumably every entry holds exactly one vector; confirm against the writer.)
first_vectors = [entry[0] for entry in pdf_embedding["vectors"]]
embeddings_vectors = np.array(first_vectors)

# One row per chunk, one column per embedding dimension.
embeddings_df = pd.DataFrame(embeddings_vectors)
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,-0.007892,0.010565,-0.001894,-0.017092,-0.008840,0.041859,-0.026363,-0.001982,-0.009387,-0.016948,...,0.040220,0.005506,0.010853,-0.001635,0.003554,0.000572,-0.003716,0.008733,-0.016632,-0.011104
1,-0.021001,0.003865,0.012621,-0.037704,0.003091,0.040578,-0.016131,0.003475,-0.010166,-0.029750,...,0.022717,-0.003991,0.029331,-0.008658,-0.000202,-0.023568,0.002789,0.002653,-0.023903,-0.013996
2,-0.025848,-0.010900,0.002505,-0.037905,-0.007742,0.033077,-0.018114,0.012375,-0.015122,-0.024258,...,0.000492,0.004236,0.028480,-0.012071,-0.030330,-0.016495,-0.008421,0.004879,-0.009361,-0.018736
3,-0.026359,0.021031,0.010216,-0.027475,-0.002247,0.028548,-0.016262,0.005056,-0.025006,-0.012161,...,0.006792,0.003689,0.009867,-0.013716,-0.038186,-0.005471,0.007461,-0.010544,-0.012866,-0.006788
4,-0.016307,0.001086,0.018129,-0.025702,-0.020065,0.034766,-0.011144,0.005895,-0.021815,-0.017756,...,0.008068,0.001029,0.012966,-0.016709,-0.032041,-0.008139,0.012270,0.001613,-0.019276,-0.024554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8911,-0.018756,-0.024100,0.000894,-0.017991,-0.009190,0.025449,-0.010195,0.010169,0.003291,-0.009877,...,-0.024982,-0.007912,0.012115,0.015734,-0.011077,-0.007017,-0.002281,0.003424,-0.001158,-0.002610
8912,-0.013933,-0.007284,0.009415,-0.034827,-0.015614,0.002233,-0.011545,0.008356,-0.022190,-0.029852,...,-0.021357,0.009540,0.009904,-0.012213,0.001011,-0.023130,0.007046,0.008707,0.001844,0.004367
8913,-0.005401,-0.025280,-0.016100,-0.016814,-0.008784,0.001791,-0.007190,0.012746,0.004736,-0.026021,...,-0.002502,-0.023918,0.051910,0.009531,-0.000745,-0.014314,0.001261,0.007937,0.002874,-0.021484
8914,-0.016402,-0.053440,-0.010729,-0.026489,-0.006616,0.010004,-0.017865,0.008008,0.007643,-0.008047,...,0.029621,-0.021330,0.033625,-0.009048,-0.026233,0.012122,0.018211,-0.016042,0.006212,-0.002461


In [11]:
# (number of chunks, embedding dimension)
embeddings_df.shape

(8916, 1536)

In [12]:
# Write the raw vectors as tab-separated values with no header row and no index column.
embeddings_df.to_csv(
    os.path.join(log_dir, 'new_embeddings.tsv'),
    sep='\t',
    index=False,
    header=False,
)

# Write metadata TSV

In [13]:
# paragraphs = ["paragraph"+str(i) for i in range(100)] # And a placeholder for your real paragraphs
# metadata_df = pd.DataFrame(paragraphs, columns=['paragraph'])
# metadata_df.to_csv('example_metadata.tsv', sep='\t', index=False, header=False)

In [14]:
# One description per chunk: "<category>-<position>: <chunk text with newlines removed>".
paragraphs = [
    f"{category}-{position}: " + chunk[0].replace("\n", "")
    for position, (chunk, category) in enumerate(
        zip(pdf_embedding["text"], pdf_embedding["categories"])
    )
]

# Two-column metadata: the description ("ID") and the category used as "label".
metadata_df = pd.DataFrame(paragraphs, columns=['ID']).assign(
    label=pdf_embedding["categories"]
)
metadata_df

Unnamed: 0,ID,label
0,OEDT-0: TO/WCM/CGA1HT EXPLORATION WELL HP-HT E...,OEDT
1,OEDT-1: This report deals with the operation o...,OEDT
2,OEDT-2: 20/11/2014 - 23/07/2015 - EXP 1- PROSP...,OEDT
3,OEDT-3: Running 30” x 36” CP – DDR 23 to 24 Ce...,OEDT
4,OEDT-4: 3.7 PLUG & ABANDON Objective Result 8-...,OEDT
...,...,...
8911,"Unilever-8911: PROPERTY, PLANT AND EQUIPMENT W...",Unilever
8912,Unilever-8912: INFORMATION PRESENTED Unless ot...,Unilever
8913,Unilever-8913: IRAN-RELATED REQUIRED DISCLOSUR...,Unilever
8914,Unilever-8914: the Government of Iran and affi...,Unilever


In [15]:
# Write the metadata with its header row (the default) and without the index column.
metadata_df.to_csv(
    os.path.join(log_dir, 'new_metadata.tsv'),
    sep='\t',
    index=False,
)


# Do the same thing for graph embeddings

In [11]:
# Read the previously saved graph embedding payload back from disk.
with open('graph_embedding.json', 'r') as graph_file:
    graph_embedding = json.load(graph_file)

In [12]:
# Keep the first element of every stored entry as that graph item's vector.
# (Presumably each entry wraps a single vector; confirm against the writer.)
graph_embeddings_vectors = np.array([v[0] for v in graph_embedding["vectors"]])

# Build the frame from the *graph* vectors. The previous version passed the PDF
# array (`embeddings_vectors`), so graph_embeddings.tsv received PDF data.
graph_embeddings_df = pd.DataFrame(graph_embeddings_vectors)
graph_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.006073,-0.014151,-0.016932,-0.044704,-0.010930,0.014345,-0.013077,0.006875,-0.012618,-0.005585,...,-0.006539,0.004366,0.043333,0.001566,0.019881,-0.031950,0.007379,0.023995,0.011570,-0.005960
1,-0.017852,-0.031290,-0.017589,-0.022975,-0.008571,0.008217,-0.012814,0.004956,-0.013727,-0.001736,...,0.003573,0.002076,0.028768,0.000390,0.014489,-0.024670,0.017878,0.018391,0.011888,-0.026088
2,-0.010138,-0.019535,-0.003043,-0.025504,-0.013168,0.020536,0.010067,0.012427,-0.010593,-0.005375,...,0.004162,-0.003450,0.018507,-0.003872,-0.007049,-0.009442,0.010613,0.010281,0.008389,-0.023502
3,-0.005506,-0.003964,-0.020313,-0.035135,-0.019960,0.016812,-0.021358,0.010213,-0.015250,-0.003381,...,0.004722,0.020363,0.015893,-0.008866,-0.020615,-0.030854,0.012209,-0.000382,-0.005563,-0.025401
4,-0.005409,-0.009122,-0.008411,-0.041769,-0.018452,0.007182,-0.015062,-0.001724,-0.018530,-0.005470,...,-0.010048,-0.006450,0.015722,-0.004451,0.007596,-0.024676,0.013910,0.018167,0.002755,-0.030175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,-0.018756,-0.024100,0.000894,-0.017991,-0.009190,0.025449,-0.010195,0.010169,0.003291,-0.009877,...,-0.024982,-0.007912,0.012115,0.015734,-0.011077,-0.007017,-0.002281,0.003424,-0.001158,-0.002610
1901,-0.013933,-0.007284,0.009415,-0.034827,-0.015614,0.002233,-0.011545,0.008356,-0.022190,-0.029852,...,-0.021357,0.009540,0.009904,-0.012213,0.001011,-0.023130,0.007046,0.008707,0.001844,0.004367
1902,-0.005390,-0.025211,-0.016079,-0.016913,-0.008781,0.001911,-0.007299,0.012777,0.004817,-0.026071,...,-0.002557,-0.023953,0.051771,0.009469,-0.000669,-0.014491,0.001305,0.007920,0.002900,-0.021479
1903,-0.027783,-0.037841,0.002175,-0.019115,-0.007680,0.014788,-0.027627,0.001157,0.008460,-0.024703,...,0.039608,-0.026081,0.034358,-0.006699,-0.031370,-0.007212,0.029316,-0.023898,0.008447,-0.008551


In [13]:
# Write the graph vectors as tab-separated values with no header row and no index column.
graph_embeddings_df.to_csv(
    os.path.join(log_dir, 'graph_embeddings.tsv'),
    sep='\t',
    index=False,
    header=False,
)

In [14]:
# Prefix every graph text with its position so each entry is uniquely identifiable.
infos = [
    f"Node(Edge)_id{position}: " + label_text
    for position, label_text in enumerate(graph_embedding["text"])
]

# Every graph entry gets the same constant label, "Graph".
graph_metadata_df = pd.DataFrame(infos, columns=['ID']).assign(label="Graph")
graph_metadata_df.to_csv(
    os.path.join(log_dir, 'metadata2.tsv'),
    sep='\t',
    index=False,
)
graph_metadata_df

Unnamed: 0,ID,label
0,Node(Edge)_id0: Digitising_R&D,Graph
1,Node(Edge)_id1: 4._Winning_with_people_,Graph
2,Node(Edge)_id2: Alignment_with_Product_lifecycle,Graph
3,Node(Edge)_id3: supply_chain_improvement_project,Graph
4,Node(Edge)_id4: 7._Consumer_insight_,Graph
...,...,...
114,Node(Edge)_id114: enables,Graph
115,Node(Edge)_id115: peer_of,Graph
116,Node(Edge)_id116: depends_on,Graph
117,Node(Edge)_id117: operated_by,Graph


# Visualize all embedding vectors

In [15]:
# Stack the PDF vectors on top of the graph vectors (row-wise).
all_embedding_vectors = np.concatenate(
    [embeddings_vectors, graph_embeddings_vectors],
    axis=0,
)
all_embeddings_df = pd.DataFrame(all_embedding_vectors)

# Same TSV layout as the individual embedding files: no header row, no index column.
all_embeddings_df.to_csv(
    os.path.join(log_dir, 'all_embeddings.tsv'),
    sep='\t',
    index=False,
    header=False,
)

(2024, 1536)

In [16]:
# Combine the metadata in the same order as the stacked vectors: PDF rows, then graph rows.
all_metadata_df = pd.concat([metadata_df, graph_metadata_df])
all_metadata_df.to_csv(
    os.path.join(log_dir, 'metadata3.tsv'),
    sep='\t',
    index=False,
)

In [17]:
# Wrap each embedding matrix in a tf.Variable so it can be stored in a checkpoint.
embeddings1 = tf.Variable(embeddings_vectors, name='var1')
embeddings2 = tf.Variable(graph_embeddings_vectors, name='var2')
all_embeddings = tf.Variable(all_embedding_vectors, name='var3')

# The keyword names used here (var1/var2/var3) are the ones the tensor names
# below refer to.
checkpoint = tf.train.Checkpoint(var1=embeddings1, var2=embeddings2, var3=all_embeddings)
checkpoint.save(os.path.join(log_dir, 'var.ckpt'))

# Pair every checkpointed variable with the metadata file written into log_dir
# by the earlier cells. The PDF metadata is saved as 'new_metadata.tsv'; the
# previous 'metadata.tsv' entry pointed at a file this notebook never writes.
config = projector.ProjectorConfig()
for checkpoint_key, metadata_file in (
    ('var1', 'new_metadata.tsv'),
    ('var2', 'metadata2.tsv'),
    ('var3', 'metadata3.tsv'),
):
    embedding = config.embeddings.add()
    embedding.tensor_name = checkpoint_key + '/.ATTRIBUTES/VARIABLE_VALUE'
    embedding.metadata_path = metadata_file

projector.visualize_embeddings(log_dir, config)


In [16]:
# tensorboard --logdir=<Your Path>/PS-Chat/notebooks/logs/