In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector
from sentence_transformers import SentenceTransformer




In [2]:
# Load the proposals table from disk; the CSV is decoded as Latin-1.
proposals = pd.read_csv(
    "dataset.csv",
    encoding="latin-1",
)

In [3]:
# Stitch problem and solution into one text per proposal.
# Missing values are replaced with "" first: concatenating a str with NaN
# yields NaN, which would put a float (not a string) into the list of
# sentences that is later passed to the encoder.
proposals["problem_solution"] = (
    "Problem: " + proposals["problem"].fillna("")
    + "\nSolution: " + proposals["solution"].fillna("")
)

In [4]:
# Turn each stitched problem/solution text into an embedding vector.
# The texts are pulled out of the frame as a plain Python list, then fed
# to the pretrained 'all-MiniLM-L6-v2' sentence encoder.
sentences = proposals["problem_solution"].to_list()

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [5]:
# Inspect the array returned by model.encode (shown as the cell's output).
embeddings

array([[-0.05735937,  0.01484988, -0.00389303, ..., -0.03956553,
         0.09735272, -0.03836142],
       [-0.07917185,  0.11348645,  0.0509224 , ..., -0.0411661 ,
        -0.04118628,  0.02412531],
       [-0.05354699,  0.05007074, -0.01066663, ..., -0.12300329,
        -0.0761971 ,  0.00527502],
       ...,
       [-0.02701267,  0.01422018,  0.00639625, ..., -0.03266954,
         0.00028772, -0.09036349],
       [-0.08481316,  0.08431742,  0.03225553, ..., -0.07002679,
         0.02056553,  0.03197028],
       [-0.05189255,  0.02621674, -0.02884698, ..., -0.14218749,
        -0.03502588, -0.00576854]], dtype=float32)

In [6]:
# The first axis of the embedding array has one entry per encoded proposal,
# so its length is the proposal count.
num_proposals = len(embeddings)
print(f"There are {num_proposals} proposals.")

There are 1300 proposals.


In [7]:
# Set up a logs directory, so Tensorboard knows where to look for files.
# NOTE(review): this is an absolute path at the filesystem root and the
# "imdb-example" name does not match this dataset; consider a project-relative
# directory. Left unchanged here because the TensorBoard cell reads `log_dir`.
log_dir='/logs/imdb-example/'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, avoiding the check-then-create race of os.path.exists().
os.makedirs(log_dir, exist_ok=True)

# Save labels separately, one per line: "Proposal 1" .. "Proposal N".
# The encoding is pinned so the file does not depend on the platform default.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    f.writelines(f"Proposal {i}\n" for i in range(1, num_proposals + 1))

# Save embedding as a checkpoint
weights = tf.Variable(embeddings)
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Relative to log_dir, where metadata.tsv was written above.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [8]:
# Draw a random port for TensorBoard from [8000, 9999); the upper bound is exclusive.
port = np.random.randint(low=8000, high=9999)
print(f"Running on port {port}")

Running on port 8576


In [9]:
# Load the TensorBoard notebook extension, then start TensorBoard pointed at
# the log directory and port set in the earlier cells ($name interpolates the
# Python variable of that name into the magic's arguments).
%load_ext tensorboard
%tensorboard --logdir=$log_dir --port=$port