In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
import plotly_express as px

EMBEDDINGS_DIRECTORY = 'SentenceBERT_embeddings'

# make 3D plot using plotly express
# https://snrspeaks.medium.com/visualizing-1-4-millions-game-of-thrones-words-with-bert-941184dd713a


In [2]:
def construct_embedding_matrix(directory):
    """
    Construct the embedding matrix from the pieces stored as pickle files in
    the embeddings directory.
    """
    embedding_files = sorted(os.listdir(directory))
    embedding_matrix = None

    for file in embedding_files:
        piece = np.load(directory + '/' + file, allow_pickle=True)

        if embedding_matrix is None:
            embedding_matrix = piece
        else:
            embedding_matrix = np.vstack((embedding_matrix, piece))

    return embedding_matrix


def reduce_dimension(embedding_matrix):
    """
    Reduces the dimensionality of sentence embeddings to 3D using two methods,
    tSVD (aka LSA) and PCA.
    """

    tsvd = TruncatedSVD(n_components=3)
    tsvd_projection = tsvd.fit_transform(embedding_matrix)

    pca = PCA(n_components=3)
    pca_projection = pca.fit_transform(embedding_matrix)

    return tsvd_projection, pca_projection


def append_embedding_to_quote(embedding_matrix, quote_df):
    """
    Appends the embedding matrix to the quote dataframe, returning a dataframe
    with a column for each component of the vector embedding as well as the
    original quote. Note that the rows are appended simply by the order which
    they appear in the matrix and the dataframe.
    """

    embedding_df = pd.DataFrame(
        embedding_matrix,
        columns=[f'component_{i}' for i in range(embedding_matrix.shape[1])]
    )
    df = pd.concat([quote_df, embedding_df], axis=1)

    return df


In [3]:
embedding_matrix = construct_embedding_matrix(EMBEDDINGS_DIRECTORY)
# < 10 seconds

In [4]:
tsvd_3d, pca_3d = reduce_dimension(embedding_matrix)
# < 15 seconds

In [14]:
quote_df = pd.DataFrame()

for file in os.listdir('quote_data/cleaned'):
    quote_df = pd.concat([quote_df, pd.read_csv(f'quote_data/cleaned/{file}')])
quote_df = quote_df.drop(columns=['Unnamed: 0'])

quote_tsvd_df = append_embedding_to_quote(tsvd_3d, quote_df)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
quote_tsvd_df

Unnamed: 0,quote,author,component_0,component_1,component_2
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,0.355903,-0.095574,0.325896
1,You've gotta dance like there's nobody watchin...,William W. Purkey,0.395555,0.090211,0.160476
2,You know you're in love when you can't fall as...,Dr. Seuss,0.427160,0.187184,0.212613
3,A friend is someone who knows all about you an...,Elbert Hubbard,0.237631,0.128730,0.196789
4,Darkness cannot drive out darkness: only light...,Martin Luther King Jr.,0.473559,0.065948,-0.093720
...,...,...,...,...,...
499703,Being is always becoming people change and sta...,people change and stay the same.,0.496923,-0.114046,-0.045989
499704,We normally know we're getting older when the ...,unless you're a cancer survivor! Then we love ...,0.379813,-0.239512,-0.121045
499705,Cole was meticulous to a fault office scuttleb...,office scuttlebut had it that he never went ou...,0.208867,0.044685,0.008902
499706,Naivete in grownups is often charming but when...,but when coupled with vanity it is indistingui...,0.268146,-0.168866,0.190102


### tSVD

In [None]:
fig = px.scatter_3d(
    quote_df,
    x = quote_tsvd_df['component_0'],
    y = quote_tsvd_df['component_1'],
    z = quote_tsvd_df['component_2'],
    hover_data = 'quote'
)

# fig.show()

### PCA

In [None]:
fig = px.scatter_3d(
    x = pca_3d[:,0],
    y = pca_3d[:,1],
    z = pca_3d[:,2],
)

# fig.show()

### t-SNE

In [None]:
tsne = TSNE(n_components=3)
# tsne_projection = tsne.fit_transform(embedding_matrix)
# tSNE is computationally expensive! Waited 5 minutes trying to reduce whole
# dataset but was still going...

In [None]:
RANDOM_STATE = 0
sample = append_embedding_to_quote(embedding_matrix, quote_df) \
    .sample(n=1000, random_state=RANDOM_STATE)

sample.head()

In [None]:
embedding_matrix.shape

(499708, 384)

In [None]:
sample_embeddings = sample[[f'component_{i}' for i in range(384)]].to_numpy()
tsne_small_projection = tsne.fit_transform(sample_embeddings)

In [None]:
sample_tsne = append_embedding_to_quote(tsne_small_projection, quote_df.loc[sample.index])

In [None]:
sample_tsne

In [None]:
fig = px.scatter_3d(
    x = sample_tsne['0'],
    y = sample_tsne['1'],
    z = sample_tsne['2'],
)

fig.show()