In [12]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

In [3]:
# Sentence-embedding model used to encode every article
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Read the article texts; each line of the file is treated as one article.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the data files are UTF-8 encoded — confirm.
with open('../Data/AllTheData.txt', 'r', encoding='utf-8') as file:
    articles = file.read().strip().split('\n')

# Read the article names; each line of the file is treated as one name.
# Presumably line i here names line i of AllTheData.txt — verify.
with open('../Data/articleOrder.txt', 'r', encoding='utf-8') as file:
    articlesNames = file.read().strip().split('\n')

# Compute sentence embeddings for the first 2000 articles only
embeddings = model.encode(articles[0:2000])

In [14]:
# Persist the embeddings as a .npy file so they can be reloaded
# without re-running the encoder
np.save("../Data/MiniLM_Embeddings.npy", np.array(embeddings))

In [16]:
# Reload the previously saved embeddings from disk
embeddings = np.load("../Data/MiniLM_Embeddings.npy")

In [10]:
# Project the embeddings to two dimensions with t-SNE
# (random_state is fixed so repeated runs use the same seed)
tsne_results = TSNE(n_components = 2,
                    random_state = 42,
                    perplexity = 40).fit_transform(embeddings)

# Create a DataFrame with the t-SNE results and article names.
# `articlesNames` comes from the cell that reads articleOrder.txt, so that
# cell must have been run first. The names are sliced to the number of
# projected rows instead of a hard-coded 2000, so the column lengths still
# match when `embeddings` holds a different number of articles.
tsne_embeddings_df = pd.DataFrame({
    't-SNE Dimension 1': tsne_results[:, 0],
    't-SNE Dimension 2': tsne_results[:, 1],
    'Article Name': articlesNames[:len(tsne_results)]
})

# Title text, defined once and applied below
plot_title = "t-SNE Visualization of MiniLM-L6-v2 Embeddings"

# Plot the scatter plot; hovering over a point shows the article name
fig = px.scatter(tsne_embeddings_df,
                 x = 't-SNE Dimension 1',
                 y = 't-SNE Dimension 2',
                 hover_name = 'Article Name',
                 template = "plotly_dark",
                 width = 700, height = 700)

# Set the title and center it horizontally
fig.update_layout(title = {'text': plot_title,
                           'x': 0.5,
                           'xanchor': 'center'})
fig.show()