In [None]:
import nltk
import pickle
import torch
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from umap import UMAP
from urllib.parse import urlparse
from nltk.corpus import brown
from sentence_transformers import SentenceTransformer, util

In [None]:
# Download the Brown corpus via NLTK's downloader; later cells read it
# through `nltk.corpus.brown`.
nltk.download('brown')

In [None]:
# Path of the pickled cache that holds the page sections and their embeddings.
embeddings_location = '../page-embeddings.pkl'

print(f"Loading pre-computed embeddings from disc: {embeddings_location}")

# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# Only point this at a cache that was produced by a trusted pipeline.
with open(embeddings_location, "rb") as reader:
    cache_data = pickle.load(reader)

# The cache is indexed by two keys: the section records and their embeddings.
sections = cache_data['sections']
corpus_embeddings = cache_data['embeddings']

print('Corpus embeddings loaded.')
print('Corpus embedding size:', corpus_embeddings.shape)

In [None]:
# Warn (but keep going) when CUDA is not available to torch.
gpu_available = torch.cuda.is_available()
if not gpu_available:
    print("Warning: No GPU found. Please add GPU to your notebook.")

# Sentence encoder used to embed the reference passages; its maximum
# sequence length is set to 256.
bi_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
bi_encoder.max_seq_length = 256

# Paragraph view of the Brown corpus, consumed when the passages are built.
paras = brown.paras()

def joinPara(para):
    """Detokenize the first element of ``para`` into a single string.

    Args:
        para: An indexable whose first element is a sequence of word tokens.
            In this notebook it is called with items of ``brown.paras()``.

    Returns:
        The string produced by NLTK's ``TreebankWordDetokenizer`` for
        ``para[0]``.

    NOTE(review): only ``para[0]`` is used, so any further elements of the
    paragraph are ignored. Confirm this is intended, given the name.
    """
    detokenizer = nltk.TreebankWordDetokenizer()
    first_sentence = para[0]
    return detokenizer.detokenize(first_sentence)

# One detokenized passage per Brown paragraph. `paras` is iterated directly:
# wrapping it in np.asarray(..., dtype=object) first only copied the whole
# corpus into an object array without changing what joinPara receives.
passages = [joinPara(para) for para in paras]

print('Passages:', len(passages))

# Embed every passage with the bi-encoder; convert_to_tensor=True requests a
# torch tensor instead of a NumPy array.
quality_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)

print('Quality embeddings created.')
print('Quality embedding size:', quality_embeddings.shape)

In [None]:
# Cosine similarity of every corpus embedding against every reference
# (Brown) embedding. The reference embeddings are moved to the CPU first.
# NOTE(review): assumes corpus_embeddings also lives on the CPU — confirm.
cos_sims = util.cos_sim(corpus_embeddings, quality_embeddings.cpu())

# len(cos_sims) would only count the rows of the similarity matrix; numel()
# counts every pairwise similarity, which is what the message announces.
print('Number of similarities computed:', cos_sims.numel())

In [None]:
# Reduce the corpus embeddings to three components for the 3-D scatter plots.
# random_state is fixed so repeated runs use the same seed.
umap_fit = UMAP(
    n_components=3,
    n_neighbors=8,
    random_state=42,
)
embeddings_umap = umap_fit.fit_transform(corpus_embeddings)

In [None]:
# Persist the inputs and derived artefacts of the analysis in one pickle so
# they can be reloaded without recomputing the projection and similarities.
with open('/tmp/embeddings_analysis.pkl', "wb") as writer:
    analysis_payload = {
        'sections': sections,
        'embeddings': corpus_embeddings,
        'umap': embeddings_umap,
        'similarities': cos_sims,
    }
    pickle.dump(analysis_payload, writer)

In [7]:
# One row per section; nested keys are flattened using '_' as separator.
# json_normalize already returns a DataFrame, so no from_records wrapper.
df = pd.json_normalize(sections, sep='_')

# Score of a section = mean cosine similarity of its row in `cos_sims`.
df['score'] = cos_sims.numpy().mean(axis=1)

# Collapse newlines and trim the text. The result must be assigned back:
# the transformation returns a new Series and does not modify `df` in place.
df['content'] = df['content'].str.replace('\n', ' ', regex=False).str.strip()

# Host part of the page URL.
df['domain'] = df['page_url'].apply(lambda x: urlparse(x).netloc)

# The three UMAP components become the plot coordinates.
df['x'] = embeddings_umap[:, 0]
df['y'] = embeddings_umap[:, 1]
df['z'] = embeddings_umap[:, 2]

In [None]:
# Lower cut-off: the 2.5th percentile of the section scores.
score_column = df['score']
threshold = np.percentile(score_column, 2.5)

# Histogram of the scores over their full observed range, in 50 bins.
plt_range = [score_column.min(), score_column.max()]
counts, bins = np.histogram(score_column, bins=50, range=plt_range)
diagram = plt.stairs(counts, bins)

# Mark the cut-off on the histogram with a dashed red line.
plt.axvline(threshold, color='r', linestyle='--')

print('Threshold:', threshold)

In [None]:
# Flag the rows whose score lies strictly inside (threshold, -threshold).
# NOTE(review): the upper bound is the lower cut-off mirrored around zero.
# That is only a non-empty band when `threshold` is negative, and it assumes
# the scores are roughly symmetric about 0 — confirm, or switch to an
# explicit upper percentile.
upper_bound = threshold * -1
df['partition'] = (df['score'] > threshold) & (df['score'] < upper_bound)


def _scatter_3d_by(color_column):
    """Return a 3-D scatter plot of the notebook-level ``df`` at its x/y/z columns.

    Args:
        color_column: Name of the ``df`` column used to colour the markers.

    Returns:
        The plotly figure, with the marker size already set to 5.
    """
    figure = px.scatter_3d(
        df, x='x', y='y', z='z',
        color=color_column, hover_data=['content'],
        color_continuous_scale=px.colors.sequential.Viridis,
        width=800, height=800
    )
    figure.update_traces(marker_size=5)
    return figure


# Same projection three times, coloured by a different column each time.
fig_3d_tokens = _scatter_3d_by('tokens')
fig_3d_tokens.show()

fig_3d_score = _scatter_3d_by('score')
fig_3d_score.show()

fig_3d_partition = _scatter_3d_by('partition')
fig_3d_partition.show()

# Show up to 10 random rows; sample() raises if asked for more rows than exist.
df.sample(min(10, len(df)))

In [None]:
# Rank the sections by score (ascending) and split them into those inside
# and those outside the closed band [threshold, -threshold].
#
# Everything below works on plain NumPy arrays and row positions on purpose:
# indexing a pandas Series with a boolean Series re-aligns the mask by index
# label, which silently pairs each mask entry with the wrong sorted id.
scoresFrame = df['score']
score_values = scoresFrame.to_numpy()
sorted_ids = np.argsort(score_values)   # row positions, lowest score first
scores = score_values[sorted_ids]       # scores in that same sorted order

print('Threshold:', threshold)

numpy_sections = np.array(sections)

# NOTE(review): the upper bound is the lower cut-off mirrored around zero,
# which is only a non-empty band when `threshold` is negative — confirm.
upper_bound = threshold * -1
good_ids = sorted_ids[np.logical_and(threshold <= scores, scores <= upper_bound)]
bad_ids = sorted_ids[np.logical_or(threshold > scores, scores > upper_bound)]

print(len(good_ids), len(bad_ids))

# The (up to) ten lowest-scoring sections that are still inside the band.
top_ids = good_ids[0:10]
selected_sections = numpy_sections[top_ids]
selected_scores = score_values[top_ids]  # looked up by row position

# Kept under its own name so the full `df` is not overwritten and this cell
# can be re-run without re-running the cell that builds `df`.
selected_df = pd.json_normalize(selected_sections.tolist(), sep='_')
if not selected_df.empty:
    # Assigning an ndarray is positional, so no index alignment can turn the
    # scores into NaN.
    selected_df['score'] = selected_scores
    selected_df['content'] = (
        selected_df['content'].str.replace('\n', ' ', regex=False).str.strip()
    )
    selected_df['domain'] = selected_df['page_url'].apply(lambda x: urlparse(x).netloc)

# Show the selection in random order; never ask for more rows than exist.
selected_df.sample(min(10, len(selected_df)))