# 03 Dimensionality Reduction

## Imports and setup

In [1]:
# Draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle, random

import bokeh, random

from umap import UMAP

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
# Seed for the notebook's random number generators.
SEED = 2023

# Seed both global generators: the stdlib one and NumPy's legacy global
# RandomState, which `random.seed` alone leaves untouched.
random.seed(SEED)
np.random.seed(SEED)

## Load embeddings and data

In [19]:
# Unpickle the pre-computed article embeddings from disk.
with open('data/transformer_embeddings.pickle', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

# Report the shape as a quick sanity check on what was loaded.
print(f'Loaded article embeddings! Shape: {embeddings.shape}')

Loaded article embeddings! Shape: (527, 384)


In [20]:
# Read the article table (titles and summaries) from CSV.
df = pd.read_csv('data/manufacturing_articles.csv')

# Emit the confirmation message and the shape on separate lines in one call.
print(
    'Loaded pd.DataFrame with the article titles and their summaries.',
    f'Shape: {df.shape}',
    sep='\n',
)

# Preview the first two rows.
df.head(2)

Loaded pd.DataFrame with the article titles and their summaries.
Shape: (527, 4)


Unnamed: 0,link,source_url,title,summary
0,/wiki/Carbon_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Carbon arc welding,Carbon arc welding (CAW) is a process which pr...
1,/wiki/Flux_Cored_Arc_Welding,https://en.wikipedia.org/wiki/List_of_welding_...,Flux cored arc welding,<!-- \r\nNewPP limit report\r\nParsed by mw140...


## Reduce dimensions using UMAP

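UMAP (Uniform Manifold Approximation and Projection) is used here to project the 384-dimensional article embeddings down to 2 dimensions so that they can be plotted.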

In [21]:
# Pin UMAP's random state to the notebook seed; left unset, the projection
# is stochastic and the 2-D coordinates change on every run.
my_umap = UMAP(n_components=2, random_state=SEED)

# Fit on the article embeddings and return their 2-D coordinates.
umap_embeddings = my_umap.fit_transform(embeddings)

## Export data for `bulk`

In [24]:
# Pair each article title (exposed as `text`) with its two UMAP coordinates.
embeddings_2D = (
    df[['title']]
    .rename(columns={'title': 'text'})
    .assign(x=umap_embeddings[:, 0], y=umap_embeddings[:, 1])
)

embeddings_2D.head()

Unnamed: 0,text,x,y
0,Carbon arc welding,8.810158,9.205739
1,Flux cored arc welding,6.887566,5.127184
2,Gas metal arc welding,8.940374,9.374857
3,Shielding gas,8.98382,9.291036
4,Plasma arc welding,9.013934,9.13667


In [25]:
# Save
# Make sure the output folder exists first: `to_csv` does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
import os

os.makedirs("embeddings", exist_ok=True)
embeddings_2D.to_csv("embeddings/umap_2d.csv", index=False)

## Plot using `bokeh`

An alternative interactive `bokeh` plot, rendered directly in this notebook, for when `bulk` is not used.

In [26]:
# Reload the saved table from disk (replaces the in-memory `embeddings_2D`).
embeddings_2D = pd.read_csv(filepath_or_buffer="embeddings/umap_2d.csv")

In [27]:
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource, WheelZoomTool
from bokeh.io import output_notebook

# Enable Bokeh to display plots in the JupyterLab notebook
output_notebook()

# Wrap the 2-D embedding table so glyphs and tooltips can refer to its
# columns (`text`, `x`, `y`) by name.
source = ColumnDataSource(embeddings_2D)

# Create the figure. The hover tooltips are configured once, here, instead of
# being set on the figure and then overwritten on the hover tool afterwards.
p = figure(
    title='UMAP projection of article embeddings',
    x_axis_label='UMAP 1',
    y_axis_label='UMAP 2',
    tools='hover, box_zoom',
    tooltips=[
        ('Text', '@text'),
        ('X', '@x'),
        ('Y', '@y'),
    ],
)

# Plot the points. `scatter` (default marker: circle) is used because passing
# `size` to `circle` is deprecated in recent Bokeh releases.
p.scatter(x='x', y='y', source=source, size=10)

# Show the plot
show(p)