In [1]:
import torch
import pandas as pd
import torch
import cuml
import rmm
import numpy as np

from tqdm.auto import tqdm
from pathlib import Path
from itertools import islice
from boltons.iterutils import chunked_iter

from sentence_transformers import SentenceTransformer

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old aliases. Use whichever name this installation
# ships; fall back to the legacy name so older environments behave as before.
_MUTED_STYLE = (
    'seaborn-v0_8-muted'
    if 'seaborn-v0_8-muted' in mpl.style.available
    else 'seaborn-muted'
)
mpl.style.use(_MUTED_STYLE)
sns.set(style='whitegrid')

In [4]:
# Re-create the RMM allocator with managed memory switched on, then display
# whether RMM reports itself as initialized.
# NOTE(review): presumably managed memory is enabled so the 17M-row arrays can
# exceed GPU memory — confirm against the RMM docs.
rmm.reinitialize(managed_memory=True)
rmm.is_initialized()

True

In [5]:
# Record the installed PyTorch build string in the notebook output.
torch.__version__

'1.11.0+cu113'

In [6]:
# Display whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

In [7]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Echo the selected device to confirm where the model will be placed.
device

device(type='cuda')

In [11]:
# Load the 'all-MiniLM-L6-v1' checkpoint by name; it is moved to the selected
# device and used for encoding in the cells below.
model = SentenceTransformer('all-MiniLM-L6-v1')

In [12]:
# Move the model onto the selected device. Half precision is only applied on
# CUDA: the device cell allows a CPU fallback, and fp16 inference is poorly
# supported on CPU (missing or slow kernels in many PyTorch builds), so the
# CPU path keeps the model in its original precision.
model = model.to(device)
if device.type == 'cuda':
    model = model.half()

In [13]:
# NOTE(review): `df` is not defined in any cell visible in this notebook
# (execution counts jump from 8 to 11) — the source frame must be loaded
# before this cell for a fresh-kernel run to work.
# Shuffle every row; the fixed seed keeps the resulting row order reproducible.
pdf = df.sample(frac=1, random_state=42)

In [14]:
# Embed every `learning_outcome` string, in pdf's row order, and get the result
# back as a NumPy array. The progress bar is kept on because the run is long.
embeds = model.encode(
    sentences=pdf['learning_outcome'].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/539609 [00:00<?, ?it/s]

In [15]:
# Shape of the embedding matrix returned by model.encode.
embeds.shape

(17267457, 384)

In [16]:
# Persist the shuffled frame so its row order can be reloaded alongside the
# saved embeddings.
pdf.to_parquet('./data/minilm-17m.parquet')

In [17]:
# Save the embedding matrix; its rows follow the order pdf had when encoded.
np.save('./data/minilm-17m.npy', embeds)

# UMAP

In [14]:
# Reload the shuffled frame written to './data/minilm-17m.parquet' earlier in
# the notebook.
pdf = pd.read_parquet('./data/minilm-17m.parquet')

In [8]:
# Reload the embedding matrix written to './data/minilm-17m.npy' earlier in the
# notebook (the whole array is read into memory).
embeds = np.load('./data/minilm-17m.npy')

In [9]:
# Shrink the embeddings to 64 columns with a Gaussian random projection; the
# reduced matrix is what gets fed to UMAP further down.
projector = cuml.GaussianRandomProjection(n_components=64)
embeds_small = projector.fit_transform(embeds)

In [10]:
# Confirm the shape of the projected matrix.
embeds_small.shape

(17267457, 64)

In [10]:
# Cache the projected embeddings on disk.
np.save('./data/minilm-17m-64.npy', embeds_small)

In [11]:
# UMAP hyper-parameters, gathered in one place so they are easy to scan and tweak.
umap_params = dict(
    n_neighbors=50,
    n_epochs=1000,
    negative_sample_rate=20,
    verbose=True,
)
reducer = cuml.UMAP(**umap_params)

# Fit on the projected embeddings and keep the resulting layout coordinates.
x = reducer.fit_transform(embeds_small)

[D] [20:26:59.177356] ../src/umap/runner.cuh:102 n_neighbors=50
[D] [20:26:59.177660] ../src/umap/runner.cuh:124 Calling knn graph run
[D] [21:48:30.812009] ../src/umap/runner.cuh:130 Done. Calling fuzzy simplicial set
[D] [21:48:32.196791] ../src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [21:48:33.174996] ../src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.0793886, 0.169634, 0.214824, 0.0461171, 0.0355322, 0.05919, 0.155336, 0.215529, 0.170407, 0.265723, 0.183086, 0.212213, 0.0292059, 0.162507, 0.15833, 0.0492306, 0.018961, 0.015694, 0.0989981, 0.335043, 0.0444186, 0.020863, 0.139459, 0.0419205, 0.0790405 ]

[D] [21:48:33.175946] ../src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.226963, 0.0902719, 1.19209e-07, 0.164198, 0.300329, 0.0577141, 0.119288, 1.19209e-07, 4.76837e-07, 3.57628e-07, 0.0263783, 2.38419e-07, 0.3058, 0.0420145, 2.38419e-07, 0.349005, 0.330844, 0.139198, 0.138221, 2.38419e-07, 0.190082, 0.353538, 0.0955189, 0.142969, 0.327571 ]

[D] [21:48:33

In [12]:
# Shape of the UMAP output.
x.shape

(17267457, 2)

In [15]:
# Attach the two UMAP output columns to the frame as 'x' and 'y'.
# NOTE(review): relies on pdf's rows being in the same order as the embedding
# rows that produced `x` — confirm the parquet/npy pair was written together.
pdf['x'], pdf['y'] = x[:, 0], x[:, 1]

In [16]:
# Save the frame with the 'x'/'y' layout columns added.
pdf.to_parquet('./data/minilm-17m-umap.parquet')

In [None]:
# Static overview of the full layout: one tiny, borderless dot per row of pdf,
# drawn on a fixed [-20, 20] window in both directions.
fig, ax = plt.subplots(figsize=(30, 30), dpi=200)
ax.set_xlim(-20, 20)
ax.set_ylim(-20, 20)

ax = sns.scatterplot(
    data=pdf,
    x='x',
    y='y',
    s=0.2,
    alpha=0.95,
    linewidth=0,
    ax=ax,
)

In [None]:
# Same layout as a static figure, but with points coloured by the 'field'
# column using the 'tab20b' palette; the legend is suppressed.
fig, ax = plt.subplots(figsize=(30, 30), dpi=200)
ax.set_xlim(-20, 20)
ax.set_ylim(-20, 20)

ax = sns.scatterplot(
    data=pdf,
    x='x',
    y='y',
    hue='field',
    palette='tab20b',
    legend=None,
    s=0.2,
    alpha=0.8,
    linewidth=0,
    ax=ax,
)

In [20]:
# Turn off Altair's row cap so the sampled frame below can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [21]:
# Draw a reproducible subsample for the interactive chart. The size is clamped
# to the frame length so the cell also works on frames with fewer than 20,000
# rows, and the fixed seed keeps the chart stable between runs.
small_pdf = pdf.sample(n=min(20_000, len(pdf)), random_state=42)

In [22]:
# Build the hover text "<field> | <learning_outcome>" for every sampled row,
# using column-wise string concatenation instead of a per-row lambda.
small_pdf['tooltip'] = (
    small_pdf['field'].astype(str)
    + ' | '
    + small_pdf['learning_outcome'].astype(str)
)

In [23]:
# 1000x1000 point chart of the sampled layout: position from 'x'/'y', colour by
# 'field', hover text from the 'tooltip' column.
chart = (
    alt.Chart(small_pdf, width=1000, height=1000)
    .mark_point(size=5)
    .encode(
        x='x',
        y='y',
        color='field',
        tooltip='tooltip',
    )
)

In [None]:
# Display the chart via its .interactive() variant.
chart.interactive()