In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np

time: 411 ms (started: 2023-05-28 16:05:35 -07:00)


This notebook carries out some (potentially time-consuming) data preparation for the `tfidf-renorm.ipynb` notebook. It downloads the [20 Newsgroups dataset](http://qwone.com/~jason/20Newsgroups/) and [ICLR data](https://github.com/dkobak/iclr-tsne/blob/main/iclr-tsne.ipynb), applies different TF-IDF settings (either L1 or L2 norm, and using linear or sublinear scaling), then runs a truncated SVD and exports the results for later visualization.

In [17]:
from sklearn.decomposition import TruncatedSVD
import sklearn.datasets
import sklearn.feature_extraction.text

from drnb.io import write_pickle
from drnb.log import log


def get_20ng(normalize=True):
    """Fetch the vectorized 20 Newsgroups dataset for ``subset="all"``.

    Parameters
    ----------
    normalize : bool, default True
        Forwarded to ``fetch_20newsgroups_vectorized``. The default is the
        same as scikit-learn's own default, so calling ``get_20ng()`` returns
        exactly what it did before this parameter existed.

    Returns
    -------
    The object returned by ``fetch_20newsgroups_vectorized``. This notebook
    reads its ``data``, ``target`` and ``target_names`` attributes.
    """
    # NOTE(review): with normalize=True scikit-learn scales every document
    # vector to unit norm *before* this notebook applies TF-IDF. That cancels
    # out for linear tf followed by row normalization, but it does change the
    # sublinear (1 + log tf) results -- confirm this is intended, or pass
    # normalize=False to start from raw counts.
    return sklearn.datasets.fetch_20newsgroups_vectorized(
        subset="all", normalize=normalize
    )


def tfidf(data, norm="l1", sublinear=False):
    """Apply a TF-IDF transform to an already-vectorized term matrix.

    Parameters
    ----------
    data
        Matrix handed to ``TfidfTransformer.fit_transform``.
    norm
        Row normalization to apply (``"l1"`` or ``"l2"``). Any falsy value
        (for example ``""``) switches normalization off.
    sublinear : bool
        Forwarded as ``sublinear_tf``.

    Returns
    -------
    The result of ``TfidfTransformer.fit_transform(data)``.
    """
    # A falsy norm is mapped to None; anything else is passed through as-is.
    row_norm = norm if norm else None
    transformer = sklearn.feature_extraction.text.TfidfTransformer(
        norm=row_norm, sublinear_tf=sublinear
    )
    return transformer.fit_transform(data)


def ng20_tfidf(norm="l1", sublinear=False):
    """Fetch 20 Newsgroups and return its TF-IDF-transformed feature matrix.

    ``norm`` and ``sublinear`` are passed straight through to ``tfidf``.
    """
    ng20_features = get_20ng().data
    return tfidf(ng20_features, norm=norm, sublinear=sublinear)


def tsvd(data, n_components, algorithm="arpack", random_state=None):
    """Reduce ``data`` with a truncated SVD and log the variance explained.

    Parameters
    ----------
    data
        Matrix handed to ``TruncatedSVD.fit_transform``.
    n_components : int
        Number of components to extract.
    algorithm : str, default "arpack"
        Forwarded to ``TruncatedSVD``.
    random_state : optional, default None
        Forwarded to ``TruncatedSVD`` so a run can be seeded for
        reproducibility. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    The transformed data returned by ``fit_transform``.
    """
    svd = TruncatedSVD(
        n_components=n_components, algorithm=algorithm, random_state=random_state
    )
    data_pca = svd.fit_transform(data)
    varex = svd.explained_variance_ratio_
    # The per-component ratios are summed and reported as a percentage.
    log.info(
        "%d component explains %f %% variance", n_components, np.sum(varex * 100.0)
    )
    return data_pca


def renormalize(data, norm=""):
    """Rescale the rows of ``data`` according to ``norm``.

    ``norm="l1"`` dispatches to ``renormalize_l1`` and ``norm="l2"`` to
    ``renormalize_l2``. Any other value (including the default ``""``)
    returns ``data`` itself, untouched.
    """
    # Anything that is not a recognized norm is a pass-through.
    if norm not in ("l1", "l2"):
        return data
    return renormalize_l2(data) if norm == "l2" else renormalize_l1(data)


def renormalize_l1(data):
    """Scale each row of a 2D array so its absolute values sum to 1.

    Rows that are entirely zero have no defined direction; they are returned
    unchanged (all zeros) rather than being turned into NaNs by a division
    by zero.

    Parameters
    ----------
    data
        2D array; the norm is taken along ``axis=1``.

    Returns
    -------
    A new array with the same shape as ``data``.
    """
    row_norms = np.sum(np.abs(data), axis=1)[:, np.newaxis]
    # row_norms is a freshly computed array, so editing it in place is safe.
    # Dividing an all-zero row by 1 leaves it as zeros.
    row_norms[row_norms == 0] = 1
    return data / row_norms


def renormalize_l2(data):
    """Scale each row of a 2D array to unit Euclidean length.

    Rows that are entirely zero have no defined direction; they are returned
    unchanged (all zeros) rather than being turned into NaNs by a division
    by zero.

    Parameters
    ----------
    data
        2D array; the norm is taken along ``axis=1``.

    Returns
    -------
    A new array with the same shape as ``data``.
    """
    row_norms = np.linalg.norm(data, axis=1)[:, np.newaxis]
    # row_norms is a freshly computed array, so editing it in place is safe.
    # Dividing an all-zero row by 1 leaves it as zeros.
    row_norms[row_norms == 0] = 1
    return data / row_norms

time: 20.3 ms (started: 2023-05-28 16:11:43 -07:00)


# 20NG

For the SVD settings below, a different number of components is extracted for each combination of norm and scaling. These numbers were chosen by re-running the SVD several times with the values in the TF-IDF columns permuted randomly to see if I could detect when the SVD starts fitting "noise". That procedure is very slow so it's not given here.

## L1 normalization

### Sublinear=False

In [4]:
# 20NG TF-IDF with L1 row normalization and linear (non-sublinear) tf scaling.
ng20l1l = ng20_tfidf(norm="l1", sublinear=False)

time: 880 ms (started: 2023-05-28 16:05:51 -07:00)


In [19]:
# Truncated SVD of the L1/linear TF-IDF matrix down to 2250 components, then
# pickle the result under the name "ng20" with suffix "l1l" (gzip-compressed,
# overwriting any existing file).
# This cell is slow: the recorded run took about 26 minutes.
ng20l1l_pca = tsvd(ng20l1l, n_components=2250)
_ = write_pickle(
    ng20l1l_pca,
    "ng20",
    suffix="l1l",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

time: 26min 11s (started: 2023-05-28 16:14:19 -07:00)


### Sublinear=True

In [5]:
# 20NG TF-IDF with L1 row normalization and sublinear tf scaling.
ng20l1s = ng20_tfidf(norm="l1", sublinear=True)

time: 873 ms (started: 2023-05-28 16:06:01 -07:00)


In [20]:
# Truncated SVD of the L1/sublinear TF-IDF matrix down to 2750 components, then
# pickle the result under the name "ng20" with suffix "l1s" (gzip-compressed,
# overwriting any existing file).
# This cell is slow: the recorded run took about 39 minutes.
ng20l1s_pca = tsvd(ng20l1s, n_components=2750)
_ = write_pickle(
    ng20l1s_pca,
    "ng20",
    suffix="l1s",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

time: 38min 33s (started: 2023-05-28 16:40:30 -07:00)


## L2 normalization

### Sublinear=False

In [6]:
# 20NG TF-IDF with L2 row normalization and linear (non-sublinear) tf scaling.
ng20l2l = ng20_tfidf(norm="l2", sublinear=False)

time: 1.19 s (started: 2023-05-28 16:06:38 -07:00)


In [21]:
# Truncated SVD of the L2/linear TF-IDF matrix down to 1750 components, then
# pickle the result under the name "ng20" with suffix "l2l" (gzip-compressed,
# overwriting any existing file).
# This cell is slow: the recorded run took about 16 minutes.
ng20l2l_pca = tsvd(ng20l2l, n_components=1750)
_ = write_pickle(
    ng20l2l_pca,
    "ng20",
    suffix="l2l",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

time: 15min 50s (started: 2023-05-28 17:19:03 -07:00)


### Sublinear=True

In [23]:
# 20NG TF-IDF with L2 row normalization and sublinear tf scaling.
ng20l2s = ng20_tfidf(norm="l2", sublinear=True)

time: 1.38 s (started: 2023-05-28 18:01:25 -07:00)


In [24]:
# Truncated SVD of the L2/sublinear TF-IDF matrix down to 3000 components, then
# pickle the result under the name "ng20" with suffix "l2s" (gzip-compressed,
# overwriting any existing file).
# This cell is slow: the recorded run took about 54 minutes.
ng20l2s_pca = tsvd(ng20l2s, n_components=3000)
_ = write_pickle(
    ng20l2s_pca,
    "ng20",
    suffix="l2s",
    verbose=False,
    compression="gzip",
    overwrite=True,
)

time: 53min 48s (started: 2023-05-28 18:01:26 -07:00)


## Pipeline

First, prepare the target labels.

In [25]:
# Fetch the dataset object again; only its labels (target / target_names) are used below.
ng20v = get_20ng()

time: 828 ms (started: 2023-05-28 18:55:15 -07:00)


In [26]:
# Display the newsgroup names; the numeric codes in `ng20v.target` index into this list.
ng20v.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

time: 5.53 ms (started: 2023-05-28 18:55:16 -07:00)


In [27]:
# Display the per-document numeric class codes.
ng20v.target

array([17,  7, 10, ..., 10, 18,  9])

time: 6.02 ms (started: 2023-05-28 18:55:16 -07:00)


Use the `codes_to_categories` function to convert the numeric codes to a category column with the actual newsgroup names:

In [28]:
from drnb.util import codes_to_categories

# Map each numeric class code to its newsgroup name, producing a categorical
# column named "description", then display it.
description = codes_to_categories(
    ng20v.target, ng20v.target_names, col_name="description"
)
description

0        talk.politics.mideast
1                    rec.autos
2             rec.sport.hockey
3             rec.sport.hockey
4                    rec.autos
                 ...          
18841       talk.politics.misc
18842       talk.politics.guns
18843         rec.sport.hockey
18844       talk.politics.misc
18845       rec.sport.baseball
Name: description, Length: 18846, dtype: category
Categories (20, object): ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', ..., 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

time: 43.1 ms (started: 2023-05-28 18:55:16 -07:00)


In [29]:
# Two-column target frame: the numeric code ("class") next to the newsgroup name ("description").
target = pd.concat([pd.Series(ng20v.target, name="class"), description], axis=1)

time: 6.25 ms (started: 2023-05-28 18:55:16 -07:00)


In [30]:
# Display the target frame as a sanity check.
target

Unnamed: 0,class,description
0,17,talk.politics.mideast
1,7,rec.autos
2,10,rec.sport.hockey
3,10,rec.sport.hockey
4,7,rec.autos
...,...,...
18841,18,talk.politics.misc
18842,16,talk.politics.guns
18843,10,rec.sport.hockey
18844,18,talk.politics.misc


time: 24.9 ms (started: 2023-05-28 18:55:16 -07:00)


In [31]:
from drnb.io.pipeline import create_default_pipeline

time: 4.86 s (started: 2023-05-28 18:55:16 -07:00)


### L1

#### Sublinear=False

##### Unnormalized

In [32]:
# Default pipeline (Euclidean metric, duplicate check on) for the L1/linear SVD
# output as-is, under the dataset name "ng20l1lu" (u = unnormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1lu",
    data=ng20l1l_pca,
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


time: 58.1 s (started: 2023-05-28 18:55:21 -07:00)


##### Renormalized

In [33]:
# Same pipeline for the L1/linear SVD output after L1 row renormalization,
# under the dataset name "ng20l1lr" (r = renormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1lr",
    data=renormalize(ng20l1l_pca, norm="l1"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 55.7 s (started: 2023-05-28 18:56:19 -07:00)


#### Sublinear=True

##### Unnormalized

In [34]:
# Default pipeline (Euclidean metric, duplicate check on) for the L1/sublinear
# SVD output as-is, under the dataset name "ng20l1su" (u = unnormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1su",
    data=ng20l1s_pca,
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 1min 3s (started: 2023-05-28 18:57:14 -07:00)


##### Renormalized

In [35]:
# Same pipeline for the L1/sublinear SVD output after L1 row renormalization,
# under the dataset name "ng20l1sr" (r = renormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1sr",
    data=renormalize(ng20l1s_pca, norm="l1"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 1min 4s (started: 2023-05-28 18:58:18 -07:00)


### L2

#### Sublinear=False

##### Unnormalized

In [36]:
# Default pipeline (Euclidean metric, duplicate check on) for the L2/linear SVD
# output as-is, under the dataset name "ng20l2lu" (u = unnormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2lu",
    data=ng20l2l_pca,
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 49.5 s (started: 2023-05-28 18:59:22 -07:00)


##### Renormalized

In [37]:
# Same pipeline for the L2/linear SVD output after L2 row renormalization,
# under the dataset name "ng20l2lr" (r = renormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2lr",
    data=renormalize(ng20l2l_pca, norm="l2"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 50.7 s (started: 2023-05-28 19:00:11 -07:00)


#### Sublinear=True

##### Unnormalized

In [38]:
# Default pipeline (Euclidean metric, duplicate check on) for the L2/sublinear
# SVD output as-is, under the dataset name "ng20l2su" (u = unnormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2su",
    data=ng20l2s_pca,
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 1min 7s (started: 2023-05-28 19:01:02 -07:00)


##### Renormalized

In [39]:
# Same pipeline for the L2/sublinear SVD output after L2 row renormalization,
# under the dataset name "ng20l2sr" (r = renormalized).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2sr",
    data=renormalize(ng20l2s_pca, norm="l2"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 1min 8s (started: 2023-05-28 19:02:10 -07:00)


## 20NG, 500 Components

The downside to using a different number of components for each treatment of the data is that some of the effect in dimensionality reduction may come from the difference in the initial dimensionality. Also, every one of the settings has a much larger number of components extracted than you typically see for most datasets. So here are the data pipelines repeated, but this time only using the first 500 components. That is perhaps still on the large side for what you see as pre-processing for dimensionality reduction, but in the same ballpark as the dimensionality of e.g. BERT embeddings (768D). It also has the disadvantage of being chosen entirely arbitrarily.

### L1

#### Sublinear=False

##### Unnormalized

In [40]:
# L1/linear SVD output truncated to its first 500 columns, not renormalized.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1lu-pca500",
    data=ng20l1l_pca[:, :500],
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 31.9 s (started: 2023-05-28 20:11:29 -07:00)


##### Renormalized

In [41]:
# L1/linear SVD output truncated to its first 500 columns, then L1-renormalized
# (the renormalization is applied after the truncation).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1lr-pca500",
    data=renormalize(ng20l1l_pca[:, :500], norm="l1"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 31.6 s (started: 2023-05-28 20:12:01 -07:00)


#### Sublinear=True

##### Unnormalized

In [42]:
# L1/sublinear SVD output truncated to its first 500 columns, not renormalized.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1su-pca500",
    data=ng20l1s_pca[:, :500],
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 31.8 s (started: 2023-05-28 20:12:33 -07:00)


##### Renormalized

In [43]:
# L1/sublinear SVD output truncated to its first 500 columns, then
# L1-renormalized (the renormalization is applied after the truncation).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l1sr-pca500",
    data=renormalize(ng20l1s_pca[:, :500], norm="l1"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 31.9 s (started: 2023-05-28 20:13:05 -07:00)


### L2

#### Sublinear=False

##### Unnormalized

In [44]:
# L2/linear SVD output truncated to its first 500 columns, not renormalized.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2lu-pca500",
    data=ng20l2l_pca[:, :500],
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 32.1 s (started: 2023-05-28 20:13:37 -07:00)


##### Renormalized

In [45]:
# L2/linear SVD output truncated to its first 500 columns, then L2-renormalized
# (the renormalization is applied after the truncation).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2lr-pca500",
    data=renormalize(ng20l2l_pca[:, :500], norm="l2"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 32.3 s (started: 2023-05-28 20:14:09 -07:00)


#### Sublinear=True

##### Unnormalized

In [46]:
# L2/sublinear SVD output truncated to its first 500 columns, not renormalized.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2su-pca500",
    data=ng20l2s_pca[:, :500],
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 33.1 s (started: 2023-05-28 20:14:41 -07:00)


##### Renormalized

In [47]:
# L2/sublinear SVD output truncated to its first 500 columns, then
# L2-renormalized (the renormalization is applied after the truncation).
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "ng20l2sr-pca500",
    data=renormalize(ng20l2s_pca[:, :500], norm="l2"),
    target=target,
    tags=["highdim"],
    url="http://qwone.com/~jason/20Newsgroups/",
    verbose=True,
)

time: 32.3 s (started: 2023-05-28 20:15:14 -07:00)


# iclr

See (and run) the `data-pipeline/iclr.ipynb` notebook for where the text data comes from ([iclr abstract data](https://github.com/dkobak/iclr-tsne/blob/main/iclr-tsne.ipynb) originally analyzed by Dmitry Kobak).

In [50]:
from drnb.io import read_pickle

time: 1.58 ms (started: 2023-05-28 22:19:35 -07:00)


In [51]:
# Load the previously pickled ICLR text (name "iclr", suffix "text").
# NOTE(review): presumably one abstract string per document -- confirm against the notebook that wrote it.
iclr_text = read_pickle("iclr", suffix="text")

time: 842 ms (started: 2023-05-28 22:19:36 -07:00)


Some of this data has been processed as part of the `iclr.ipynb` notebook which uses Dmitry's setting of L2 normalization, sublinear scaling, and extracting 100 components. Below, I will try the same combinations of norm/scaling as used with 20NG, but with 200 components to avoid being too duplicative of the previous datasets. 100-200 components is a reasonable choice for all the norm/scaling settings based on comparing the amount of variance extracted at different components vs re-running SVD multiple times with the values in each column randomly permuted, so I've chosen the higher end of the estimate for these runs.

In [53]:
def tfidfv(data, norm="l1", sublinear=False):
    """Vectorize raw text and apply TF-IDF in a single step.

    Parameters
    ----------
    data
        Input handed to ``TfidfVectorizer.fit_transform``.
    norm
        Row normalization to apply (``"l1"`` or ``"l2"``). Any falsy value
        (for example ``""``) switches normalization off.
    sublinear : bool
        Forwarded as ``sublinear_tf``.

    Returns
    -------
    The result of ``TfidfVectorizer.fit_transform(data)``.
    """
    # A falsy norm is mapped to None; anything else is passed through as-is.
    row_norm = norm if norm else None
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        norm=row_norm, sublinear_tf=sublinear
    )
    return vectorizer.fit_transform(data)

time: 2.51 ms (started: 2023-05-28 22:21:26 -07:00)


In [60]:
from drnb.io.dataset import read_target, read_palette

# Load the target and palette already stored for the "iclr" dataset so the new
# variants below can reuse them.
iclr_target = read_target("iclr")
iclr_palette = read_palette("iclr")

time: 130 ms (started: 2023-05-28 22:25:11 -07:00)


## L1 normalization

### Sublinear=False

In [54]:
# ICLR text: TF-IDF with L1 norm and linear tf, then truncated SVD to 200 components.
iclrl1l = tfidfv(iclr_text, norm="l1", sublinear=False)
iclrl1l_pca = tsvd(iclrl1l, n_components=200)

time: 14.2 s (started: 2023-05-28 22:21:32 -07:00)


#### Unnormalized

In [62]:
# Default pipeline (Euclidean metric, duplicate check on) for the ICLR L1/linear
# SVD output as-is, reusing the existing ICLR target and palette.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl1lu-pca200",
    data=iclrl1l_pca,
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 24.1 s (started: 2023-05-28 22:27:57 -07:00)


#### Renormalized

In [63]:
# Same pipeline for the ICLR L1/linear SVD output after L1 row renormalization.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl1lr-pca200",
    data=renormalize(iclrl1l_pca, norm="l1"),
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 24.8 s (started: 2023-05-28 22:28:22 -07:00)


### Sublinear=True

In [55]:
# ICLR text: TF-IDF with L1 norm and sublinear tf, then truncated SVD to 200 components.
iclrl1s = tfidfv(iclr_text, norm="l1", sublinear=True)
iclrl1s_pca = tsvd(iclrl1s, n_components=200)

time: 16.1 s (started: 2023-05-28 22:22:00 -07:00)


#### Unnormalized

In [64]:
# Default pipeline (Euclidean metric, duplicate check on) for the ICLR
# L1/sublinear SVD output as-is, reusing the existing ICLR target and palette.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl1su-pca200",
    data=iclrl1s_pca,
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 24 s (started: 2023-05-28 22:29:39 -07:00)


#### Renormalized

In [70]:
# Same pipeline for the ICLR L1/sublinear SVD output after L1 row renormalization.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl1sr-pca200",
    data=renormalize(iclrl1s_pca, norm="l1"),
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 23.7 s (started: 2023-05-28 22:34:21 -07:00)


## L2 normalization

### Sublinear=False

In [57]:
# ICLR text: TF-IDF with L2 norm and linear tf, then truncated SVD to 200 components.
iclrl2l = tfidfv(iclr_text, norm="l2", sublinear=False)
iclrl2l_pca = tsvd(iclrl2l, n_components=200)

time: 13.2 s (started: 2023-05-28 22:22:38 -07:00)


#### Unnormalized

In [66]:
# Default pipeline (Euclidean metric, duplicate check on) for the ICLR L2/linear
# SVD output as-is, reusing the existing ICLR target and palette.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl2lu-pca200",
    data=iclrl2l_pca,
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 23.6 s (started: 2023-05-28 22:31:33 -07:00)


#### Renormalized

In [67]:
# Same pipeline for the ICLR L2/linear SVD output after L2 row renormalization.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl2lr-pca200",
    data=renormalize(iclrl2l_pca, norm="l2"),
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 23.5 s (started: 2023-05-28 22:31:59 -07:00)


### Sublinear=True

In [58]:
# ICLR text: TF-IDF with L2 norm and sublinear tf, then truncated SVD to 200 components.
iclrl2s = tfidfv(iclr_text, norm="l2", sublinear=True)
iclrl2s_pca = tsvd(iclrl2s, n_components=200)

time: 15.8 s (started: 2023-05-28 22:22:52 -07:00)


#### Unnormalized

In [68]:
# Default pipeline (Euclidean metric, duplicate check on) for the ICLR
# L2/sublinear SVD output as-is, reusing the existing ICLR target and palette.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl2su-pca200",
    data=iclrl2s_pca,
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 24.3 s (started: 2023-05-28 22:32:57 -07:00)


#### Renormalized

In [69]:
# Same pipeline for the ICLR L2/sublinear SVD output after L2 row renormalization.
_ = create_default_pipeline(
    check_for_duplicates=True,
    metric=["euclidean"],
).run(
    "iclrl2sr-pca200",
    data=renormalize(iclrl2s_pca, norm="l2"),
    target=iclr_target,
    target_palette=iclr_palette,
    url="https://github.com/dkobak/iclr-tsne",
    verbose=True,
)

time: 23.7 s (started: 2023-05-28 22:33:48 -07:00)
