In [1]:
pip install opentsne

Defaulting to user installation because normal site-packages is not writeable
Collecting opentsne
  Downloading openTSNE-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading openTSNE-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: opentsne
Successfully installed opentsne-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sys
sys.path.append("../..")

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pu.feature_extractors.extractors import ViTExtractor, AutoencoderExtractor
from pu.data.loaders import CSVLoader, SingleCSVLoader, SingleCSVWithTestLoader, SinglePNCSVWithTestLoader
from pu.data.pu_builder import build_pu_data

from pu.algorithms.pu_algorithms import IterativeClassifierAlgorithm, ProbTagging
from pu.algorithms.negative_detectors import NaiveDetector, KNNDetector
from pu.algorithms.stop_criterion import StopOnMetricDrop, NonStop

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.decomposition import PCA

import openTSNE

def unison_shuffled_copies(a, b):
    """Shuffle two equally long arrays with one shared random permutation.

    Both inputs are indexed with the same permutation drawn from NumPy's
    global random state, so element ``i`` of the first result still
    corresponds to element ``i`` of the second result.

    Parameters
    ----------
    a, b : array-like supporting integer-array indexing
        Sequences of identical length.

    Returns
    -------
    tuple
        ``(a[order], b[order])`` for the drawn permutation ``order``.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    size = len(a)
    assert size == len(b)
    # A single permutation is reused for both inputs to keep rows paired.
    order = np.random.permutation(size)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

2023-12-20 10:03:02.464733: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-20 10:03:02.489856: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-20 10:03:02.489871: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-20 10:03:02.489887: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-20 10:03:02.496042: I tensorflow/core/platform/cpu_feature_g

In [2]:
# Inputs for the labelled AVA split used by every visualisation below.
AVA_CSV_PATH = '/srv/PU-dataset/unlabeled.csv'
AVA_DATASET_DIR = '/srv/PU-dataset/dataset_unlabeled'
# A row counts as positive when its 'VotesMean' value is strictly above this.
POSITIVE_VOTES_THRESHOLD = 5.0

ava_loader = SinglePNCSVWithTestLoader(
    AVA_CSV_PATH,
    'id',  # presumably the identifier column of the CSV - confirm against the loader
    AVA_DATASET_DIR,
    positive_fn=lambda row: row['VotesMean'] > POSITIVE_VOTES_THRESHOLD,
    test_frac=0.2,
    random_state=1234,
)

# load_data() yields four collections: train/test crossed with positive/negative.
(
    paths_train_positive,
    paths_train_negative,
    paths_test_positive,
    paths_test_negative,
) = ava_loader.load_data()

In [3]:
def tsne_huge(x, y, sample_size=25000, sample_seed=0):
    """Embed a large feature matrix with openTSNE via a subsample-first scheme.

    Steps performed:

    1. Build perplexity-50 affinities over all rows of ``x``.
    2. Draw a random subsample of at most ``sample_size`` rows and fit a
       regular t-SNE embedding on it (perplexity 500, PCA initialisation).
    3. Place every remaining row into that embedding with
       ``prepare_partial(k=1, perplexity=1/3)``.
    4. Restore the original row order, rescale the combined coordinates and
       use them to initialise a full embedding, which is then optimised for
       1500 iterations at exaggeration 12.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix with one row per sample.
    y : array-like
        Labels aligned with ``x``. Not used by the computation; kept so that
        existing callers keep working.
    sample_size : int, optional
        Number of rows embedded directly in step 2 (default 25000). When
        ``x`` has no more rows than this, all rows are embedded directly and
        step 3 is skipped.
    sample_seed : int or None, optional
        Seed for choosing the subsample (default 0), so repeated runs pick
        the same rows. ``None`` draws a fresh, non-reproducible subsample.

    Returns
    -------
    openTSNE.TSNEEmbedding
        The result of ``embedding.optimize(n_iter=1500, exaggeration=12)``,
        with rows in the same order as ``x``.

    Raises
    ------
    ValueError
        If ``sample_size`` is smaller than 1.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")

    # Affinities over the complete data set; consumed by the final optimisation.
    aff50 = openTSNE.affinity.PerplexityBasedNN(
        x,
        perplexity=50,
        n_jobs=32,
        random_state=0,
    )

    # Seeded shuffle (independent of the global NumPy RNG) so that the same
    # rows form the subsample on every run; `reverse` undoes the shuffle.
    indices = np.random.RandomState(sample_seed).permutation(x.shape[0])
    reverse = np.argsort(indices)

    x_sample, x_rest = x[indices[:sample_size]], x[indices[sample_size:]]

    sample_affinities = openTSNE.affinity.PerplexityBasedNN(
        x_sample,
        perplexity=500,
        n_jobs=32,
        random_state=0,
        verbose=True,
    )

    sample_init = openTSNE.initialization.pca(x_sample, random_state=42)
    sample_embedding = openTSNE.TSNE(n_jobs=-1, verbose=True).fit(
        affinities=sample_affinities,
        initialization=sample_init,
    )

    # Rows outside the subsample are positioned relative to the sample
    # embedding. Skip this when the subsample already covers all rows, since
    # there is nothing left to place.
    init_parts = [np.asarray(sample_embedding)]
    if x_rest.shape[0] > 0:
        rest_init = sample_embedding.prepare_partial(x_rest, k=1, perplexity=1/3)
        init_parts.append(np.asarray(rest_init))

    # Back to the original row order, then shrink so the first axis has a
    # standard deviation of 1e-4 before the full optimisation starts.
    init_full = np.vstack(init_parts)[reverse]
    init_full = init_full / (np.std(init_full[:, 0]) * 10000)

    embedding = openTSNE.TSNEEmbedding(
        init_full,
        aff50,
        n_jobs=32,
        verbose=True,
        random_state=42,
    )

    # NOTE(review): only an exaggeration=12 phase is run here; openTSNE's
    # large-data example appears to follow it with lower-exaggeration phases.
    # Confirm that stopping after this phase is intentional.
    embedding1 = embedding.optimize(n_iter=1500, exaggeration=12)
    return embedding1

In [10]:
def visualize_embedding_vit(emb_name):
    """Project ViT features of the training images with t-SNE and save a plot.

    Features are extracted separately for ``paths_train_positive`` and
    ``paths_train_negative`` with ``ViTExtractor``, stacked (positives first),
    embedded with ``tsne_huge`` and drawn as a scatter plot coloured by label.
    The figure is written to ``"{emb_name}.png"`` in the working directory and
    then closed, so no empty figure is left behind for the notebook to render.

    Parameters
    ----------
    emb_name : str
        Value passed as ``extractor_name`` to ``ViTExtractor``; also used as
        the stem of the output file name.
    """
    positive_features = ViTExtractor('projection_ava_positive', extractor_name=emb_name).extract_features(paths_train_positive)
    negative_features = ViTExtractor('projection_ava_negative', extractor_name=emb_name).extract_features(paths_train_negative)

    # Keep only the numeric feature columns.
    positive_features = positive_features.drop(['id'], axis=1).to_numpy()
    negative_features = negative_features.drop(['id'], axis=1).to_numpy()

    # Label 1 marks positives and 0 marks negatives, in the same order as the rows.
    all_features = np.concatenate([positive_features, negative_features])
    labels = np.concatenate([np.ones(positive_features.shape[0]), np.zeros(negative_features.shape[0])])

    embs = tsne_huge(all_features, labels)

    # sns.set keeps the seaborn theme and figure size used before; the figure
    # itself is created explicitly so it can be closed once it has been saved.
    sns.set(rc={'figure.figsize':(23.4,16.54)})
    fig, ax = plt.subplots()
    # Two-colour palette (red, green); presumably applied to the hue levels in
    # sorted order, i.e. 0 -> red and 1 -> green - confirm on the rendered plot.
    sns.scatterplot(
        x=embs[:,0],
        y=embs[:,1],
        hue=labels,
        s=1.0,
        alpha=0.5,
        palette=sns.color_palette(["#FF0000", "#00FF00"], 2),
        ax=ax,
    )

    fig.savefig(f"{emb_name}.png", dpi=300)
    # Close instead of clear: plt.clf() left an empty figure open, which the
    # notebook displayed as "<Figure ... with 0 Axes>".
    plt.close(fig)

In [11]:
# Extractor names to visualise; one PNG is produced for each of them.
VIT_EXTRACTOR_NAMES = ('clip-ViT-B-32', 'clip-ViT-B-16', 'clip-ViT-L-14')

for model_name in VIT_EXTRACTOR_NAMES:
    visualize_embedding_vit(model_name)

===> Finding 1500 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 12.87 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 3.28 seconds
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=-1, verbose=True)
--------------------------------------------------------------------------------
===> Running optimization with exaggeration=12.00, lr=2083.33 for 250 iterations...
Iteration   50, KL divergence 3.6962, 50 iterations in 2.1051 sec
Iteration  100, KL divergence 3.6962, 50 iterations in 2.1154 sec
Iteration  150, KL divergence 3.6962, 50 iterations in 2.0527 sec
Iteration  200, KL divergence 3.6962, 50 iterations in 2.1538 sec
Iteration  250, KL divergence 3.6962, 50 iterations in 2.0923 sec
   --> Time elapsed: 10.52 seconds
===> Running optimization with exaggeration=1.00, lr=25000.00 for 500 iterations...
Iteration   50, KL divergence 1.9970, 50 iterations 

100%|███████████████████████████████████████████| 67/67 [06:33<00:00,  5.87s/it]


===> Finding 1500 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 12.82 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 3.25 seconds
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=-1, verbose=True)
--------------------------------------------------------------------------------
===> Running optimization with exaggeration=12.00, lr=2083.33 for 250 iterations...
Iteration   50, KL divergence 3.6761, 50 iterations in 2.0666 sec
Iteration  100, KL divergence 3.6761, 50 iterations in 1.9876 sec
Iteration  150, KL divergence 3.6761, 50 iterations in 2.0915 sec
Iteration  200, KL divergence 3.6761, 50 iterations in 2.0389 sec
Iteration  250, KL divergence 3.6761, 50 iterations in 2.0284 sec
   --> Time elapsed: 10.21 seconds
===> Running optimization with exaggeration=1.00, lr=25000.00 for 500 iterations...
Iteration   50, KL divergence 1.9736, 50 iterations 

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
100%|█████████████████████████████████████████| 161/161 [37:05<00:00, 13.82s/it]
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
100%|███████████████████████████████████████████| 67/67 [14:53<00:00, 13.33s/it]


===> Finding 1500 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 18.12 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 3.74 seconds
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=-1, verbose=True)
--------------------------------------------------------------------------------
===> Running optimization with exaggeration=12.00, lr=2083.33 for 250 iterations...
Iteration   50, KL divergence 3.6497, 50 iterations in 2.0424 sec
Iteration  100, KL divergence 3.6497, 50 iterations in 2.0380 sec
Iteration  150, KL divergence 3.6497, 50 iterations in 2.0671 sec
Iteration  200, KL divergence 3.6497, 50 iterations in 2.0398 sec
Iteration  250, KL divergence 3.6497, 50 iterations in 2.0040 sec
   --> Time elapsed: 10.19 seconds
===> Running optimization with exaggeration=1.00, lr=25000.00 for 500 iterations...
Iteration   50, KL divergence 2.0039, 50 iterations 

<Figure size 2340x1654 with 0 Axes>

In [4]:
def visualize_embedding_autoencoder(filters):
    """Project autoencoder features of the training images with t-SNE and save a plot.

    All positive and negative training paths are passed to one
    ``AutoencoderExtractor``; the resulting features are embedded with
    ``tsne_huge`` and drawn as a scatter plot coloured by label. The figure is
    written to ``"{extractor.filename}.png"`` and then closed, so no empty
    figure is left behind for the notebook to render.

    Parameters
    ----------
    filters : list of int
        Passed unchanged as ``filters`` to ``AutoencoderExtractor``.
    """
    all_paths = np.concatenate([paths_train_positive, paths_train_negative])
    # Label 1 marks positives and 0 marks negatives, in the same order as all_paths.
    labels = np.concatenate([np.ones(len(paths_train_positive)), np.zeros(len(paths_train_negative))])
    extractor = AutoencoderExtractor('projection_ava', input_shape=(256, 256, 3), filters=filters)
    features = extractor.extract_features(all_paths)

    # Drop the id column and convert once, then select rows by label.
    # Assumes extract_features returns rows in the order of all_paths - TODO confirm.
    feature_matrix = features.drop(['id'], axis=1).to_numpy()
    positive_features = feature_matrix[labels == 1]
    negative_features = feature_matrix[labels == 0]

    all_features = np.concatenate([positive_features, negative_features])

    embs = tsne_huge(all_features, labels)

    # sns.set keeps the seaborn theme and figure size used before; the figure
    # itself is created explicitly so it can be closed once it has been saved.
    sns.set(rc={'figure.figsize':(23.4,16.54)})
    fig, ax = plt.subplots()
    # Two-colour palette (red, green); presumably applied to the hue levels in
    # sorted order, i.e. 0 -> red and 1 -> green - confirm on the rendered plot.
    sns.scatterplot(
        x=embs[:,0],
        y=embs[:,1],
        hue=labels,
        s=1.0,
        alpha=0.5,
        palette=sns.color_palette(["#FF0000", "#00FF00"], 2),
        ax=ax,
    )

    fig.savefig(f"{extractor.filename}.png", dpi=300)
    # Close instead of clear: plt.clf() left an empty figure open, which the
    # notebook displayed as "<Figure ... with 0 Axes>".
    plt.close(fig)

In [5]:
# Set before any autoencoder is built in this cell.
# NOTE(review): presumably TensorFlow only honours this variable if it is set
# before the GPU is first initialised - confirm on a fresh kernel.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# One filter configuration per autoencoder variant to visualise.
AUTOENCODER_FILTER_CONFIGS = [
    [8, 16, 16, 32],
    [8, 16, 32, 64, 64],
    [8, 16, 32, 64, 64, 128],
]

for filters in AUTOENCODER_FILTER_CONFIGS:
    visualize_embedding_autoencoder(filters)

2023-12-20 10:03:50.898480: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-20 10:03:50.903450: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-20 10:03:50.903542: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

===> Finding 1500 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 224.43 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 3.10 seconds
--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=-1, verbose=True)
--------------------------------------------------------------------------------
===> Running optimization with exaggeration=12.00, lr=2083.33 for 250 iterations...
Iteration   50, KL divergence 3.1868, 50 iterations in 2.0661 sec
Iteration  100, KL divergence 3.1962, 50 iterations in 2.1077 sec
Iteration  150, KL divergence 3.1964, 50 iterations in 2.0610 sec
Iteration  200, KL divergence 3.2109, 50 iterations in 2.0729 sec
Iteration  250, KL divergence 3.2113, 50 iterations in 2.0407 sec
   --> Time elapsed: 10.35 seconds
===> Running optimization with exaggeration=1.00, lr=25000.00 for 500 iterations...
Iteration   50, KL divergence 1.8605, 50 iterations

<Figure size 2340x1654 with 0 Axes>