# Experiments by changing the threshold of reliable positives (AVA dataset)

In [1]:
import sys
sys.path.append("..")

In [2]:
import numpy as np
import pandas as pd
import traceback
import os

from pu.feature_extractors.extractors import ViTExtractor, AutoencoderExtractor
from pu.data.loaders import CSVLoader, SingleCSVLoader, SingleCSVWithTestLoader, FullCSVLoader
from pu.data.pu_builder import build_pu_data

from pu.algorithms.pu_algorithms import IterativeClassifierAlgorithm, ProbTagging, NonNegativePU
from pu.algorithms.negative_detectors import NaiveDetector, KNNDetector
from pu.algorithms.stop_criterion import StopOnMetricDrop, NonStop

from sklearn import svm
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

2024-01-08 13:02:55.566775: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-08 13:02:55.587121: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-08 13:02:55.587138: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-08 13:02:55.587151: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-08 13:02:55.591025: I tensorflow/core/platform/cpu_feature_g

## AVA dataset for positive and unlabeled examples

### Iterative classifier

In [6]:
def quantile_experiment(extractor, quantile):
    """Run one iterative-classifier PU experiment on the AVA data.

    Args:
        extractor: Feature extractor. Its ``extract_features`` method is
            called with the ``id`` column of the loaded frame, and the
            returned frame must contain an ``id`` column (it is dropped
            before the features are joined to the metadata).
        quantile: ``VotesMean`` threshold; rows strictly above it are
            flagged as reliable positives.

    Returns:
        Tuple ``(balanced_accuracy, accuracy, f1)`` measured on the test
        split produced by ``build_pu_data``.
    """
    ava_loader = FullCSVLoader(
        '/srv/PU-dataset/unlabeled.csv',
        'id',
        '/srv/PU-dataset/dataset_unlabeled'
    )

    ava_df = ava_loader.load_data()
    features = extractor.extract_features(ava_df['id'])
    # Column-wise join of metadata and features; pd.concat(axis=1) aligns on
    # the index — assumes both frames share the same index, TODO confirm.
    ava_df = pd.concat([ava_df.drop(columns=['id']), features.drop(columns=['id'])], axis=1)

    X_train, X_val, X_test, y_train, y_val, y_test = build_pu_data(
        ava_df,
        frac=1.0,
        move_to_unlabeled_frac=0.5,
        val_split=0.2,
        val_split_positive='same',
        reliable_positive_fn=lambda row, df: row['VotesMean'] > quantile,
        positive_fn=lambda row, df: row['VotesMean'] >= 5.0,
        test_frac=0.2,
        random_state=1234
    )

    iterative_cls = IterativeClassifierAlgorithm(
        negative_detector=KNNDetector(frac=0.1, k=20),
        stop_criterion=NonStop('aul'),
        classifier_class=LogisticRegression,
        max_iterations=20,
        verbose=True,
        classifier_kwargs={'max_iter':10000, 'solver':'saga', 'n_jobs':-1, 'random_state':1234}
    )

    iterative_cls.fit(X_train, y_train, X_val, y_val)
    # NOTE(review): the label says "f1" but the stop criterion is configured
    # with 'aul' — confirm which metric validation_results actually holds.
    print(f'Evolution of f1 score: {iterative_cls.validation_results}')

    # Predict once so the three metrics are computed on the very same
    # predictions and inference over the test split is not repeated.
    y_pred = iterative_cls.predict(X_test)

    bal_acc = balanced_accuracy_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return bal_acc, acc, f1

#### ViT feature extractor (good)

In [7]:
# Extractor names handed to ViTExtractor, one run per name and threshold.
extractors = ['clip-ViT-B-32', 'clip-ViT-B-16', 'clip-ViT-L-14']

# VotesMean thresholds; per the original note they correspond to the
# quantiles [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
quantile_quantities = [
    5.386517,
    5.475771,
    5.566116,
    5.660284,
    5.758871,
    5.865385,
    5.987416,
    6.129032,
    6.307692,
    6.574194,
    7.069421
]

# One accumulator per output column; a row is added only for successful runs.
extractor_col, quantile_col, bal_acc_col, acc_col, f1_col = [], [], [], [], []

for quantile in quantile_quantities:
    for extractor in extractors:
        try:
            vit_extractor = ViTExtractor('quantile_experiments', extractor_name=extractor)
            bal_acc, acc, f1 = quantile_experiment(vit_extractor, quantile)
        except Exception:
            # Report the failing combination and carry on with the sweep.
            print(f'Fail at {quantile}, {extractor}')
            print(traceback.format_exc())
        else:
            columns = (extractor_col, quantile_col, bal_acc_col, acc_col, f1_col)
            values = (extractor, quantile, bal_acc, acc, f1)
            for column, value in zip(columns, values):
                column.append(value)

df = pd.DataFrame({
    'extractor': extractor_col,
    'quantile': quantile_col,
    'balanced_accuracy': bal_acc_col,
    'accuracy': acc_col,
    'f1': f1_col
})

df.to_csv('quantile_threshold_vit_results_ava.csv')

Total amount: 204278
Validation amount: 40855
Val_positive amount: 20434
Val_unlabeled amount: 20421
Train amount: 163423
Known_positive amount: 40869
Unlabeled_amount: 122554
Size of positive paths: 102172
Size of unlabeled paths: 102106
Iteration #0
Number of negatives: 12255
Number of positives: 40869



KeyboardInterrupt



#### Autoencoder-based feature extractor (very bad)

In [4]:
# Filter configurations passed to AutoencoderExtractor, one run per entry.
extractor_filters = [[8,16,16,32], [8,16,32,64,64], [8,16,32,64,64,128]]

# VotesMean thresholds; per the original note they correspond to the
# quantiles [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
quantile_quantities = [
    5.386517,
    5.475771,
    5.566116,
    5.660284,
    5.758871,
    5.865385,
    5.987416,
    6.129032,
    6.307692,
    6.574194,
    7.069421
]

for quantile in quantile_quantities:
    for filters in extractor_filters:
        filename = f'quantile_threshold_autoencoder_results_{quantile}_{"_".join(str(i) for i in filters)}.csv'

        # Each (quantile, filters) run owns one result file; an existing file
        # means that run already finished, so the sweep can be resumed.
        if os.path.exists(filename):
            continue

        try:
            extractor = AutoencoderExtractor('quantile_experiments', input_shape=(256, 256, 3), filters=filters)
            bal_acc, acc, f1 = quantile_experiment(extractor, quantile)

            # Build the single-row frame from this run only, so a failed
            # write can never leak its row into the next run's file.
            df = pd.DataFrame.from_dict({
                'extractor': [filters],
                'quantile': [quantile],
                'balanced_accuracy': [bal_acc],
                'accuracy': [acc],
                'f1': [f1]
            })

            df.to_csv(filename)

        except Exception:
            # Report the filter configuration: `extractor` is unbound (or
            # stale from an earlier run) when the constructor itself raises.
            print(f'Fail at {quantile}, {filters}')
            print(traceback.format_exc())

2023-12-21 16:51:07.450545: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-21 16:51:07.463374: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-21 16:51:07.463458: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Total amount: 204278
Validation amount: 40855
Val_positive amount: 20434
Val_unlabeled amount: 20421
Train amount: 163423
Known_positive amount: 40869
Unlabeled_amount: 122554
Size of positive paths: 102172
Size of unlabeled paths: 102106
Iteration #0
Number of negatives: 12255
Number of positives: 40869


KeyboardInterrupt: 

### Non-negative PU classifier

In [9]:
import tensorflow as tf

def quantile_experiment_nnpu(extractor, quantile, prior):
    """Run one non-negative PU experiment on the AVA data.

    Args:
        extractor: Feature extractor. Its ``extract_features`` method is
            called with the ``id`` column of the loaded frame, and the
            returned frame must contain an ``id`` column (it is dropped
            before the features are joined to the metadata).
        quantile: ``VotesMean`` threshold; rows strictly above it are
            flagged as reliable positives.
        prior: Value forwarded to ``NonNegativePU`` as ``positive_prior``.

    Returns:
        Tuple ``(balanced_accuracy, accuracy, f1)`` measured on the test
        split produced by ``build_pu_data``.
    """
    ava_loader = FullCSVLoader(
        '/srv/PU-dataset/unlabeled.csv',
        'id',
        '/srv/PU-dataset/dataset_unlabeled'
    )

    ava_df = ava_loader.load_data()
    features = extractor.extract_features(ava_df['id'])
    # Column-wise join of metadata and features; pd.concat(axis=1) aligns on
    # the index — assumes both frames share the same index, TODO confirm.
    ava_df = pd.concat([ava_df.drop(columns=['id']), features.drop(columns=['id'])], axis=1)

    X_train, X_val, X_test, y_train, y_val, y_test = build_pu_data(
        ava_df,
        frac=1.0,
        move_to_unlabeled_frac=0.5,
        val_split=0.2,
        val_split_positive='same',
        reliable_positive_fn=lambda row, df: row['VotesMean'] > quantile,
        positive_fn=lambda row, df: row['VotesMean'] >= 5.0,
        test_frac=0.2,
        random_state=1234
    )

    # A single sigmoid unit on top of the extracted features.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    nnpu_cls = NonNegativePU(
        model=model,
        positive_prior=prior,
        loss_fn=tf.keras.losses.BinaryCrossentropy(),
        compile_kwargs={'optimizer': 'adam'},
        fit_kwargs={'epochs': 10}
    )

    nnpu_cls.fit(X_train, y_train, X_val, y_val)

    # Predict once so the three metrics are computed on the very same
    # predictions and inference over the test split is not repeated.
    y_pred = nnpu_cls.predict(X_test)

    bal_acc = balanced_accuracy_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return bal_acc, acc, f1

#### ViT feature extractor

In [None]:
# Extractor names handed to ViTExtractor.
extractors = ['clip-ViT-B-32', 'clip-ViT-B-16', 'clip-ViT-L-14']

# VotesMean thresholds; per the original note they correspond to the
# quantiles [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
quantile_quantities = [
    5.386517,
    5.475771,
    5.566116,
    5.660284,
    5.758871,
    5.865385,
    5.987416,
    6.129032,
    6.307692,
    6.574194,
    7.069421
]

# Candidate positive-class priors tried for every (quantile, extractor) pair.
positive_priors = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# One accumulator per output column; a row is added only for successful runs.
extractor_col, quantile_col, prior_col, bal_acc_col, acc_col, f1_col = [], [], [], [], [], []

for quantile in quantile_quantities:
    for extractor in extractors:
        for prior in positive_priors:
            try:
                vit_extractor = ViTExtractor('quantile_experiments', extractor_name=extractor)
                bal_acc, acc, f1 = quantile_experiment_nnpu(vit_extractor, quantile, prior)

            except Exception:
                # Include the prior: nine runs share each (quantile,
                # extractor) pair and would otherwise be indistinguishable.
                print(f'Fail at {quantile}, {extractor}, prior={prior}')
                print(traceback.format_exc())

            else:
                extractor_col.append(extractor)
                quantile_col.append(quantile)
                prior_col.append(prior)
                bal_acc_col.append(bal_acc)
                acc_col.append(acc)
                f1_col.append(f1)

df = pd.DataFrame.from_dict({
    'extractor': extractor_col,
    'quantile': quantile_col,
    'prior': prior_col,
    'balanced_accuracy': bal_acc_col,
    'accuracy': acc_col,
    'f1': f1_col
})

df.to_csv('quantile_threshold_vit_results_ava_nnpu.csv')

Total amount: 204278
Validation amount: 40855
Val_positive amount: 20434
Val_unlabeled amount: 20421
Train amount: 163423
Known_positive amount: 40869
Unlabeled_amount: 122554
Size of positive paths: 102172
Size of unlabeled paths: 102106
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Total amount: 204278
Validation amount: 40855
Val_positive amount: 20434
Val_unlabeled amount: 20421
Train amount: 163423
Known_positive amount: 40869
Unlabeled_amount: 122554
Size of positive paths: 102172
Size of unlabeled paths: 102106
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Total amount: 204278
Validation amount: 40855
Val_positive amount: 20434
Val_unlabeled amount: 20421
Train amount: 163423
Known_positive amount: 40869
Unlabeled_amount: 122554
Size of positive paths: 102172
Size of unlabeled paths: 102106
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 