In [1]:
import sys
# Append the parent directory to the module search path. Presumably this is
# what makes the project-local `pu` package importable from this notebook's
# folder — TODO confirm against the repository layout.
sys.path.append("..")

In [2]:
import numpy as np
import pandas as pd

from pu.feature_extractors.extractors import ViTExtractor, AutoencoderExtractor
from pu.data.loaders import CSVLoader, SingleCSVLoader, SingleCSVWithTestLoader
from pu.data.pu_builder import build_pu_data

from pu.algorithms.pu_algorithms import IterativeClassifierAlgorithm, ProbTagging
from pu.algorithms.negative_detectors import NaiveDetector, KNNDetector
from pu.algorithms.stop_criterion import StopOnMetricDrop, NonStop

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

2023-12-14 12:00:20.096789: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-14 12:00:20.162415: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 12:00:20.162554: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 12:00:20.162618: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-14 12:00:20.179134: I tensorflow/core/platform/cpu_feature_g

In [4]:
# Build the loader for the dataset and split it into train/test path lists.
# The three positional arguments are the CSV file, the string 'id'
# (presumably the identifier column — verify against the loader), and the
# dataset directory.
ava_loader = SingleCSVWithTestLoader(
    '/srv/PU-dataset/unlabeled.csv',
    'id',
    '/srv/PU-dataset/dataset_unlabeled',
    # Fixed seed and test fraction passed to the loader.
    random_state=1234,
    test_frac=0.2,
    # Row predicates on the 'VotesMean' column: the "positive" rule is
    # VotesMean >= 5.0, the stricter "reliable positive" rule is VotesMean > 6.5.
    positive_fn=lambda row, _: row['VotesMean'] >= 5.0,
    reliable_positive_fn=lambda row, _: row['VotesMean'] > 6.5,
)

(
    paths_train_positive,
    paths_train_unlabeled,
    paths_test_positive,
    paths_test_negative,
) = ava_loader.load_data()

In [4]:
# Extract features for the training positives and the unlabeled pool.
extractor = ViTExtractor('test_ava')
positive_features, unlabeled_features = extractor.extract_features(
    paths_train_positive,
    paths_train_unlabeled,
)

# Strip the bookkeeping columns ('label', 'id') and convert each frame to a
# NumPy array. Note that both names are rebound from frames to arrays here.
unlabeled_features = unlabeled_features.drop(columns=['label', 'id']).to_numpy()
positive_features = positive_features.drop(columns=['label', 'id']).to_numpy()

In [5]:
# Split the extracted features into the training and validation partitions
# used by the PU algorithms below.
(
    X_train,
    X_val,
    y_train,
    y_val,
) = build_pu_data(
    positive_features,
    unlabeled_features,
    test_split=0.2,
    test_split_positive='same',
    frac=1.0,
    move_to_unlabeled_frac=0.5,
)

Total amount: 204278
Test amount: 40855
Test_positive amount: 2492
Test_unlabeled amount: 38363
Train amount: 163423
Known_positive amount: 4984
Unlabeled_amount: 158438
Size of positive paths: 12461
Size of unlabeled paths: 191817


In [None]:
# Train the iterative PU classifier.
#
# The model is configured with a KNN-based negative detector, a NonStop
# criterion built on the 'aul' metric, LogisticRegression as the classifier,
# and at most 20 iterations.
iterative_cls = IterativeClassifierAlgorithm(
    negative_detector=KNNDetector(frac=0.1, k=20),
    stop_criterion=NonStop('aul'),
    classifier_class=LogisticRegression,
    max_iterations=20,
    verbose=True,
)

# Monitor training on the validation split returned by build_pu_data.
# The previous version passed X_test / y_test, which are only created in the
# final evaluation cell: on a fresh kernel this cell raised NameError, and on
# a stale kernel it tuned against the held-out test set.
iterative_cls.fit(X_train, y_train, X_val, y_val)

# NOTE(review): the message says "f1 score" while the stop criterion above is
# built with 'aul' — confirm which metric validation_results actually holds.
print(f'Evolution of f1 score: {iterative_cls.validation_results}')

In [6]:
# Configure ProbTagging: 10 classifiers of type LogisticRegression, with
# max_iter=10000 supplied through classifier_kwargs.
probtagging = ProbTagging(
    classifier_class=LogisticRegression,
    classifier_kwargs={'max_iter': 10000},
    num_classifiers=10,
    knn_num_samples=20,
    verbose=True,
)

# Fit on the PU training partition only.
probtagging.fit(X_train, y_train)

[0.   0.   0.   ... 0.   0.   0.05] 0.5 0.0 0.027638887135661906
Training classifier #0
Positives: 4280 // Negatives: 154158
Training classifier #1
Positives: 4373 // Negatives: 154065
Training classifier #2
Positives: 4380 // Negatives: 154058
Training classifier #3
Positives: 4330 // Negatives: 154108
Training classifier #4
Positives: 4384 // Negatives: 154054
Training classifier #5
Positives: 4361 // Negatives: 154077
Training classifier #6
Positives: 4417 // Negatives: 154021
Training classifier #7
Positives: 4362 // Negatives: 154076
Training classifier #8
Positives: 4386 // Negatives: 154052
Training classifier #9
Positives: 4260 // Negatives: 154178


In [22]:
# Extract features for the held-out test paths, with use_cache=False passed
# to the extractor.
test_positive_features, test_negative_features = extractor.extract_features(
    paths_test_positive,
    paths_test_negative,
    use_cache=False,
)

100%|███████████████████████████████████████████| 41/41 [03:10<00:00,  4.64s/it]
100%|███████████████████████████████████████████| 17/17 [01:10<00:00,  4.17s/it]


In [31]:
# Stack the positive and negative test frames, then separate labels from
# feature columns ('id' and 'label' are not features).
test_df = pd.concat([test_positive_features, test_negative_features])
y_test = test_df['label'].to_numpy()
X_test = test_df.drop(columns=['id', 'label']).to_numpy()

# Threshold the predicted probabilities at the model's class_prior attribute
# and score the resulting predictions with balanced accuracy.
balanced_accuracy_score(
    y_test,
    probtagging.predict_proba(X_test) > probtagging.class_prior,
)

0.6605205640264707