In [1]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# lets the `pu.*` imports in the next cell resolve when the notebook is run
# from its own folder — TODO confirm the expected working directory.
sys.path.append("..")

In [16]:
import numpy as np
import pandas as pd

from pu.feature_extractors.extractors import ViTExtractor, AutoencoderExtractor
from pu.data.loaders import CSVLoader, SingleCSVLoader, SingleCSVWithTestLoader
from pu.data.pu_builder import build_pu_data

from pu.algorithms.pu_algorithms import IterativeClassifierAlgorithm
from pu.algorithms.negative_detectors import NaiveDetector, KNNDetector
from pu.algorithms.stop_criterion import StopOnMetricDrop, NonStop

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [3]:
# Cut-offs applied to each row's 'VotesMean' value by the predicates below.
RELIABLE_POSITIVE_VOTES_ABOVE = 6.5
POSITIVE_VOTES_AT_LEAST = 5.0


def _is_reliable_positive(row):
    """Return True when the row's 'VotesMean' is strictly above the reliable cut-off."""
    return row['VotesMean'] > RELIABLE_POSITIVE_VOTES_ABOVE


def _is_positive(row):
    """Return True when the row's 'VotesMean' reaches the positive cut-off."""
    return row['VotesMean'] >= POSITIVE_VOTES_AT_LEAST


# The three positional arguments are passed through unchanged; 'id' is
# presumably the identifier column of the CSV — TODO confirm against the loader.
ava_loader = SingleCSVWithTestLoader(
    '/srv/PU-dataset/unlabeled.csv',
    'id',
    '/srv/PU-dataset/dataset_unlabeled',
    reliable_positive_fn=_is_reliable_positive,
    positive_fn=_is_positive,
    test_frac=0.2,
    random_state=1234,
)

# load_data() yields four path collections, unpacked in this order.
(
    paths_train_positive,
    paths_train_unlabeled,
    paths_test_positive,
    paths_test_negative,
) = ava_loader.load_data()

In [4]:
# Extract features for the training positives and the unlabeled samples.
# Alternative extractor, kept for reference:
#   extractor = AutoencoderExtractor(input_shape=(224, 224, 3), filters=[8,16,32,64,64])
extractor = ViTExtractor('test_ava')
positive_features, unlabeled_features = extractor.extract_features(
    paths_train_positive,
    paths_train_unlabeled,
)

# Drop the 'label' and 'id' columns from both frames and convert what remains
# to NumPy arrays (the names are rebound from DataFrame to ndarray here).
positive_features, unlabeled_features = (
    frame.drop(columns=['label', 'id']).to_numpy()
    for frame in (positive_features, unlabeled_features)
)

In [5]:
# Create partitions
# Keyword options for build_pu_data, gathered in one place so they are easy to
# tweak. Their exact semantics live in pu.data.pu_builder — not restated here.
pu_split_options = dict(
    frac=1.0,
    move_to_unlabeled_frac=0.5,
    test_split=0.2,
    test_split_positive='same',
)

X_train, X_test, y_train, y_test = build_pu_data(
    positive_features,
    unlabeled_features,
    **pu_split_options,
)

Total amount: 204278
Test amount: 40855
Test_positive amount: 2492
Test_unlabeled amount: 38363
Train amount: 163423
Known_positive amount: 4984
Unlabeled_amount: 158438
Size of positive paths: 12461
Size of unlabeled paths: 191817


In [23]:
# Train the PU algorithm
#
# The metric name is defined once so that the stop criterion and the summary
# printed at the end cannot drift apart. (The summary used to be labelled
# "f1 score" although the criterion is configured with 'aul'; the values logged
# by the recorded run exceed 1, which an F1 score cannot.)
validation_metric = 'aul'

# NOTE(review): the recorded run shows LogisticRegression repeatedly hitting its
# iteration limit without converging. Consider scaling the features, or
# supplying a classifier with a larger max_iter — how `classifier_class` is
# instantiated is not visible from this notebook, so confirm before changing it.
iterative_cls = IterativeClassifierAlgorithm(
    negative_detector=KNNDetector(frac=0.1, k=20),
    stop_criterion=NonStop(validation_metric),
    classifier_class=LogisticRegression,
    max_iterations=20,
    verbose=True,
)

# The X_test / y_test partition from build_pu_data is passed as the last two
# arguments to fit().
iterative_cls.fit(X_train, y_train, X_test, y_test)

print(f'Evolution of validation metric ({validation_metric}): {iterative_cls.validation_results}')

Iteration #0
Number of negatives: 15843
Number of positives: 4985


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 0: 1.2039175105319335
Moving 1425 from unlabeled to negative and from unlabeled to positive
Iteration #1
Number of negatives: 17268
Number of positives: 6410


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 1: 1.2221657411591744
Moving 1397 from unlabeled to negative and from unlabeled to positive
Iteration #2
Number of negatives: 18665
Number of positives: 7807


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 2: 1.2367748307757684
Moving 1369 from unlabeled to negative and from unlabeled to positive
Iteration #3
Number of negatives: 20034
Number of positives: 9176


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 3: 1.2501159696720858
Moving 1342 from unlabeled to negative and from unlabeled to positive
Iteration #4
Number of negatives: 21376
Number of positives: 10518


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 4: 1.258754816503201
Moving 1315 from unlabeled to negative and from unlabeled to positive
Iteration #5
Number of negatives: 22691
Number of positives: 11833


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 5: 1.2678837195285313
Moving 1288 from unlabeled to negative and from unlabeled to positive
Iteration #6
Number of negatives: 23979
Number of positives: 13121


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 6: 1.267686371417331
Moving 1263 from unlabeled to negative and from unlabeled to positive
Iteration #7
Number of negatives: 25242
Number of positives: 14384


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 7: 1.2778233576340352
Moving 1237 from unlabeled to negative and from unlabeled to positive
Iteration #8
Number of negatives: 26479
Number of positives: 15621


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 8: 1.2801664376819901
Moving 1213 from unlabeled to negative and from unlabeled to positive
Iteration #9
Number of negatives: 27692
Number of positives: 16834


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 9: 1.280656759604668
Moving 1188 from unlabeled to negative and from unlabeled to positive
Iteration #10
Number of negatives: 28880
Number of positives: 18022


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 10: 1.2843146850395823
Moving 1165 from unlabeled to negative and from unlabeled to positive
Iteration #11
Number of negatives: 30045
Number of positives: 19187


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 11: 1.2869632131623783
Moving 1141 from unlabeled to negative and from unlabeled to positive
Iteration #12
Number of negatives: 31186
Number of positives: 20328


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 12: 1.2894707542121189
Moving 1119 from unlabeled to negative and from unlabeled to positive
Iteration #13
Number of negatives: 32305
Number of positives: 21447


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 13: 1.2900039244115344
Moving 1096 from unlabeled to negative and from unlabeled to positive
Iteration #14
Number of negatives: 33401
Number of positives: 22543


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 14: 1.2909826057518021
Moving 1074 from unlabeled to negative and from unlabeled to positive
Iteration #15
Number of negatives: 34475
Number of positives: 23617


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 15: 1.2912540567962951
Moving 1053 from unlabeled to negative and from unlabeled to positive
Iteration #16
Number of negatives: 35528
Number of positives: 24670


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 16: 1.290982177734183
Moving 1032 from unlabeled to negative and from unlabeled to positive
Iteration #17
Number of negatives: 36560
Number of positives: 25702


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 17: 1.2905986940429537
Moving 1011 from unlabeled to negative and from unlabeled to positive
Iteration #18
Number of negatives: 37571
Number of positives: 26713


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Finished training classifier
Validation metric for iteration 18: 1.2913555387606548
Moving 991 from unlabeled to negative and from unlabeled to positive
Iteration #19
Number of negatives: 38562
Number of positives: 27704
Finished training classifier
Validation metric for iteration 19: 1.290335996566417
Moving 971 from unlabeled to negative and from unlabeled to positive
Evolution of f1 score: [1.2039175105319335, 1.2221657411591744, 1.2367748307757684, 1.2501159696720858, 1.258754816503201, 1.2678837195285313, 1.267686371417331, 1.2778233576340352, 1.2801664376819901, 1.280656759604668, 1.2843146850395823, 1.2869632131623783, 1.2894707542121189, 1.2900039244115344, 1.2909826057518021, 1.2912540567962951, 1.290982177734183, 1.2905986940429537, 1.2913555387606548, 1.290335996566417]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Extract features for the held-out test paths. use_cache=False is passed
# explicitly — presumably to force recomputation instead of reusing cached
# features; TODO confirm against the extractor implementation.
test_feature_frames = extractor.extract_features(
    paths_test_positive,
    paths_test_negative,
    use_cache=False,
)
test_positive_features, test_negative_features = test_feature_frames

100%|███████████████████████████████████████████| 41/41 [03:08<00:00,  4.60s/it]
100%|███████████████████████████████████████████| 17/17 [01:10<00:00,  4.12s/it]


In [24]:
# Stack the positive and negative test frames into a single table.
test_df = pd.concat([test_positive_features, test_negative_features])

# Model input: every column except 'id' and 'label', as a NumPy array.
test_data = test_df.drop(columns=['id', 'label']).to_numpy()

# Score the trained PU classifier against the 'label' column; the bare final
# expression is what the cell displays.
y_true = test_df['label']
y_pred = iterative_cls.predict(test_data)
balanced_accuracy_score(y_true, y_pred)

0.6883290777658075