In [1]:
import pickle
import pandas as pd
import numpy as np
import utilities
import preprocess
import parameters

from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sentence_transformers import util, SentenceTransformer
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
warnings.filterwarnings("ignore")

## parameters

In [3]:
# Algorithm parameters: bind the settings defined in the project's `parameters`
# module to notebook-level names so the cells below can use them directly.
from parameters import (
    balance_ratio,
    random_state,
    threshold_factor,
    test_size,
    sim_calculation_type,
    sim_type,
    success_metric,
    embedding_method,
    data_paths,
    unlabaled_ratios,  # (sic) spelling matches the attribute name in `parameters`
)

# Seed NumPy's global RNG with the configured random state.
np.random.seed(random_state)

In [4]:
# Base estimator that main() hands to utilities.multilabel_classifier.
# Alternative kept for reference: LinearSVC(class_weight='balanced')
classifier_object = LogisticRegression(class_weight="balanced")

# main

In [5]:
# Oversampler variants implemented in `utilities` as `oversample_dataset_<version>`.
_OVERSAMPLER_VERSIONS = ('v1', 'v2', 'v3', 'v4')


def _load_embeddings(data, df, embedding_method):
    """Return the embedding vectors for the rows of `df`.

    A pre-computed pickle named ``X_num_<data>.p`` is used when it exists in the
    working directory; otherwise the 'text' column is preprocessed and
    vectorised with `embedding_method`.
    """
    import os  # local import: only this helper needs it

    cache_path = f'X_num_{data}.p'
    if os.path.exists(cache_path):
        # SECURITY: unpickling can execute arbitrary code; only load pickles produced by this project.
        return pd.read_pickle(cache_path)

    # No cached embeddings for this dataset: compute them from the raw text.
    X = df['text'].apply(preprocess.preprocess_text)
    X_num = utilities.vectorize_data(X, embedding_method)
    return pd.Series([np.squeeze(i) for i in X_num])


def main(data, embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric,
         single_metric, oversampler_version):
    """Run one semi-supervised oversampling experiment on a dataset and print the results.

    Steps: load the data and its embeddings, split into test / labeled / unlabeled
    parts, fit and print a baseline multilabel classifier, oversample the labeled
    set from the unlabeled pool, refit and print the classifier, and finally
    compare the labels assigned during oversampling with the ground truth.

    Parameters
    ----------
    data : str
        Key into the notebook-level `data_paths` and `unlabaled_ratios` mappings.
    embedding_method
        Passed to `utilities.vectorize_data`; only used when no cached
        ``X_num_<data>.p`` pickle exists.
    classifier_object
        Estimator passed to `utilities.multilabel_classifier`.
    sim_type
        Accepted for call compatibility; not used by this function.
    sim_calculation_type
        Forwarded to the selected oversampler.
    success_metric
        Forwarded to `utilities.multilabel_classifier` and to the 'v4' oversampler.
    single_metric
        Forwarded as `single_score` to the 'v3' and 'v4' oversamplers; ignored by
        'v1' and 'v2'.
    oversampler_version : {'v1', 'v2', 'v3', 'v4'}
        Selects `utilities.oversample_dataset_<version>`.

    Raises
    ------
    ValueError
        If `oversampler_version` is not one of the supported versions.
    """
    # Fail fast: previously an unknown version skipped every branch and only
    # surfaced as an unbound `validation` after both classifier fits had run.
    if oversampler_version not in _OVERSAMPLER_VERSIONS:
        raise ValueError(
            f'Unknown oversampler_version {oversampler_version!r}; '
            f'expected one of {_OVERSAMPLER_VERSIONS}'
        )

    print('*'*100)
    print('\x1b[1;31m'+data+'\x1b[0m')

    # reading data; every column except 'text' is treated as a label column
    df = utilities.read_data(data_paths[data])
    y = df.drop(['text'], axis=1)
    X_num = _load_embeddings(data, df, embedding_method)

    # hold out a test set, then split the remainder into labeled / unlabeled pools
    X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=test_size, random_state=random_state)
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train,
                                                                      test_size=unlabaled_ratios[data],
                                                                      random_state=random_state)

    # baseline: classifier trained on the labeled pool only
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test,
                                               success_metric=success_metric,
                                               classifier_object=classifier_object, print_results=True)

    # number of instances to add in order to balance the labeled set
    num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio,
                                                                                 calculation_type='metric_based',
                                                                                 s_metrics=s_metric)

    # oversampling the labeled set from the unlabeled pool; every version shares
    # the same positional arguments and differs only in its keyword arguments
    oversampler_kwargs = {'sim_calculation_type': sim_calculation_type, 'batch_size': 1}
    if oversampler_version in ('v3', 'v4'):
        oversampler_kwargs.update(n_iter=1000, single_score=single_metric)
    if oversampler_version == 'v4':
        oversampler_kwargs.update(balance_ratio=balance_ratio, success_metric=success_metric)
    oversample = getattr(utilities, f'oversample_dataset_{oversampler_version}')
    validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = oversample(
        num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test,
        **oversampler_kwargs)

    # check if the result gets better
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test,
                                               success_metric=success_metric,
                                               classifier_object=classifier_object, print_results=True)

    # comparing the found labels and ground truth
    # Each validation record is a 5-tuple; only the last two items are used here.
    # Presumably y_t is a pandas Series of true labels and y_p a dict of assigned
    # labels whose insertion order matches y_t's label order — TODO confirm in utilities.
    y_true, y_pred = [], []
    for _, _, _, y_t, y_p in validation:
        y_true.append(list(y_t.values))
        y_pred.append(list(y_p.values()))

    if not y_true:
        # Nothing was pulled from the unlabeled pool, so there is nothing to score.
        print('No instances were added by the oversampler; nothing to validate.')
        return

    acc = 1-hamming_loss(y_true, y_pred)
    emr = accuracy_score(y_true, y_pred)
    print('-'*30)
    print(f'Exact match ratio : {emr:.2f} ')
    print(f'Accuracy          : {acc:.2f} ')
    print('-'*30)

# Metric names kept for reference — presumably the values accepted for `single_metric` / `success_metric`; confirm against `utilities`.
'accuracy'
'f1_score'
'coverage'
'label_ranking'
'roc_auc_score'
'log_loss'
'average_precision'
'brier_loss'
'hamming_loss'
'precision'
'recall'
'zero_one_loss'
'label_ranking_average_precision'

In [6]:
# opp115 experiment with the v1 oversampler.
main(
    data='opp115',
    embedding_method=embedding_method,
    classifier_object=classifier_object,
    sim_type=sim_type,
    sim_calculation_type=sim_calculation_type,
    success_metric=success_metric,
    single_metric='coverage',
    oversampler_version='v1',
)

****************************************************************************************************
[1;31mopp115[0m
(135,) (2584,) (680,)
[1mMultilabel Classifier Results[0m
[1mLogisticRegression[0m
------------------------------
Hamming Loss
Training : 0.05
Test     : 0.08
Exact Match Ratio
Training : 0.55
Test     : 0.39
Macro F1-Score
Training : 0.74
Test     : 0.64
Coverage Error
Training : 1.46
Test     : 2.06
Ranking Loss Error
Training : 0.02
Test     : 0.07
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.35      0.74      0.47        19
                       Data Security       0.70      0.82      0.76        51
                        Do Not Track       0.65      0.92      0.76        12
          First Party Collection/Use       0.67      0.72      0.70       218
International and Specific Audiences       0.73      0.84      0.78    

In [7]:
# opp115 experiment with the v2 oversampler.
main(
    data='opp115',
    embedding_method=embedding_method,
    classifier_object=classifier_object,
    sim_type=sim_type,
    sim_calculation_type=sim_calculation_type,
    success_metric=success_metric,
    single_metric='coverage',
    oversampler_version='v2',
)

****************************************************************************************************
[1;31mopp115[0m
(135,) (2584,) (680,)
[1mMultilabel Classifier Results[0m
[1mLogisticRegression[0m
------------------------------
Hamming Loss
Training : 0.05
Test     : 0.08
Exact Match Ratio
Training : 0.55
Test     : 0.39
Macro F1-Score
Training : 0.74
Test     : 0.64
Coverage Error
Training : 1.46
Test     : 2.06
Ranking Loss Error
Training : 0.02
Test     : 0.07
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.35      0.74      0.47        19
                       Data Security       0.70      0.82      0.76        51
                        Do Not Track       0.65      0.92      0.76        12
          First Party Collection/Use       0.67      0.72      0.70       218
International and Specific Audiences       0.73      0.84      0.78    

In [8]:
# opp115 experiment with the v3 oversampler ('coverage' is forwarded as its single_score).
main(
    data='opp115',
    embedding_method=embedding_method,
    classifier_object=classifier_object,
    sim_type=sim_type,
    sim_calculation_type=sim_calculation_type,
    success_metric=success_metric,
    single_metric='coverage',
    oversampler_version='v3',
)

****************************************************************************************************
[1;31mopp115[0m
(135,) (2584,) (680,)
[1mMultilabel Classifier Results[0m
[1mLogisticRegression[0m
------------------------------
Hamming Loss
Training : 0.05
Test     : 0.08
Exact Match Ratio
Training : 0.55
Test     : 0.39
Macro F1-Score
Training : 0.74
Test     : 0.64
Coverage Error
Training : 1.46
Test     : 2.06
Ranking Loss Error
Training : 0.02
Test     : 0.07
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.35      0.74      0.47        19
                       Data Security       0.70      0.82      0.76        51
                        Do Not Track       0.65      0.92      0.76        12
          First Party Collection/Use       0.67      0.72      0.70       218
International and Specific Audiences       0.73      0.84      0.78    

TypeError: 'list' object is not callable

In [None]:
# opp115 experiment with the v4 oversampler ('coverage' is forwarded as its single_score).
main(
    data='opp115',
    embedding_method=embedding_method,
    classifier_object=classifier_object,
    sim_type=sim_type,
    sim_calculation_type=sim_calculation_type,
    success_metric=success_metric,
    single_metric='coverage',
    oversampler_version='v4',
)

In [None]:
# NOTE(review): `ss` is not defined in any visible cell, so running this cell raises NameError.
# Presumably stray input or a manual barrier to halt "Run All" — confirm, then delete the cell.
ss

In [None]:
# main() requires `oversampler_version` as its eighth argument; the original call
# stopped after the metric name and would fail with a TypeError. 'v4' is used here
# because main() forwards `single_metric` only to the v3 and v4 oversamplers and the
# preceding cell runs v4 — adjust if another variant was intended.
# NOTE(review): the metric-name list in this notebook spells this metric
# 'label_ranking' (underscore); confirm which spelling `utilities` expects.
main('opp115', embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric,
     single_metric='label-ranking', oversampler_version='v4')

In [None]:
# NOTE(review): `asdas` is not defined in any visible cell, so running this cell raises NameError.
# Presumably stray input or a manual barrier to halt "Run All" — confirm, then delete the cell.
asdas

In [None]:
# oversampling methods
# NOTE(review): these lines read like a signature cheat-sheet rather than runnable code.
# The functions are called without the `utilities.` prefix that main() uses, and the
# arguments (num_of_new_instances, X_labeled, batch_size, n_iter, ...) are not defined
# at notebook level in any visible cell, so executing this cell raises NameError.
# main() additionally passes `single_score` to v3 and v4, which is not listed here.

oversample_dataset_v2(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size)
oversample_dataset_v3(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter)
oversample_dataset_v4(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter, balance_ratio, success_metric)

In [None]:
# Run the pipeline once for every dataset configured in `data_paths`.
# main() requires `single_metric` and `oversampler_version`; the original call omitted
# both and would fail with a TypeError. The values below mirror the first executed
# opp115 run in this notebook ('coverage', 'v1') — change them to evaluate other variants.
# The loop variable keeps the name `data` because a later cell reads it after the loop.
for data in data_paths:
    main(data, embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric,
         single_metric='coverage', oversampler_version='v1')

In [None]:
# NOTE(review): `stop` is not defined in any visible cell, so running this cell raises NameError.
# Presumably a manual barrier to halt "Run All" before the scratch cells below — confirm.
stop

In [None]:
import numpy as np

In [None]:
# Reload one dataset at notebook level for ad-hoc inspection.
# NOTE(review): `data` is not assigned in this cell; it is whatever value an earlier
# cell left behind (e.g. the last key visited by the dataset loop).
df = utilities.read_data(data_paths[data])

# Label matrix: every column except the raw text.
y = df.drop(columns=['text'])

# Preprocessed text, one entry per row of `df`.
X = df['text'].apply(preprocess.preprocess_text)

In [None]:
# Index labels of the rows whose `col` label equals 1.
# NOTE(review): `col` is not defined in any visible cell — set it before running.
y.loc[y[col] == 1].index