In [1]:
import pickle
import pandas as pd
import numpy as np
import utilities
import preprocess
import parameters

from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sentence_transformers import util, SentenceTransformer
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multiclass import OneVsRestClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

## parameters

In [3]:
# Bind the experiment settings defined in the project-local `parameters`
# module to notebook-level names used by the cells below.

# Seed first: the same value is reused for the train/test splits in main().
random_state = parameters.random_state
np.random.seed(random_state)  # seeds NumPy's legacy global RNG

# Dataset lookup tables; main() indexes both of them by dataset name.
data_paths = parameters.data_paths
unlabaled_ratios = parameters.unlabaled_ratios  # (sic) spelled this way in `parameters`

# Split / balancing settings.
test_size = parameters.test_size
balance_ratio = parameters.balance_ratio
threshold_factor = parameters.threshold_factor

# Embedding, similarity and evaluation settings.
embedding_method = parameters.embedding_method
sim_type = parameters.sim_type
sim_calculation_type = parameters.sim_calculation_type
success_metric = parameters.success_metric

In [4]:
# Base estimator passed to main(). class_weight='balanced' asks scikit-learn to
# weight classes inversely to their frequency. random_state is pinned explicitly
# so that each fit is reproducible on its own instead of depending on how much
# of the global NumPy random stream earlier code happened to consume.
classifier_object = LinearSVC(class_weight='balanced', random_state=random_state)

# main

In [5]:
def main(data, embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric):
    """Run the oversampling experiment for one dataset and print the results.

    Steps, as implemented below:
      1. read the dataset and load (or compute and cache) its embeddings,
      2. split into test / labeled / "unlabeled" parts,
      3. fit a baseline classifier on the labeled part to obtain scores,
      4. derive how many new instances to add per label from those scores,
      5. oversample the labeled part from the unlabeled pool,
      6. re-evaluate on the test set and compare the labels assigned during
         oversampling with the held-back ground truth.

    Parameters
    ----------
    data : str
        Dataset name; used as key into the notebook-level ``data_paths`` and
        ``unlabaled_ratios`` and to build the embedding cache file name.
    embedding_method
        Forwarded to ``utilities.vectorize_data`` when no cached embeddings
        exist for ``data``.
    classifier_object
        Estimator forwarded to ``utilities.multilabel_classifier``.
    sim_type
        Not used inside this function; kept so existing callers keep working.
    sim_calculation_type
        Forwarded to ``utilities.oversample_dataset_v4``.
    success_metric
        Forwarded to ``utilities.oversample_dataset_v4`` and to the final
        ``utilities.multilabel_classifier`` call.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the embeddings do not have one row per row of the label frame.
    """
    import os  # local import: only needed for the cache-existence check

    print('*'*100)
    print('\x1b[1;31m'+data+'\x1b[0m')
    # reading data
    df = utilities.read_data(data_paths[data])
    y = df.drop(['text'], axis=1)
    # ------------------------------------------------------------------------------------------------------------------------------
    # Embeddings are cached per dataset. The file name is derived from `data`
    # (it used to be hard-coded to the opp115 file, which paired opp115
    # embeddings with the labels of whatever dataset was requested).
    cache_path = f'X_num_{data}.p'
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code -- only load cache files
        # that were produced locally by this notebook.
        X_num = pd.read_pickle(cache_path)
    else:
        # Cache miss: preprocess and vectorize, then store for the next run.
        X = df['text'].apply(preprocess.preprocess_text)
        X_num = utilities.vectorize_data(X, embedding_method)
        X_num = pd.Series([np.squeeze(i) for i in X_num])
        X_num.to_pickle(cache_path)
    if len(X_num) != len(y):
        raise ValueError(
            f"Embeddings for '{data}' have {len(X_num)} rows but the label frame has {len(y)}; "
            f"the cache file '{cache_path}' is stale or belongs to another dataset."
        )

    X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=test_size, random_state=random_state)
    # Second split: hold back part of the training data to play the role of
    # the unlabeled pool (its true labels are kept for the comparison below).
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train,
                                                                      test_size=unlabaled_ratios[data],
                                                                      random_state=random_state)

    print(X_labeled.shape, y_labeled.shape, X_unlabeled.shape, y_unlabeled.shape)
    # Baseline fit. The metric is fixed to 'col_f1-score' here rather than
    # `success_metric`; its result is what the balancing step consumes below
    # (presumably it needs one score per label column -- confirm in utilities).
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test,
                                               classifier_object=classifier_object, success_metric='col_f1-score')
    # -----------------------------------------------------------------------------------------------------------------------------
    # calculation number of instances to balance dataset
    num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio,
                                                                                 calculation_type='metric_based',
                                                                                 s_metrics=s_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # oversampling dataset using unlabeled data with the given ratios
    print('num_of_new_instances : ',num_of_new_instances)
    validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v4(
        num_of_new_instances,
        X_labeled, y_labeled,
        X_unlabeled, y_unlabeled,
        X_test, y_test,
        sim_calculation_type=sim_calculation_type,
        batch_size=1,
        n_iter=5000,
        balance_ratio=balance_ratio,
        success_metric=success_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # check if the result gets better
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test,
                                               success_metric=success_metric,
                                               classifier_object=classifier_object, print_results=True)
    # comparing the found labels and ground truth
    # Each `validation` entry is unpacked as a 5-tuple whose last two items are
    # the true labels (object with a `.values` attribute) and the assigned
    # labels (object with a `.values()` method).
    y_true, y_pred = [], []
    for _, _, _, y_t, y_p in validation:
        y_true.append(list(y_t.values))
        y_pred.append(list(y_p.values()))

    print('-'*30)
    if y_true:
        acc = 1-hamming_loss(y_true, y_pred)
        emr = accuracy_score(y_true, y_pred)
        print(f'Exact match ratio : {emr:.2f} ')
        print(f'Accuracy          : {acc:.2f} ')
    else:
        # Nothing was added during oversampling, so there are no label pairs
        # to score; report that instead of calling the metrics on empty lists.
        print('No validation entries returned; label comparison skipped.')
    print('-'*30)

    print('*'*100)
    print('/'*100)
    print('*'*100)

In [None]:
# Single-dataset run on 'opp115' using the settings bound in the parameters cell.
main(
    data='opp115',
    embedding_method=embedding_method,
    classifier_object=classifier_object,
    sim_type=sim_type,
    sim_calculation_type=sim_calculation_type,
    success_metric=success_metric,
)

****************************************************************************************************
[1;31mopp115[0m
(326,) (326, 12) (2393,) (2393, 12)
num_of_new_instances :  {'Data Retention': 206, 'Data Security': 64, 'Do Not Track': 137, 'First Party Collection/Use': 22, 'International and Specific Audiences': 57, 'Introductory/Generic': 103, 'Policy Change': 26, 'Practice not covered': 135, 'Privacy contact information': 82, 'Third Party Sharing/Collection': 36, 'User Access, Edit and Deletion': 213, 'User Choice/Control': 125}
[1mMultilabel Classifier Results[0m
[1mLinearSVC[0m
------------------------------
Hamming Loss
Training : 0.03
Test     : 0.04
Exact Match Ratio
Training : 0.73
Test     : 0.65
Macro F1-Score
Training : 0.79
Test     : 0.66
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.00      0.00      0.00        13
           

In [None]:
# NOTE(review): `asdas` is not defined anywhere in the visible notebook, so
# running this cell raises NameError and halts "Run All" at this point --
# presumably a deliberate stop marker; confirm, then remove or replace it.
asdas

In [None]:
# oversampling methods
# NOTE(review): as written this cell cannot run on a fresh kernel. The
# functions are called by bare name although only the `utilities` module is
# imported in the visible notebook, and the arguments are either locals of
# main() (e.g. num_of_new_instances, X_labeled) or never assigned at notebook
# level (batch_size, n_iter). It looks like a reference list of the available
# oversampling signatures -- confirm, and move it to markdown if so.
oversample_dataset_v1(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size)
oversample_dataset_v2(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size)
oversample_dataset_v3(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter)
oversample_dataset_v4(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter, balance_ratio, success_metric)

In [None]:
# Run the experiment once per dataset registered in `data_paths`.
# The loop variable stays named `data`: a later cell reads it from the
# notebook namespace after the loop has finished.
for data in data_paths.keys():
    main(
        data,
        embedding_method=embedding_method,
        classifier_object=classifier_object,
        sim_type=sim_type,
        sim_calculation_type=sim_calculation_type,
        success_metric=success_metric,
    )

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so
# running this cell raises NameError and halts "Run All" at this point --
# presumably a deliberate barrier before the cells that follow; confirm,
# then remove or replace it.
stop

In [None]:
import numpy as np

In [None]:
# Reload one dataset at notebook level for ad-hoc inspection.
# `data` is whatever value the dataset loop above last assigned to it.
df = utilities.read_data(data_paths[data])
y = df.drop(columns=['text'])                      # every column except the raw text
X = df['text'].apply(preprocess.preprocess_text)   # preprocessed text column

In [None]:
# Index labels of the rows whose `col` column equals 1.
# NOTE(review): `col` is not assigned anywhere in the visible notebook.
y.loc[y[col].eq(1)].index