In [1]:
import pickle
import pandas as pd
import numpy as np
import utilities
import preprocess
import parameters

from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sentence_transformers import util, SentenceTransformer
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
warnings.filterwarnings("ignore")

## parameters

In [3]:
# importing algorithm parameters
sim_type = parameters.sim_type
random_state = parameters.random_state
test_size = parameters.test_size

# doe
balance_ratio = parameters.balance_ratio
sim_calculation_type = parameters.sim_calculation_type

success_metric = parameters.success_metric
embedding_method = parameters.embedding_method
data_paths = parameters.data_paths
X_num_paths = parameters.X_num_paths
unlabeled_ratios = parameters.unlabeled_ratios

np.random.seed(random_state)

In [4]:
# classifier_object = LinearSVC(class_weight='balanced')
classifier_object = LogisticRegression(class_weight='balanced')

In [5]:
def split_data(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=unlabeled_ratios[data], 
                                                                  random_state=random_state)
    
    return X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test

In [6]:
def split_data_KFold(X, y, cv):
    
    from sklearn.model_selection import KFold, StratifiedKFold
    
    sorted_y = y.sum().sort_values(ascending=False)
    for col, _ in zip(sorted_y.index, sorted_y):
        y.loc[y[y[col]==1].index, 'dominant_label'] = col
        
    stratify_flag = y['dominant_label']
    y.drop(['dominant_label'], axis=1, inplace=True)
    
    kf = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=True)
    
    splits = []
    for train_idx, test_idx in kf.split(X.index, stratify_flag):
        
        labeled_idx, unlabeled_idx = train_test_split(train_idx, test_size=unlabeled_ratios[data], random_state=random_state)
        
        splits.append((labeled_idx, unlabeled_idx, test_idx))
        
    return splits    

In [7]:
def read_data(data):
    # reading data
    df = utilities.read_data(data_paths[data])
    # X = df['text'].apply(preprocess.preprocess_text)
    y = df.drop(['text'], axis=1)
    # ------------------------------------------------------------------------------------------------------------------------------
    # reading from a pickle instead of applying vectorization
    # X_num = utilities.vectorize_data(X, embedding_method)
    # X_num = pd.Series([np.squeeze(i) for i in X_num])
    X = pd.read_pickle(X_num_paths[data])
    
    return X, y

In [8]:
def reduce_dimension(X, n_comp=200):
    
    X_ = np.vstack(X.values)

    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=n_comp)
    X_reduced = svd.fit_transform(X_)
    
    return pd.Series([np.squeeze(i) for i in X_reduced], index=X.index)

# main

In [9]:
def main(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter=None):
    
    print('*'*100)
    print('\x1b[1;31m'+data+'\x1b[0m')
    
    X, y = read_data(data)
    random_state=6
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=unlabeled_ratios[data], 
                                                                  random_state=random_state)
    print(y_labeled.sum())
    shape_before = X_labeled.shape[0]
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                               success_metric=success_metric,
                                               classifier_object = classifier_object, 
                                               print_results=True)
    # -----------------------------------------------------------------------------------------------------------------------------
    # calculation number of instances to balance dataset
    num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio, 
                                                                                 calculation_type='metric_based', 
                                                                                 s_metrics=s_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # oversampling dataset using unlabeled data with the given ratios
    # print('num_of_new_instances : ',num_of_new_instances)
    if oversampler_version == 'v1':
        validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v1(
                                                                        num_of_new_instances, X_labeled, y_labeled, 
                                                                        X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                        sim_calculation_type=sim_calculation_type,
                                                                        batch_size=batch_size)
    elif oversampler_version == 'v2':
        validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v2(
                                                                        num_of_new_instances, X_labeled, y_labeled, 
                                                                        X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                        sim_calculation_type=sim_calculation_type,
                                                                        batch_size=batch_size)
    elif oversampler_version == 'v3':
        validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v3(
                                                                        num_of_new_instances, X_labeled, y_labeled, 
                                                                        X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                        sim_calculation_type=sim_calculation_type,
                                                                        batch_size=batch_size, 
                                                                        n_iter=n_iter, 
                                                                        single_score=single_metric)
    elif oversampler_version == 'v4':
        validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled, metric_history = utilities.oversample_dataset_v4(
                                                                                         num_of_new_instances, 
                                                                                         X_labeled, y_labeled, 
                                                                                         X_unlabeled, y_unlabeled, 
                                                                                         X_test, y_test, 
                                                                                         sim_calculation_type=sim_calculation_type, 
                                                                                         batch_size=batch_size, 
                                                                                         n_iter=n_iter,
                                                                                         balance_ratio=balance_ratio,
                                                                                         success_metric=success_metric,
                                                                                         single_score=single_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # check if the result gets better
    shape_after = X_labeled.shape[0]
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                               success_metric=success_metric,
                                               classifier_object = classifier_object, 
                                               print_results=True)
    # comparing the found labels and ground truth
    y_true, y_pred = [], []
    for _, _, _, y_t, y_p in validation:
        y_true.append(list(y_t.values))
        y_pred.append(list(y_p.values()))
    
    acc = 1-hamming_loss(y_true, y_pred)
    emr = accuracy_score(y_true, y_pred)  
    print('-'*30)
    print(f'Shape: before {shape_before}, after {shape_after} : {shape_after-shape_before} instances added...')
    print(f'Exact match ratio : {emr:.2f} ')
    print(f'Accuracy          : {acc:.2f} ')
    print(classification_report(y_true, y_pred))
    print('-'*30)
    
    if oversampler_version == 'v4':
        return metric_history

In [10]:
def run_CV(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter=None):
    
    CV_results = []
    
    X, y = read_data(data)
    X = reduce_dimension(X)        

    splits = split_data_KFold(X, y, 5)

    for labeled_idx, unlabeled_idx, test_idx in splits:

        X_labeled = X.loc[labeled_idx]
        y_labeled = y.loc[labeled_idx]
        X_unlabeled = X.loc[unlabeled_idx]
        y_unlabeled = y.loc[unlabeled_idx]
        X_test = X.loc[test_idx]
        y_test = y.loc[test_idx]
    
    
        shape_before = X_labeled.shape[0]

        s_metric_before, initial_scores = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                                   success_metric=success_metric,
                                                   classifier_object = classifier_object, 
                                                   print_results=False, return_scores=True)
        # -----------------------------------------------------------------------------------------------------------------------------
        # calculation number of instances to balance dataset
        num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio, 
                                                                                     calculation_type='metric_based', 
                                                                                     s_metrics=s_metric_before)
        # -----------------------------------------------------------------------------------------------------------------------------
        # oversampling dataset using unlabeled data with the given ratios
        # print('num_of_new_instances : ',num_of_new_instances)
        if oversampler_version == 'v1':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v1(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size)
        elif oversampler_version == 'v2':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v2(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size)
        elif oversampler_version == 'v3':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v3(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size, 
                                                                            n_iter=n_iter, 
                                                                            single_score=single_metric)
        elif oversampler_version == 'v4':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled, metric_history = utilities.oversample_dataset_v4(
                                                                                             num_of_new_instances, 
                                                                                             X_labeled, y_labeled, 
                                                                                             X_unlabeled, y_unlabeled, 
                                                                                             X_test, y_test, 
                                                                                             sim_calculation_type=sim_calculation_type, 
                                                                                             batch_size=batch_size, 
                                                                                             n_iter=n_iter,
                                                                                             balance_ratio=balance_ratio,
                                                                                             success_metric=success_metric,
                                                                                             single_score=single_metric)
        # -----------------------------------------------------------------------------------------------------------------------------
        # check if the result gets better
        shape_after = X_labeled.shape[0]
        s_metric_after, final_scores = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                                   success_metric=success_metric,
                                                   classifier_object = classifier_object, 
                                                   print_results=False, return_scores=True)
        # comparing the found labels and ground truth
        y_true, y_pred = [], []
        for _, _, _, y_t, y_p in validation:
            y_true.append(list(y_t.values))
            y_pred.append(list(y_p.values()))

        acc = 1-hamming_loss(y_true, y_pred)
        emr = accuracy_score(y_true, y_pred)  
        '''
        print('-'*30)
        print(f'Shape: before {shape_before}, after {shape_after} : {shape_after-shape_before} instances added...')
        print(f'Exact match ratio : {emr:.2f} ')
        print(f'Accuracy          : {acc:.2f} ')
        print(classification_report(y_true, y_pred))
        print('-'*30)
        '''
        
        CV_results.append({'shape_before':shape_before,'shape_after':shape_after, 'val_accuracy':acc, 'val_exact_match':emr, 
                          's_metric_before':s_metric_before, 's_metric_after':s_metric_after, 'initial_scores':initial_scores,
                          'final_scores':final_scores, 'validation':validation, 'clf_report':classification_report(y_true, y_pred)})
        
    return CV_results

In [11]:
def main_CV(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter):
    
    CV_results = run_CV(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter)
    
    return CV_results

In [12]:
data = 'opp115'
balance_ratio = 0.5
sim_calculation_type = 'average'
single_metric = 'f1_score'
batch_size = 1
n_iter = 200

metric_history_v1 = main(data, balance_ratio, sim_calculation_type, single_metric, 'v1', batch_size, n_iter)

metric_history_v2 = main(data, balance_ratio, sim_calculation_type, single_metric, 'v2', batch_size, n_iter)

metric_history_v3 = main(data, balance_ratio, sim_calculation_type, single_metric, 'v3', batch_size, n_iter)

In [13]:
CV_res = main_CV(data, balance_ratio, sim_calculation_type, single_metric, 'v4', batch_size, 500)

Shapes --------------
(169,) (2351,)
Shapes --------------
(188,) (2172,)
Shapes --------------
(146,) (2241,)
Shapes --------------
(151,) (2208,)
Shapes --------------
(156,) (2281,)


In [14]:
for i in CV_res:
    print(i['initial_scores']['f1_score']) 
    print(i['final_scores']['f1_score'])
    print(i['shape_before'], i['shape_after'])
    print('-----')

0.5863809991931965
0.5286224103285927
135 169
-----
0.596741038512938
0.5325931521573523
135 188
-----
0.5758388097620388
0.5728683542374746
135 146
-----
0.5900775576598388
0.5507608393741729
135 151
-----
0.6111166690356505
0.5854400754170543
136 156
-----


In [15]:
'shape_before':shape_before,
'shape_after':shape_after, 
'val_accuracy':acc,
'val_exact_match':emr, 
's_metric_before':s_metric_before, 
's_metric_after':s_metric_after, 
'initial_scores':initial_scores,
'final_scores':final_scores, 
'validation':validation, 
'clf_report':classification_report

SyntaxError: illegal target for annotation (1886476715.py, line 1)

In [16]:
CV_res

[{'shape_before': 135,
  'shape_after': 169,
  'val_accuracy': 0.7181372549019608,
  'val_exact_match': 0.0,
  's_metric_before': {'Data Retention': 0.3934426229508197,
   'Data Security': 0.7356321839080461,
   'Do Not Track': 0.625,
   'First Party Collection/Use': 0.75,
   'International and Specific Audiences': 0.761904761904762,
   'Introductory/Generic': 0.5358851674641149,
   'Policy Change': 0.8070175438596493,
   'Practice not covered': 0.39999999999999997,
   'Privacy contact information': 0.5783132530120482,
   'Third Party Sharing/Collection': 0.7315914489311164,
   'User Access, Edit and Deletion': 0.2122905027932961,
   'User Choice/Control': 0.5054945054945055},
  's_metric_after': {'Data Retention': 0.1739130434782609,
   'Data Security': 0.7441860465116279,
   'Do Not Track': 0.625,
   'First Party Collection/Use': 0.742063492063492,
   'International and Specific Audiences': 0.761904761904762,
   'Introductory/Generic': 0.5196078431372549,
   'Policy Change': 0.807017

In [None]:
def arrange_results(res):
    

Shape: before 135, after 175 : 40 instances added...

### dimensionality reduction

In [None]:
def reduce_dimension(X, y, method):
    X_ = np.vstack(X.values)
    if method == 'pca':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.99)
        X_reduced = pca.fit_transform(X_)
    elif method == 'svd':
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=100)
        X_reduced = svd.fit_transform(X_)
    elif method == 'kpca':
        from sklearn.decomposition import KernelPCA
        kpca = KernelPCA(n_components=100, kernel='rbf', gamma=15, random_state=42)
        X_reduced = kpca.fit_transform(X_)
    elif method == 'tsne':
        from sklearn.manifold import TSNE
        tsne = TSNE(n_components=100, random_state=1)
        X_reduced = tsne.fit_transform(X_)
    elif method == 'isomap':
        from sklearn.manifold import Isomap
        isomap = Isomap(n_neighbors=5, n_components=100, eigen_solver='auto')
        X_reduced = isomap.fit_transform(X_)
    elif method == 'mds':
        from sklearn.manifold import MDS
        mds = MDS(n_components=100, metric=True, random_state=1)
        X_reduced = mds.fit_transform(X_)
    elif method == 'kbest':
        from sklearn.feature_selection import SelectKBest, f_classif
        kbest = SelectKBest(f_classif, k=100)
        X_reduced = kbest.fit_transform(X_, y)
    elif method == 'max-pooling':
        from skimage.measure import block_reduce
        return X.apply(lambda x: block_reduce(x, (4,), np.max))
    
    return pd.Series([np.squeeze(i) for i in X_reduced], index=X.index)

In [None]:
def reduce_dimension(X, n_comp):
    X_ = np.vstack(X.values)

    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=n_comp)
    X_reduced = svd.fit_transform(X_)
    
    return pd.Series([np.squeeze(i) for i in X_reduced], index=X.index)

In [None]:
def main(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter=None):
    
    random_state = 2
    
    for n_comp in [10, 20, 30, 50, 100, 200, 300, 500]:
        
        print('*'*100)
        print('\x1b[1;31m'+str(n_comp)+'\x1b[0m')
        X, y = read_data(data)
        try:
            X = reduce_dimension(X, n_comp)
        except Exception as e:
            print('error with ', n_comp, e)
            continue
    
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=unlabeled_ratios[data], 
                                                                      random_state=random_state)

        shape_before = X_labeled.shape[0]
        print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
        s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                                   success_metric=success_metric,
                                                   classifier_object = classifier_object, 
                                                   print_results=True)
        # -----------------------------------------------------------------------------------------------------------------------------
        # calculation number of instances to balance dataset
        num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio, 
                                                                                     calculation_type='metric_based', 
                                                                                     s_metrics=s_metric)
        # -----------------------------------------------------------------------------------------------------------------------------
        # oversampling dataset using unlabeled data with the given ratios
        # print('num_of_new_instances : ',num_of_new_instances)
        if oversampler_version == 'v1':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v1(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size)
        elif oversampler_version == 'v2':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v2(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size)
        elif oversampler_version == 'v3':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_v3(
                                                                            num_of_new_instances, X_labeled, y_labeled, 
                                                                            X_unlabeled, y_unlabeled, X_test, y_test, 
                                                                            sim_calculation_type=sim_calculation_type,
                                                                            batch_size=batch_size, 
                                                                            n_iter=n_iter, 
                                                                            single_score=single_metric)
        elif oversampler_version == 'v4':
            validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled, metric_history = utilities.oversample_dataset_v4(
                                                                                             num_of_new_instances, 
                                                                                             X_labeled, y_labeled, 
                                                                                             X_unlabeled, y_unlabeled, 
                                                                                             X_test, y_test, 
                                                                                             sim_calculation_type=sim_calculation_type, 
                                                                                             batch_size=batch_size, 
                                                                                             n_iter=n_iter,
                                                                                             balance_ratio=balance_ratio,
                                                                                             success_metric=success_metric,
                                                                                             single_score=single_metric)
        # -----------------------------------------------------------------------------------------------------------------------------
        # check if the result gets better
        shape_after = X_labeled.shape[0]
        s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                                   success_metric=success_metric,
                                                   classifier_object = classifier_object, 
                                                   print_results=True)
        # comparing the found labels and ground truth
        y_true, y_pred = [], []
        for _, _, _, y_t, y_p in validation:
            y_true.append(list(y_t.values))
            y_pred.append(list(y_p.values()))

        acc = 1-hamming_loss(y_true, y_pred)
        emr = accuracy_score(y_true, y_pred)  
        print('-'*30)
        print(f'Shape: before {shape_before}, after {shape_after} : {shape_after-shape_before} instances added...')
        print(f'Exact match ratio : {emr:.2f} ')
        print(f'Accuracy          : {acc:.2f} ')
        print(classification_report(y_true, y_pred))
        print('-'*30)
        
        print('*'*300)

    if oversampler_version == 'v4':
        return metric_history

In [None]:
dim_red_methods = ['svd', 'isomap'] # ['pca', 'svd' ,'kpca', 'tsne', 'isomap', 'mds', 'kbest', 'max-pooling']

svd

In [None]:
metric_history_v4 = main(data, balance_ratio, sim_calculation_type, single_metric, 'v4', batch_size, 200)

In [None]:
stop

In [None]:
data = 'opp115'
balance_ratio = 0.5
sim_calculation_type = 'average'
single_metric = 'f1_score'
oversampler_version = 'v4'
batch_size = 1
n_iter = 500

In [None]:
main(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter)

In [None]:
main(data, balance_ratio, sim_calculation_type, single_metric, oversampler_version, batch_size, n_iter)

In [None]:
stop

In [None]:
main('opp115', balance_ratio, sim_calculation_type, single_metric, 'v1', batch_size)

In [None]:
main('opp115', balance_ratio, sim_calculation_type,  single_metric, 'v2', batch_size)

In [None]:
main('opp115', balance_ratio, sim_calculation_type,  'coverage', 'v3', batch_size, n_iter)

In [None]:
main('opp115', balance_ratio, sim_calculation_type, 'coverage', 'v4', batch_size, n_iter)

In [None]:
ss

In [None]:
data = 'opp115'
balance_ratio = 0.5
sim_calculation_type = 'average'
single_metric = 'coverage'
oversampler_version = 'v1'
batch_size = 1
n_iter = 500

In [None]:
parameters = {
'data' : ['opp115', 'ohsumed', 'reuters'],
'balance_ratio' : [0.2, 0.5],
'sim_calculation_type' : ['average', 'safe_interval'],
'single_metric' : ['accuracy', 'f1_score', 'coverage', 'label_ranking', 'roc_auc_score', 'log_loss', 'average_precision',
                   'brier_loss', 'hamming_loss', 'precision', 'recall', 'zero_one_loss', 'label_ranking_average_precision'],
'oversampler_version' : ['v1', 'v2', 'v3', 'v4'],
'batch_size' : [1,3,5],
'n_iter' : [100, 1000],
}

In [None]:
replication_size = 5

In [None]:
for data in parameters['data']:
    for balance_ratio in parameters['balance_ratio']:
        for sim_calculation_type in parameters['sim_calculation_type']:
            for single_metric in parameters['single_metric']:
                for oversampler_version in parameters['oversampler_version']:
                    for batch_size in parameters['batch_size']:
                        for n_iter in parameters['n_iter']:
                            
                            results = []
                            param_list = [data, balance_ratio, sim_calculation_type, single_metric, oversampler_version,
                                         batch_size, n_iter]
                            
                            for i in range(replication_size):
                                results.append(main(data, balance_ratio, sim_calculation_type, single_metric, 
                                                    oversampler_version, batch_size, n_iter))
                            

In [None]:
num_of_new_instances = {'a':20, 'b':30, 'c':50}
n_iter = 200

In [None]:
n_iter_dist = {k:int(n_iter*v/sum(num_of_new_instances.values())) for k,v in num_of_new_instances.items()}

In [None]:
n_iter_dist

In [None]:
main('opp115', embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric, 'label-ranking')

In [None]:
asdas

In [None]:
# oversampling methods

oversample_dataset_v2(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size)
oversample_dataset_v3(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter)
oversample_dataset_v4(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size, n_iter, balance_ratio, success_metric)

In [None]:
for data in data_paths.keys():
    main(data, embedding_method, classifier_object, sim_type, sim_calculation_type, success_metric)

In [None]:
stop

In [None]:
import numpy as np

In [None]:
df = utilities.read_data(data_paths[data])
X = df['text'].apply(preprocess.preprocess_text)
y = df.drop(['text'], axis=1)

In [None]:
y[y[col] == 1].index