In [2]:
# Name of the dataset of interest.
# NOTE(review): no other visible cell reads `data_flag`; the experiment loop
# iterates over every key of `data_paths` instead.
data_flag = "opp115"

# Number of instances per dataset (presumably row counts of the CSV files):
#   ohsumed : 23986
#   opp115  :  3399
#   reuters : 10788

In [3]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split

import preprocess
import utilities

## parameters

In [4]:
# algorithm parameters
balance_ratio = 0.5       # presumably the target class balance used when sizing the oversampling
random_state = 1          # seeds NumPy below and is passed to train_test_split in main()
threshold_factor = 1.5    # NOTE(review): not referenced by any visible cell
test_size = 0.2           # share of the data held out as the test split in main()

sim_calculation_type='average'
sim_type = 'cosine'       # passed to main(), which does not use it in the visible code
success_metric = 'col_f1-score'
embedding_method = 'distiluse-base-multilingual-cased-v1' # try different embeddings and find proper one

np.random.seed(random_state)


def resolve_data_path(name, default_path):
    """Return the CSV path for dataset ``name``.

    The environment variable ``DATA_PATH_<NAME>`` (dataset name upper-cased,
    e.g. ``DATA_PATH_OPP115``) takes precedence, so the notebook can run on a
    machine where the files live elsewhere. When the variable is not set,
    ``default_path`` is returned unchanged.
    """
    return os.environ.get('DATA_PATH_' + name.upper(), default_path)


# Default locations on the original author's machine. They are only fallbacks:
# set DATA_PATH_OPP115 / DATA_PATH_OHSUMED / DATA_PATH_REUTERS to override them.
_default_data_paths = {
    'opp115'   : r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv',
    'ohsumed'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\ohsumed.csv',
    'reuters'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\Reuters21578.csv',
}

# Same keys and key order as before; the experiment loop iterates over them.
data_paths = {name: resolve_data_path(name, default_path)
              for name, default_path in _default_data_paths.items()}

# Per-dataset value passed as `test_size` of the labeled/unlabeled split in main(),
# i.e. the share of the training data placed in the unlabeled part.
# The misspelled name is kept because main() refers to it.
unlabaled_ratios = {'opp115':0.75, 'ohsumed':0.95, 'reuters':0.9}

In [5]:
import warnings
warnings.filterwarnings("ignore")

# main

In [6]:
def main(data, embedding_method, sim_type, sim_calculation_type, success_metric):
    """Run one oversampling experiment on a single dataset and print the results.

    Steps, in order: read and preprocess the dataset, vectorize the text, split
    it into test / labeled / unlabeled parts, train a baseline multilabel
    classifier, call
    ``utilities.oversample_dataset_with_threshold_update_and_binary_checking``
    (which returns a validation record plus updated labeled and unlabeled
    sets), train the classifier again on the updated labeled set, and finally
    compare the labels stored in the validation record with the ground truth.

    Besides its arguments, the function reads the module-level settings
    ``data_paths``, ``unlabaled_ratios``, ``test_size``, ``random_state`` and
    ``balance_ratio``.

    Parameters
    ----------
    data : str
        Dataset key; indexes ``data_paths`` and ``unlabaled_ratios``.
    embedding_method : str
        Forwarded to ``utilities.vectorize_data``.
    sim_type : str
        Not used by this function; kept so existing calls keep working.
    sim_calculation_type : str
        Forwarded to the oversampling routine.
    success_metric : str
        Forwarded to ``utilities.multilabel_classifier``.

    Returns
    -------
    None
        Every result is printed.
    """
    banner = '*'*100
    rule = '-'*30

    print(banner)
    print('\x1b[1;31m'+data+'\x1b[0m')

    # reading data
    df = utilities.read_data(data_paths[data])
    X = df['text'].apply(preprocess.preprocess_text)
    y = df.drop(['text'], axis=1)

    # vectorizing the preprocessed text; each vector is squeezed and stored as
    # one element of a Series so it can go through train_test_split with y
    X_num = utilities.vectorize_data(X, embedding_method)
    X_num = pd.Series([np.squeeze(i) for i in X_num])

    X_train, X_test, y_train, y_test = train_test_split(
        X_num, y, test_size=test_size, random_state=random_state)
    # the "test" side of this second split becomes the unlabeled part
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
        X_train, y_train, test_size=unlabaled_ratios[data], random_state=random_state)

    print(X_labeled.shape, y_labeled.shape, X_unlabeled.shape, y_unlabeled.shape)

    # baseline classifier trained on the initial labeled set
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled,
                                               np.vstack(X_test), y_test,
                                               success_metric=success_metric)

    # calculation number of instances to balance dataset
    # `balance_ratio` is the module-level setting; it is deliberately not
    # re-assigned here so that editing the parameters cell takes effect.
    num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(
        y_labeled, balance_ratio, calculation_type='metric_based', s_metrics=s_metric)

    # oversampling dataset using unlabeled data with the given ratios
    print('num_of_new_instances : ',num_of_new_instances)
    validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = \
        utilities.oversample_dataset_with_threshold_update_and_binary_checking(
            num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled,
            X_test, y_test,
            sim_calculation_type=sim_calculation_type, batch_size=5, n_iter=20)

    # check if the result gets better (the classifier call is made again on the
    # updated labeled set; its return value is not needed here)
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled,
                                    np.vstack(X_test), y_test,
                                    success_metric=success_metric)

    # comparing the found labels and ground truth
    # each validation record has five fields; the 4th exposes `.values` and the
    # 5th is a mapping whose values are the assigned labels
    y_true, y_pred = [], []
    for _, _, _, y_t, y_p in validation:
        y_true.append(list(y_t.values))
        y_pred.append(list(y_p.values()))

    print(rule)
    if y_true:
        acc = 1-hamming_loss(y_true, y_pred)
        emr = accuracy_score(y_true, y_pred)
        print(f'Exact match ratio : {emr:.2f} ')
        print(f'Accuracy          : {acc:.2f} ')
    else:
        # no instance was recorded, so there is nothing to score
        print('No validation records returned; skipping label-quality metrics.')
    print(rule)

    print(banner)
    print('/'*100)
    print(banner)

In [None]:
# Run the experiment once per configured dataset, in the key order of
# `data_paths`. The loop variable `data` stays defined after the loop and is
# read by a later scratch cell.
for data in data_paths:
    main(data, embedding_method, sim_type, sim_calculation_type, success_metric)

****************************************************************************************************
[1;31mopp115[0m
(679,) (679, 12) (2040,) (2040, 12)
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
[1mMultilabel Classifier Results[0m
[1mLinearSVM[0m
------------------------------
hamLoss: 0.07
Exact Match Ratio: 0.46
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.11      0.23      0.15        13
                       Data Security       0.69      0.88      0.77        40
                        Do Not Track       0.86      1.00      0.92         6
          First Party Collection/Use       0.73      0.88      0.80       230
International and Specific Audiences       0.92      0.81      0.86        68
                Introductory/Generic       0.50      0.70      0.59        76
           

all_similarities :  {2660: 0.25911457687616346, 3170: 0.34682670176029207, 239: 0.30902634382247923, 2022: 0.41859830856323244, 2446: 0.353774688243866, 1320: 0.2746059063076973, 1037: 0.26479246258735656, 2427: 0.25345491409301757, 2954: 0.3513670742511749, 2982: 0.3693573749065399, 2020: 0.3070305022597313, 1763: 0.29841290414333344, 2367: 0.3250842997431755, 2523: 0.24849108397960662, 2377: 0.4304742467403412, 2097: 0.2832726216316223, 1228: 0.3924214565753937, 2181: 0.41213454604148864, 1893: 0.31388033032417295, 2187: 0.47870577514171603, 1703: 0.22447513103485106, 2021: 0.3141202920675278, 3002: 0.2938860440254211, 1409: 0.3904547768831253, 260: 0.3314749974012375, 1900: 0.2545049738883972, 2300: 0.4059504050016403, 3329: 0.4100243979692459, 2338: 0.24624287396669386, 1145: 0.4096904039382935, 3027: 0.2548485338687897, 1160: 0.30911231428384783, 594: 0.34082466006278994, 1261: 0.23673544585704803, 1250: 0.3471211808919907, 221: 0.3857617050409317, 1745: 0.2906762403249741, 2192: 

all_similarities :  {2660: 0.29523962712846696, 3170: 0.27160841940591735, 239: 0.17365412889436507, 2022: 0.2633699335468312, 2446: 0.2826783787459135, 1320: 0.3123387911667426, 1037: 0.24474457232281566, 2427: 0.26171259007727105, 2954: 0.3462766421337922, 2982: 0.20675246306927875, 2020: 0.17071753479346322, 1763: 0.22642215349090597, 2367: 0.30132252629846334, 2523: 0.17307123666008314, 2377: 0.3466272202009956, 2097: 0.22599568505150577, 1228: 0.3211676850914955, 2181: 0.36383865928898257, 1893: 0.2813306582781176, 2187: 0.4011164304489891, 1703: 0.2470060451887548, 2021: 0.2652457289708157, 3002: 0.17999252337419117, 1409: 0.31678197036186856, 260: 0.35962076981862384, 1900: 0.4005730937545498, 2300: 0.26832192204892635, 3329: 0.32237196698163945, 2338: 0.2524252573299843, 1145: 0.32326989496747655, 3027: 0.19773251563310623, 1160: 0.25108703241373104, 594: 0.2636994590672354, 1261: 0.3086388457256059, 1250: 0.2561389015657672, 221: 0.3464722748224934, 1745: 0.26347265078220516, 

all_similarities :  {2660: 0.2691786491445133, 3170: 0.3617707778300558, 239: 0.21711789284433639, 2022: 0.29240429401397705, 2446: 0.3141629658639431, 1320: 0.37262752040156294, 1037: 0.26668777130544186, 2427: 0.2617662101984024, 2954: 0.36505250685981344, 2982: 0.24841447433988964, 2020: 0.21423859215740645, 1763: 0.14538997398423298, 2367: 0.3144159306372915, 2523: 0.1852127533139927, 2377: 0.2823743974523885, 2097: 0.1757973287520664, 1228: 0.3139068026627813, 2181: 0.38901919179729055, 1893: 0.34547595732978414, 2187: 0.41312637126871515, 1703: 0.3448715768754482, 2021: 0.3203520200082234, 3002: 0.2141357964969107, 1409: 0.3994212139930044, 260: 0.3521425186523369, 1900: 0.22501107837472642, 2300: 0.2508657683751413, 3329: 0.36316024352397236, 2338: 0.15658742455499514, 1145: 0.36666895555598394, 3027: 0.1960561384579965, 1160: 0.24480098485946655, 594: 0.2583925984799862, 1261: 0.40502882269876345, 1250: 0.34074617975524496, 221: 0.3194456829556397, 1745: 0.29598437941500116, 21

all_similarities :  {2660: 0.2671757843407492, 3170: 0.25821599252521993, 239: 0.203775006942451, 2022: 0.31265144526958466, 2446: 0.2728902748848001, 1320: 0.17702897156899175, 1037: 0.23092477223525446, 2427: 0.2599307303130627, 2954: 0.26541333966578046, 2982: 0.23699211269617082, 2020: 0.21859173423610628, 1763: 0.3005932543426752, 2367: 0.34723187163472174, 2523: 0.1501544914022088, 2377: 0.39369371177007756, 2097: 0.17602465158017974, 1228: 0.3468095551400135, 2181: 0.3879382944231232, 1893: 0.22111798410109865, 2187: 0.34500469267057876, 1703: 0.1465492493348817, 2021: 0.22943941928446293, 3002: 0.20899969798823198, 1409: 0.3238562400514881, 260: 0.31946367808307213, 1900: 0.28722250398558874, 2300: 0.30157937257550654, 3329: 0.32514759942268334, 2338: 0.34004516441995897, 1145: 0.321417809377114, 3027: 0.23302427476892867, 1160: 0.3463823289486269, 594: 0.2562850853179892, 1261: 0.17388868009050687, 1250: 0.28904511347413064, 221: 0.3396834462819, 1745: 0.23457163774718842, 219

all_similarities :  {2660: 0.19884312897920609, 3170: 0.2688854465337649, 239: 0.233631475177938, 2022: 0.3589368307427184, 2446: 0.2752452807705084, 1320: 0.3646695879224229, 1037: 0.3250824575350709, 2427: 0.202604108069041, 2954: 0.299917623336899, 2982: 0.3016439204499738, 2020: 0.22498238050978478, 1763: 0.15987192770491723, 2367: 0.22621214382146318, 2523: 0.20106244268380616, 2377: 0.24497886252117484, 2097: 0.18798644807546921, 1228: 0.28343607495500617, 2181: 0.3087581312411452, 1893: 0.32494272812180325, 2187: 0.36409146930665187, 1703: 0.3379038409648897, 2021: 0.2582440353827934, 3002: 0.2768215030329685, 1409: 0.2999815775834228, 260: 0.26283408599357083, 1900: 0.1683408171576384, 2300: 0.3617283061963238, 3329: 0.2968999038208021, 2338: 0.13355046513407454, 1145: 0.3157334018447628, 3027: 0.18753474303648796, 1160: 0.24225998965844717, 594: 0.25263341781619475, 1261: 0.2953105507137841, 1250: 0.2690766406028646, 221: 0.26442068029348165, 1745: 0.2653272678973536, 2192: 0.

all_similarities :  {2660: 0.3460233677574929, 3170: 0.39641507286974725, 239: 0.24982863395137989, 2022: 0.3277241584468395, 2446: 0.36898639132367805, 1320: 0.22224637057552946, 1037: 0.23761020727614138, 2427: 0.2600537641568387, 2954: 0.3368429595485647, 2982: 0.23839786141476732, 2020: 0.2581109092273611, 1763: 0.255794976302918, 2367: 0.33657603758446714, 2523: 0.17456293121931402, 2377: 0.3531375914177996, 2097: 0.1488185213046505, 1228: 0.47152948664857985, 2181: 0.3901255425620586, 1893: 0.2404885030490287, 2187: 0.40403861726852175, 1703: 0.20882268987120467, 2021: 0.2797089764412413, 3002: 0.22424404608442428, 1409: 0.42340495770281933, 260: 0.3193035236698516, 1900: 0.25421976741958174, 2300: 0.2571490300779647, 3329: 0.38900695963108795, 2338: 0.23009315188577834, 1145: 0.43553685349352816, 3027: 0.368145984221012, 1160: 0.3139815498539742, 594: 0.2791126830146668, 1261: 0.22103354620172622, 1250: 0.3217059297764555, 221: 0.40168524612771705, 1745: 0.2608020844611716, 2192

all_similarities :  {2660: 0.3146702212591966, 3170: 0.31214671532313026, 239: 0.2015523557861646, 2022: 0.3186434139808019, 2446: 0.33343214547882477, 1320: 0.20244764884312946, 1037: 0.260348608593146, 2427: 0.3224928682049116, 2954: 0.2993607880237202, 2982: 0.22408577154080073, 2020: 0.1789099742968877, 1763: 0.270934230585893, 2367: 0.5284727680186431, 2523: 0.13124153316020964, 2377: 0.39928662137438853, 2097: 0.16054257303476333, 1228: 0.3530703904417654, 2181: 0.5865615479648113, 1893: 0.24360098327160812, 2187: 0.3987118642621984, 1703: 0.2255816761404276, 2021: 0.21010985014339287, 3002: 0.24416222423315048, 1409: 0.32175523806363343, 260: 0.5224778781334559, 1900: 0.324092594285806, 2300: 0.3065716115136941, 3329: 0.3444995985366404, 2338: 0.3630202089746793, 1145: 0.37131354634960495, 3027: 0.26171252119044464, 1160: 0.3203124095996221, 594: 0.28480827528983355, 1261: 0.17570871996382872, 1250: 0.3554182161887487, 221: 0.39974134139095746, 1745: 0.27548231557011604, 2192: 0

all_similarities :  {2660: 0.23752524054835777, 3170: 0.3355670789508401, 239: 0.2969541809043369, 2022: 0.4163476852121184, 2446: 0.4059556959247267, 1320: 0.308095482595869, 1037: 0.28950762350011516, 2427: 0.24261249684401462, 2954: 0.3493861743525879, 2982: 0.35332639571782704, 2020: 0.2864284170942532, 1763: 0.19561764287158243, 2367: 0.25687835405555526, 2523: 0.25874709529248446, 2377: 0.32044764804729337, 2097: 0.22766511547001633, 1228: 0.3516867491844538, 2181: 0.3565086231038377, 1893: 0.3472339204490512, 2187: 0.4508819380976461, 1703: 0.25572455919070824, 2021: 0.3471650011032014, 3002: 0.27900346002063237, 1409: 0.40138570231863774, 260: 0.27425729985525077, 1900: 0.19858128917906936, 2300: 0.3811434784306314, 3329: 0.4202691120775165, 2338: 0.15643291418516153, 1145: 0.38935889764814763, 3027: 0.2302668028737645, 1160: 0.2863564140792634, 594: 0.32156582498872605, 1261: 0.27380931800281677, 1250: 0.33126442260234745, 221: 0.33057410647790575, 1745: 0.30093367286404044, 2

all_similarities :  {2660: 0.32127436443135654, 3170: 0.29221050171502705, 239: 0.2006198382326241, 2022: 0.2869939175273838, 2446: 0.30839322430306465, 1320: 0.22034186407409864, 1037: 0.21724085188631354, 2427: 0.2789622227308051, 2954: 0.3221505033815729, 2982: 0.21003565924434825, 2020: 0.20193297745146113, 1763: 0.19885249339557928, 2367: 0.2936937864227542, 2523: 0.18650108209714808, 2377: 0.36786967766439094, 2097: 0.1693840262255278, 1228: 0.33570055303902463, 2181: 0.35955865796783876, 1893: 0.218516965493046, 2187: 0.3673888188626232, 1703: 0.19922546136738925, 2021: 0.28306545811737405, 3002: 0.17747704059303063, 1409: 0.36032780058328706, 260: 0.31582441437861014, 1900: 0.2781513003182822, 2300: 0.24795879975988946, 3329: 0.36843656703572847, 2338: 0.24594953116671792, 1145: 0.34504458949307043, 3027: 0.24708552138301834, 1160: 0.2794180854127325, 594: 0.42369478953809575, 1261: 0.24448666821137585, 1250: 0.28028339873357067, 221: 0.5025832418609282, 1745: 0.254449622583142

all_similarities :  {2660: 0.20785563090906004, 3170: 0.30037630508702834, 239: 0.2893035135869133, 2022: 0.39488129244235, 2446: 0.27359481516555606, 1320: 0.270781391786346, 1037: 0.2855821042239927, 2427: 0.22264552793645284, 2954: 0.3694487839708464, 2982: 0.3628859563746996, 2020: 0.2962490524933265, 1763: 0.20892331955786958, 2367: 0.22531675875775123, 2523: 0.2440952076821735, 2377: 0.2945099792100097, 2097: 0.17994256035665768, 1228: 0.28895514847285914, 2181: 0.32704231683001445, 1893: 0.30247181908864723, 2187: 0.3709659060594021, 1703: 0.2248148382641375, 2021: 0.2962213823114309, 3002: 0.2865087107608193, 1409: 0.39654160812783074, 260: 0.25466963272999255, 1900: 0.15739647183299735, 2300: 0.3585570533678197, 3329: 0.3501535360508633, 2338: 0.1141080820898264, 1145: 0.394740284077431, 3027: 0.20635730142219058, 1160: 0.28514612308349013, 594: 0.28497486224953544, 1261: 0.23630964244601496, 1250: 0.34673407324181316, 221: 0.3004160973962331, 1745: 0.325043626554441, 2192: 0.

| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
[1mMultilabel Classifier Results[0m
[1mLinearSVM[0m
------------------------------
hamLoss: 0.07
Exact Match Ratio: 0.46
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.11      0.23      0.15        13
                       Data Security       0.69      0.88      0.77        40
                        Do Not Track       0.86      1.00      0.92         6
          First Party Collection/Use       0.73      0.88      0.80       230
International and Specific Audiences       0.92      0.81      0.86        68
                Introductory/Generic       0.50      0.70      0.59        76
                       Policy Change       0.74      0.91      0.82        22
                Practice not covered       0.18      0.33      0.24        24
         P

In [None]:
# Deliberate halt: `stop` is not assigned in any visible cell, so evaluating it
# presumably raises NameError and keeps a "Run All" from reaching the scratch
# cells below. NOTE(review): confirm, and consider an explicit guard instead.
stop

In [None]:
import numpy as np

In [None]:
# Scratch: load one dataset at notebook level, outside of main().
# `data` is whatever value the dataset loop left behind, so this cell only
# works after that loop has run.
df = utilities.read_data(data_paths[data])
y = df.drop(columns=['text'])                        # every column except the raw text
X = df['text'].apply(preprocess.preprocess_text)     # preprocessed text column

In [None]:
# Index labels of the rows whose `col` column equals 1.
# NOTE(review): `col` is not assigned in any visible cell.
y.loc[y[col] == 1].index