In [6]:
data_flag = 'opp115'

# ohsumed: 23986
# opp115 3399
# reuters 10788

ERROR! Session/line number was not unique in database. History logging moved to new session 682


In [7]:
import pickle
import pandas as pd
import numpy as np
import utilities
import preprocess
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split

## parameters

In [8]:
# algorithm parameters
balance_ratio = 0.5
random_state = 1
threshold_factor = 1.5
test_size = 0.2

sim_type = 'cosine'
success_metric = 'col_f1-score'
embedding_method = 'distiluse-base-multilingual-cased-v1' # try different embeddings and find proper one

np.random.seed(random_state)

data_paths = {'opp115'   : r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv',
              'ohsumed'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\ohsumed.csv',
              'reuters'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\Reuters21578.csv'}

unlabaled_ratios = {'opp115':0.75, 'ohsumed':0.95, 'reuters':0.9}

In [9]:
import warnings
warnings.filterwarnings("ignore")

# main

In [5]:
def main(data):
    print('*'*100)
    print('\x1b[1;31m'+data+'\x1b[0m')
    # reading data
    df = utilities.read_data(data_paths[data])
    X = df['text'].apply(preprocess.preprocess_text)
    y = df.drop(['text'], axis=1)
    # ------------------------------------------------------------------------------------------------------------------------------
    # reading from a pickle instead of applying vectorization
    X_num = utilities.vectorize_data(X, embedding_method)
    X_num = pd.Series([np.squeeze(i) for i in X_num])
    X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=test_size, random_state=random_state)
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=unlabaled_ratios[data], 
                                                                  random_state=random_state)
    
    print(X_labeled.shape, y_labeled.shape, X_unlabeled.shape, y_unlabeled.shape)
    s_metric = utilities.multilabel_classifier(np.vstack(X_labeled), y_labeled, np.vstack(X_test), y_test, 
                                               success_metric=success_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # calculation number of instances to balance dataset
    balance_ratio = 0.5
    num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio, 
                                                                                 calculation_type='metric_based', s_metric)
    # -----------------------------------------------------------------------------------------------------------------------------
    # oversampling dataset using unlabeled data with the given ratios
    '''
    
    
    validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset(num_of_new_instances, 
                                                                                              X_labeled, y_labeled,
                                                                                              X_unlabeled, y_unlabeled, 
                                                                                              X_test, y_test, 
                                                                                              sim_calculation_type='safe_interval', 
                                                                                              batch_size=5)
        
    '''    
    validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset_with_threshold_update(\
                num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, \
                                                    sim_calculation_type='safe_interval', batch_size=5)
    # -----------------------------------------------------------------------------------------------------------------------------
    # check if the result gets better
    print(X_labeled.shape, X_unlabeled.shape, X_test.shape)
    success_metric = utilities.multilabel_classifier(np.vstack(X_labeled.values), y_labeled, np.vstack(X_test.values), y_test, metric='f1')  
    
    # comparing the found labels and ground truth
    y_true, y_pred = [], []
    for _, _, _, y_t, y_p in validation:
        y_true.append(list(y_t.values))
        y_pred.append(list(y_p.values()))
    
    acc = 1-hamming_loss(y_true, y_pred)
    emr = accuracy_score(y_true, y_pred)  
    print('-'*30)
    print(f'Exact match ratio : {emr:.2f} ')
    print(f'Accuracy          : {acc:.2f} ')
    print('-'*30)
    
    print('*'*100)
    print('/'*100)
    print('*'*100)

In [7]:
from sklearn.metrics import classification_report
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
aa = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)

In [11]:
aa['macro avg']

{'precision': 0.5,
 'recall': 0.5555555555555555,
 'f1-score': 0.48888888888888893,
 'support': 5}

In [5]:
random.shuffle(list(at.keys()))

None


In [6]:
for data in data_paths.keys():
    main(data)

****************************************************************************************************
[1;31mopp115[0m
(679,) (679, 12) (2040,) (2040, 12)
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
[1mMultilabel Classifier Results[0m
[1mLinearSVM[0m
------------------------------
hamLoss: 0.07
Exact Match Ratio: 0.46
------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.11      0.23      0.15        13
                       Data Security       0.69      0.88      0.77        40
                        Do Not Track       0.86      1.00      0.92         6
          First Party Collection/Use       0.73      0.88      0.80       230
International and Specific Audiences       0.92      0.81      0.86        68
                Introductory/Generic       0.50      0.70      0.59        76
           

[1m---------------c08---------------[0m
[1m---------------c01---------------[0m
[1m---------------c21---------------[0m
[1m---------------c12---------------[0m
[1m---------------c06---------------[0m
[1m---------------c20---------------[0m
[1m---------------c10---------------[0m
Shapes --------------
(959,) (18229,)
(959,) (18229,) (4798,)
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
[1mMultilabel Classifier Results[0m
[1mLinearSVM[0m
------------------------------
hamLoss: 0.11
Exact Match Ratio: 0.19
------------------------------
[1mClassification Report[0m
              precision    recall  f1-score   support

         c01       0.41      0.65      0.50       542
         c02       0.36      0.57      0.44       250
         c05       0.36      0.64      0.46       339
         c06       0.47      0.69      0.56       596
         c08       0.60      0.73      0.66       559
         c10       0.52      0.6

ValueError: Multioutput target data is not supported with label binarization

In [None]:
stop

In [None]:
oversample_dataset_with_threshold_update(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, sim_calculation_type, batch_size)

In [None]:
def cal(x):
    
    return x + ((1-x)**2) * x

In [None]:
0.9 + 0.1*0.1*0.9

In [None]:
for i in range(10):
    x = (i+1)/10
    print(i, cal(x)/x)

In [None]:
import numpy as np

In [None]:
0.1, 0.2, 0.3, 0.4 0.7, 0.9 

x*()

In [None]:
for i in range(0,12,3 ):
    print(i)

In [None]:
a = {1:'a', 2:'b', 4:'d', 3:'c'}

In [None]:
k = list(a.keys())
k

In [None]:
from nltk.corpus import words
all_words = words.words()

In [None]:
words_5len = [word.lower() for word in all_words if len(word)==5]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
data_vecs = vectorizer.fit_transform(data_words).toarray()

In [None]:
for word in words_5len:
    if 'h' in word and 'k' in word and 'n' in word:
        print(word)

In [None]:
from gensim.corpora.dictionary import Dictionary
common_dictionary = Dictionary(data_words)
common_corpus = [common_dictionary.doc2bow(text) for text in data_words]

In [None]:
common_corpus

In [None]:
data_vecs[0]

In [None]:
data_vecs

In [None]:
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from scipy.special import kl_div

In [None]:
for word in words_5len:
    if word.startswith('se') and 'r' in word:
        print(word)

In [None]:
letter_0 = ''
letter_1 = ''
letter_2 = 'a'
letter_3 = ''
letter_4 = ''

exist_letters = 'acs'
banned_letters = 'trdefou'

In [None]:
def KL_divergence(p, q):
        """ Compute KL divergence of two vectors, K(p || q)."""
        return sum(p[x] * log((p[x]) / (q[x])) for x in range(len(p)) if p[x] != 0.0 or p[x] != 0)

In [None]:
from numpy import zeros, array
from math import sqrt, log

In [None]:
jensenshannon(p, q)

In [None]:
filtered = [word for word in words_5len for e in exist_letters if e in word]
filtered

In [None]:
filtered = [word for word in filtered for b in banned_letters if b in word]
filtered = [word for word in filtered if letter_0 and word[0]==letter_0]
filtered = [word for word in filtered if letter_1 and word[0]==letter_1]
filtered = [word for word in filtered if letter_2 and word[0]==letter_2]
filtered = [word for word in filtered if letter_3 and word[0]==letter_3]
filtered = [word for word in filtered if letter_4 and word[0]==letter_4]
filtered

In [None]:
class JSD(object):
    def __init__(self):
        self.log2 = log(2)


    def KL_divergence(self, p, q):
        """ Compute KL divergence of two vectors, K(p || q)."""
        return sum(p[x] * log((p[x]) / (q[x])) for x in range(len(p)) if p[x] != 0.0 or p[x] != 0)

    def Jensen_Shannon_divergence(self, p, q):
        """ Returns the Jensen-Shannon divergence. """
        self.JSD = 0.0
        weight = 0.5
        average = zeros(len(p)) #Average
        for x in range(len(p)):
            average[x] = weight * p[x] + (1 - weight) * q[x]
            self.JSD = (weight * self.KL_divergence(array(p), average)) + ((1 - weight) * self.KL_divergence(array(q), average))
        return 1-(self.JSD/sqrt(2 * self.log2))

if __name__ == '__main__':
    J = JSD()
    p = [1.0/10, 9.0/10, 0]
    q = [0, 1.0/10, 9.0/10]
    p = data_vecs[0]
    q = data_vecs[1]
    print(J.Jensen_Shannon_divergence(p, q))

In [None]:
if letter_2:
    print('a')

In [None]:
for word in words_5len:
    if 'o' in word and 'u' in word and 'a' not in word and 'i' not in word and 'd' not in word:
        print(word)