TO DO

general implementation
    create a graph or diagram to show what's happening
    mark all the tasks/steps complete, incomplete, in progress, problems, to do, research etc.
    try on a toy problem

find new datasets
    for different datasets different preprocessing techniques should be applied
    RCV1-V2
decide on splitting ratio 20 60 20 

try different similarity measures 
    reference paper
    cosine
    euclidean

implementation steps

+1. reading data and preprocessing
2. vectorization
    -2.1 embeddings - will try other embeddings, and will search which one is best for datasets
    -2.2 dimensionality reduction? (is similarity more accurate when dim. red. done)  - research
3. initial classifier to show results
4. calculate imbalance ratio and find the ratio of newly labeled data
5. oversample dataset using unlabeled set
    5.1 find the proper similarity function (euclidean, cosine etc.)
        Measurement of Text Similarity: A Survey: a very detailed survey of similarity functions that are used for text data
        https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html
        cosine similarity
        minkowski family (euclidean, manhattan)
        hamming distance
        Jaccard index
        Sorensen-dice index
        KL divergence
        Jensen–Shannon divergence with LDA
        Wasserstein distance
        SMTP 
        word mover’s distance
    5.2 define a threshold or mechanism to add data for multilabeled set
6. train a final classifier to compare results

In [1]:
import pickle
import pandas as pd
import numpy as np
import utilities
from sklearn.model_selection import train_test_split

## parameters

In [2]:
# algorithm parameters
# balance_ratio is handed to utilities.calculate_balancing_num_instance_multiclass in the main cell
balance_ratio = 0.5
# one seed shared by numpy, DataFrame.sample and both train_test_split calls
random_state = 1
# NOTE(review): sim_type is not referenced by any cell visible in this notebook export
sim_type = 'cosine'
embedding_method = 'stsb-roberta-large' # try different embeddings and find proper one

np.random.seed(random_state)

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists
majority_path = r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv'

# label columns of the dataset; the main cell sums them row-wise to select toy samples
all_columns = ['Data Retention', 'Data Security', 'Do Not Track', 'First Party Collection/Use', 
             'International and Specific Audiences', 'Introductory/Generic', 'Policy Change', 
             'Practice not covered', 'Privacy contact information', 'Third Party Sharing/Collection',
             'User Access, Edit and Deletion', 'User Choice/Control']

# label subset used as the targets of the toy experiment
sub_col_names = ['Data Security', 'User Access, Edit and Deletion', 'Policy Change']

# main

In [3]:
# End-to-end experiment: load data, build a toy subset, load cached embeddings,
# split into labeled / unlabeled / test, train a baseline classifier, oversample
# the labeled set from the unlabeled pool, then train again to compare.

# reading data
df = utilities.read_data(majority_path)
# -----------------------------------------------------------------------------------------------------------------------------
# creating a toy dataset to test method
# Keep rows whose label sum over all columns equals the sum over sub_col_names
# (i.e. no label outside the subset, assuming 0/1 indicator columns), then sample 100.
np.random.seed(random_state)
toy_df = df[(df[all_columns].sum(axis=1)==df[sub_col_names].sum(axis=1))].sample(100, random_state=random_state)
X = toy_df['text']
y = toy_df[sub_col_names]
# NOTE(review): this rebinding makes the filter above a no-op if the cell is re-run
# without re-running the parameter cell first.
all_columns = sub_col_names # note: only for toy example
# Full-dataset alternative (disabled while testing on the toy set):
#   X = df['text']
#   y = df[all_columns]
# ------------------------------------------------------------------------------------------------------------------------------
# reading from a pickle instead of applying vectorization
# To regenerate the cache (disabled). Extra arguments for Series.apply must go
# through `args=`; presumably vectorize_data accepts the embedding name -- confirm:
#   X_num = X.apply(utilities.vectorize_data, args=(embedding_method,))
#   with open('X_num.p', 'wb') as f:
#       pickle.dump(X_num, f)
# NOTE: pickle.load executes arbitrary code; only load cache files produced locally.
with open('X_num_toy.p', 'rb') as f:
    X_num = pickle.load(f)

# the cached vectors must line up row-for-row with the sampled texts
assert np.array_equal(X_num.index, X.index), 'read indexes doesn\'t match!'

# -----------------------------------------------------------------------------------------------------------------------------
# splitting train(labeled-unlabeled)-test
# 20% is held out for testing; 90% of the remaining training data is treated as unlabeled
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=random_state, stratify=y)
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=0.9, 
                                                                  stratify=y_train, random_state=random_state)
# -----------------------------------------------------------------------------------------------------------------------------
# an initial classifier to see results before applying our method
print(X_labeled.shape, y_labeled.shape, X_unlabeled.shape, y_unlabeled.shape)
# each Series element is one embedding vector; vstack turns them into a 2-D matrix
utilities.classifier(np.vstack(X_labeled.values), y_labeled, np.vstack(X_test.values), y_test)
# -----------------------------------------------------------------------------------------------------------------------------
# calculating the number of instances needed to balance the dataset
# (uses balance_ratio from the parameter cell instead of re-assigning it here)
num_of_new_instances = utilities.calculate_balancing_num_instance_multiclass(y_labeled, balance_ratio)
# -----------------------------------------------------------------------------------------------------------------------------
# oversampling dataset using unlabeled data with the given ratios
validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = utilities.oversample_dataset(num_of_new_instances, 
                                                                    X_labeled, y_labeled, X_unlabeled, y_unlabeled)
# -----------------------------------------------------------------------------------------------------------------------------
# check if the result gets better
print(X_labeled.shape, y_labeled.shape, X_unlabeled.shape, y_unlabeled.shape)
utilities.classifier(np.vstack(X_labeled.values), y_labeled, np.vstack(X_test.values), y_test)
# -----------------------------------------------------------------------------------------------------------------------------

(8,) (8, 3) (72,) (72, 3)
[1mLinearSVM results: [0m
------------------------------
hamLoss: 0.07
Exact Match Ratio: 0.85
------------------------------
[1mClassification Report[0m
                                precision    recall  f1-score   support

                 Data Security       1.00      1.00      1.00         9
User Access, Edit and Deletion       1.00      0.60      0.75         5
                 Policy Change       0.83      0.83      0.83         6

                     micro avg       0.94      0.85      0.89        20
                     macro avg       0.94      0.81      0.86        20
                  weighted avg       0.95      0.85      0.89        20
                   samples avg       0.85      0.85      0.85        20



  _warn_prf(average, modifier, msg_start, len(result))


(55,) (55, 3) (25,) (25, 3)
[1mLinearSVM results: [0m
------------------------------
hamLoss: 0.45
Exact Match Ratio: 0.15
------------------------------
[1mClassification Report[0m
                                precision    recall  f1-score   support

                 Data Security       0.47      1.00      0.64         9
User Access, Edit and Deletion       0.42      1.00      0.59         5
                 Policy Change       0.38      1.00      0.55         6

                     micro avg       0.43      1.00      0.60        20
                     macro avg       0.42      1.00      0.59        20
                  weighted avg       0.43      1.00      0.60        20
                   samples avg       0.49      1.00      0.63        20



In [4]:
# One (true labels, predicted labels) pair per validation entry. Each entry is a
# 5-tuple whose last two items are the true labels (read through `.values`) and
# the predictions (a mapping, read through `.values()`).
compare_res = [
    (list(y_true.values), list(y_pred.values()))
    for _col, _, _, y_true, y_pred in validation.values()
]

In [7]:
# Metrics for the labels assigned by the oversampling step.
#   emr      -> number of samples whose whole label vector matches exactly
#   accuracy -> number of individual label decisions that match
# Works for any number of labels per sample (previously hard-coded to 3).
emr = 0
accuracy = 0
num_label_decisions = 0
for t, p in compare_res:
    if t == p:
        emr += 1
    accuracy += sum(1 for t_i, p_i in zip(t, p) if t_i == p_i)
    num_label_decisions += len(t)

# report NaN instead of raising ZeroDivisionError when nothing was validated
emr_ratio = emr / len(compare_res) if compare_res else float('nan')
accuracy_ratio = accuracy / num_label_decisions if num_label_decisions else float('nan')

print('Metrics for the proposed algorithm ')
print(f'Exact match ratio : {emr_ratio:.2f} ')
print(f'Accuracy          : {accuracy_ratio:.2f} ')

Metrics for the proposed algorithm 
Exact match ratio : 0.06 
Accuracy          : 0.41 


In [6]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError (see the
# recorded output); presumably a deliberate barrier so the scratch cells below are not run.
stop

NameError: name 'stop' is not defined

In [None]:
# Load the word list from NLTK's `words` corpus.
# NOTE(review): presumably requires the corpus to be downloaded beforehand -- confirm.
from nltk.corpus import words
all_words = words.words()

In [None]:
# lower-cased copies of every corpus entry that is exactly five characters long
words_5len = [entry.lower() for entry in all_words if len(entry) == 5]

In [None]:
# Fit a TF-IDF vectorizer with default settings and keep the result as a dense array.
# NOTE(review): `data_words` is not defined in any cell visible in this export.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
data_vecs = vectorizer.fit_transform(data_words).toarray()

In [None]:
# print every five-letter word that contains all of 'h', 'k' and 'n'
for word in words_5len:
    if set('hkn').issubset(word):
        print(word)

In [None]:
# Build a gensim Dictionary from data_words and convert every document to bag-of-words.
# NOTE(review): `data_words` is not defined in any cell visible in this export; the same
# name is also fed to TfidfVectorizer elsewhere -- confirm that one input format suits both.
from gensim.corpora.dictionary import Dictionary
common_dictionary = Dictionary(data_words)
common_corpus = [common_dictionary.doc2bow(text) for text in data_words]

In [None]:
# display the bag-of-words corpus
common_corpus

In [None]:
# display the first row of the dense TF-IDF array
data_vecs[0]

In [None]:
# display the whole dense TF-IDF array
data_vecs

In [None]:
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from scipy.special import kl_div

In [None]:
# print every five-letter word that begins with 'se' and contains an 'r'
for word in words_5len:
    if word[:2] == 'se' and word.find('r') != -1:
        print(word)

In [None]:
# Constraints for the five-letter word search.
# letter_N is presumably the known letter at position N ('' when unknown) -- confirm
# against the filtering cell, which consumes these values.
letter_0 = ''
letter_1 = ''
letter_2 = 'a'
letter_3 = ''
letter_4 = ''

# letters tested for membership in candidate words by the filtering cells
exist_letters = 'acs'
# letters named as banned; consumed by the filtering cell
banned_letters = 'trdefou'

In [None]:
def KL_divergence(p, q):
    """Compute the Kullback-Leibler divergence K(p || q) of two vectors.

    Terms with p[x] == 0 contribute nothing and are skipped. When p[x] is
    non-zero but q[x] is zero the divergence is unbounded, so ``inf`` is
    returned instead of raising ZeroDivisionError.

    Args:
        p: indexable sequence of probabilities.
        q: indexable sequence of probabilities, at least as long as ``p``.

    Returns:
        The sum of p[x] * log(p[x] / q[x]) over the non-zero entries of ``p``
        (natural logarithm); 0 for an empty ``p``.

    Raises:
        IndexError: if ``q`` is shorter than ``p`` at a non-zero entry of ``p``.
    """
    # local import keeps the function usable regardless of notebook cell order
    from math import log

    total = 0
    for x, p_x in enumerate(p):
        if p_x == 0:
            continue
        q_x = q[x]
        if q_x == 0:
            return float('inf')
        total += p_x * log(p_x / q_x)
    return total

In [None]:
from numpy import zeros, array
from math import sqrt, log

In [None]:
# SciPy's jensenshannon applied to p and q, for comparison with the hand-written JSD class.
# NOTE(review): p and q are only assigned in the JSD demo cell, which must have run first.
jensenshannon(p, q)

In [None]:
# Keep the words that contain every letter in exist_letters.
# (The nested `for e in exist_letters` kept words matching ANY letter and
# repeated each word once per matching letter.)
filtered = [word for word in words_5len if all(e in word for e in exist_letters)]
filtered

In [None]:
# Drop every candidate that contains a banned letter. (The previous nested loop
# did the opposite: it kept only words WITH banned letters, once per match.)
filtered = [word for word in filtered if not any(b in word for b in banned_letters)]

# Apply the known-position constraints. An empty letter_N means "unknown" and
# must not filter anything; each letter is checked at its own index rather
# than always at index 0.
for position, letter in enumerate((letter_0, letter_1, letter_2, letter_3, letter_4)):
    if letter:
        filtered = [word for word in filtered if word[position] == letter]
filtered

In [None]:
class JSD(object):
    """Jensen-Shannon based score between two discrete distributions."""

    def __init__(self):
        # natural log of 2, used to scale the divergence in the final score
        self.log2 = log(2)

    def KL_divergence(self, p, q):
        """Compute KL divergence of two vectors, K(p || q).

        Entries with p[x] == 0 are skipped. If p[x] is non-zero while q[x]
        is zero the divergence is unbounded and ``inf`` is returned.
        """
        total = 0
        for x in range(len(p)):
            if p[x] == 0:
                continue
            if q[x] == 0:
                return float('inf')
            total += p[x] * log(p[x] / q[x])
        return total

    def Jensen_Shannon_divergence(self, p, q):
        """Return 1 - JSD(p, q) / sqrt(2 * ln 2).

        JSD is the equal-weight Jensen-Shannon divergence (natural log),
        also stored on ``self.JSD``. Identical inputs therefore score 1.0.
        NOTE(review): despite the name, the value returned is this rescaled
        score, not the raw divergence.

        Args:
            p: sequence of probabilities.
            q: sequence of probabilities, same length as ``p``.
        """
        weight = 0.5
        average = zeros(len(p))  # mixture distribution m = w*p + (1-w)*q
        for x in range(len(p)):
            average[x] = weight * p[x] + (1 - weight) * q[x]
        # Computed once, after the mixture is complete. Doing this inside the
        # loop above re-evaluated both KL terms against a half-filled mixture
        # on every iteration (quadratic work, division-by-zero warnings).
        self.JSD = (weight * self.KL_divergence(array(p), average)) + ((1 - weight) * self.KL_divergence(array(q), average))
        return 1-(self.JSD/sqrt(2 * self.log2))

# Demo run of the JSD class.
if __name__ == '__main__':
    J = JSD()
    # toy distributions; NOTE(review): both are overwritten just below, so they are never used
    p = [1.0/10, 9.0/10, 0]
    q = [0, 1.0/10, 9.0/10]
    # score the first two rows of data_vecs against each other
    p = data_vecs[0]
    q = data_vecs[1]
    print(J.Jensen_Shannon_divergence(p, q))

In [None]:
# scratch check: prints 'a' whenever letter_2 is a non-empty string
if letter_2:
    print('a')

In [None]:
# print every five-letter word that has both 'o' and 'u' but none of 'a', 'i', 'd'
for word in words_5len:
    if set('ou').issubset(word) and set('aid').isdisjoint(word):
        print(word)