TO DO

general implementation
    create a graph or diagram to tell whats hapenning
    mark all the tasks/steps complete, incomplete, in progress, problems, to do, research etc.
    try on a toy problem

find new datasets
    for different datasets different preprocessing techniques should be applied
    RCV1-V2
decide on splitting ratio 20 60 20 

try different similarity measures 
    reference paper
    cosine
    euclidean

implementation steps

+1. reading data and preprocessing
2. vectorization
    -2.1 embeddings - will try other embeddings, and will search which one is best for datasets
    -2.2 dimensionality reduction? (is similarity more accurate when dim. red. done)  - research
3. initial classifier to show results
4. calculate imbalance ratio and find the ratio of newly labeled data
5. oversample dataset using unlabeled set
    5.1 find the proper similarity function (eclidean, cosine etc.)
        Measurement of Text Similarity: A Survey: a very detailed survey of similarity functions that are used for text data
        https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html
        cosine similarity
        minkowski family (euclidean, manhattan)
        hamming distance
        Jaccard index
        Sorensen-dice index
        KL divergence
        Jensen–Shannon divergence with LDA
        Wasserstein distance
        SMTP 
        word mover’s distance
    5.2 define a threshold or mechanism to add data for multilabeled set
6. train a final classifier to compare results

In [1]:
import pickle
import preprocess
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
import torch
from itertools import combinations
from sentence_transformers import util
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## parameters

In [2]:
# algorithm parameters
balance_ratio = 0.5
sim_type = 'cosine'
embedding_method = '' # try different embeddings and find proper one

random_state = 1
starting_index = 100_000
np.random.seed(random_state)

majority_path = r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv'

all_columns = ['Data Retention', 'Data Security', 'Do Not Track', 'First Party Collection/Use', 
             'International and Specific Audiences', 'Introductory/Generic', 'Policy Change', 
             'Practice not covered', 'Privacy contact information', 'Third Party Sharing/Collection',
             'User Access, Edit and Deletion', 'User Choice/Control']

sub_col_names = ['Data Security', 'User Access, Edit and Deletion', 'Policy Change']

In [3]:
def read_data(path):
    df = pd.read_csv(path)
    df['text'] = df['text'].apply(preprocess.preprocess_text)
    return df

In [4]:
def vectorize_data(text, model_name='stsb-roberta-large'):
    
    from sentence_transformers import util
    from sentence_transformers import SentenceTransformer
    import torch
    
    model = SentenceTransformer(model_name)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    
    vectors = model.encode(text, convert_to_tensor=False, device=device)
    
    return vectors

In [5]:
def classifier(X_train, y_train, X_test, y_test):
    
    def calculating_class_weights(y_true):
        
        number_dim = np.shape(y_true)[1]
        weights = []
        for i in range(number_dim):
            at = compute_class_weight('balanced', [0.,1.], y_true[:, i])
            weights.append(dict(zip([0,1], at)))
            # weights[i] = compute_class_weight('balanced', [0.,1.], y_true[:, i])))
        return weights

    # class_weights = calculating_class_weights(y_train.values)
    
    # Linear SVM
    linearSvm = OneVsRestClassifier(LogisticRegression(class_weight='balanced'), n_jobs=-1)
    linearSvm.fit(X_train, y_train.values)
    linearSvm_preds = linearSvm.predict(X_test)
    
    print("\033[1m" + 'LinearSVM results: ' + "\033[0m")
    print('-'*30)
    hamLoss = hamming_loss(y_test.values, linearSvm_preds)
    print('hamLoss: {:.2f}'.format(hamLoss))
    acc_score = accuracy_score(y_test.values, linearSvm_preds)
    print('Exact Match Ratio: {:.2f}'.format(acc_score))
    print('-'*30)
    print("\033[1m" + 'Classification Report' + "\033[0m")
    print(classification_report(y_test.values, linearSvm_preds, target_names=list(y_test.columns)))

In [6]:
def calculate_imb_ratio(y):

    class_ratios = (y.sum() / y.shape[0]).values
    return class_ratios

In [7]:
def cal_balancing_num_instance_binary(n_samples, n_total_samples, balance_ratio=0.5):
    
    if n_samples/n_total_samples > balance_ratio:
        print("Be careful! Given balancing ratio is lower than the class' imbalance ratio")
        
    return int((n_total_samples*balance_ratio - n_samples)*2)

In [8]:
def cal_balancing_num_instance_multiclass(y, balance_ratio):
    
    oversampling_counts = {}
    n_samples = y.shape[0]
    n_classes = y.shape[1]
    
    for col in y.columns:
        oversampling_counts[col] = cal_balancing_num_instance_binary(y[col].sum(), n_samples, balance_ratio)
    
    return oversampling_counts

In [9]:
def cosine_similarity(vec1, vec2):
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0:
        norm1 += 0.00001
    if norm2 == 0:
        norm2 += 0.00001   
    return np.dot(vec1, vec2)/(norm1*norm2)

In [10]:
def minkowski_similarity(u, v, p=2):
    # minkowski distance is a distance measure but we need a similarity function
    if p <= 0:
        raise ValueError("p must be greater than 0")
    u_v = u - v
    dist = np.linalg.norm(u_v, ord=p)
    if dist == 0:
        dist += 0.0001
        
    return 1/dist #converting a distance to similarity

In [11]:
def vector_similarity(vec1, vec2, sim_type=sim_type):
    
    if sim_type == 'cosine':
        similarity = cosine_similarity(vec1, vec2)
    if sim_type == 'euclidean':
        similarity = minkowski_similarity(vec1, vec2, 2)
    if sim_type == 'manhattan':
        similarity = minkowski_similarity(vec1, vec2, 1)
    if sim_type == 'chebychev ':
        similarity = minkowski_similarity(vec1, vec2, np.inf)
    if sim_type.startswith('minkowski'):
        similarity = minkowski_similarity(vec1, vec2, int(sim_type[-1]))
    
    return similarity

In [12]:
def calculate_within_class_similarity(vecs, sim_type=sim_type):
    
    similarities = []
    
    for i,j in list(combinations(vecs.index, 2)):
        similarities.append(vector_similarity(vecs.loc[i], vecs.loc[j], sim_type))    
            
    try:
        avg_similarity = sum(similarities)/len(similarities)
    except AssertionErrors:
        print('Error occured')
        
    return avg_similarity 

In [13]:
def calculate_similarity_between_vector_and_class(vec, class_vecs, sim_type=sim_type):
    
    similarities = []
    
    for c_vec in class_vecs:
        similarities.append(vector_similarity(vec, c_vec, sim_type))
    
    try:
        avg_similarity = sum(similarities)/len(similarities)
    except AssertionErrors:
        print('Error occured')
        
    return avg_similarity 

In [14]:
def find_new_instances(X_labeled, X_unlabeled, class_similarity):
    
    new_instances = []
    
    for idx, instance in X_unlabeled.iteritems():
        avg_sim = calculate_similarity_between_vector_and_class(instance, X_labeled)
        if avg_sim > class_similarity:
            new_instances.append(idx)
            
    return new_instances

In [15]:
def calculate_overall_class_similarities(X, y):
    
    class_similarities = {}
    for col in y.columns:
        indexes = (y[col] == 1).index
        aa = X.loc[indexes]
        class_similarities[col] = calculate_within_class_similarity(aa) 
        
    return class_similarities

In [16]:
def find_similar_columns(instance, X_labeled, y_labeled, other_columns):
    
    other_similarities = {}
    
    for col_name in other_columns:
        
        indexes = (y_labeled[col_name] == 1).index
        
        other_similarities[col_name]  = calculate_similarity_between_vector_and_class(instance, X_labeled.loc[indexes])
    
    return other_similarities

In [17]:
def oversample_dataset(num_of_new_instances, X_labeled, y_labeled, X_unlabeled, y_unlabeled):
    
    # giving priority to mostly imbalanced classes
    num_of_new_instances = {k: v for k, v in sorted(num_of_new_instances.items(), key=lambda item: item[1], reverse=True)}
    
    class_similarities = calculate_overall_class_similarities(X_labeled, y_labeled)
    
    processed_columns = []
    
    validation = {}
    val_idx = 0
    
    for col_name, num_instance in num_of_new_instances.items():
        
        # note: we didnt use num_instance
        # the instances will be added should not exceed num_instance
        
        processed_columns.append(col_name)
        
        if num_instance == 0:
            continue
        
        indexes = (y_labeled[col_name] == 1).index
        new_instances = find_new_instances(X_labeled.loc[indexes], X_unlabeled, class_similarities[col_name])
        
        
        for instance_index in new_instances:
            
            instance_X = X_unlabeled.loc[instance_index]
            instance_y = y_unlabeled.loc[instance_index] # note: this is for test case
            
            # defining all labels as 0s
            new_labels = {c:0 for c in all_columns}
            # changing col_name's label as 1
            new_labels[col_name] = 1
            
            ### finding other labels
            other_columns = [i for i in all_columns if i not in processed_columns]
            other_similarities = find_similar_columns(instance_X, X_labeled, y_labeled, other_columns)
            for col, sim in other_similarities.items():
                if sim > class_similarities[col]:
                    new_labels[col] = 1
            
            ### appending data to unlabeled set and removing it from unlabeled set
            # starting index of new instances from a big number
            instance_new_index = max(starting_index, max(X_labeled.index)) + 1
            instance_X_series = pd.Series([instance_X], index=[instance_new_index])
            instance_new_labels =pd.DataFrame(new_labels, index=[instance_new_index])
            # adding new instance to labeled set
            X_labeled = pd.concat([X_labeled, instance_X_series])
            y_labeled = pd.concat([y_labeled, instance_new_labels])
            # removing new instance from unlabeled set
            X_unlabeled.drop(instance_index, inplace=True)
            y_unlabeled.drop(instance_index, inplace=True) # note: this is for test case
            
            # validation
            validation[val_idx] = (col_name, instance_index, instance_X, (instance_y), new_labels)
            val_idx += 1
    
    return validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled 

# main

In [18]:
# reading data
df = pd.read_csv(majority_path)
# -----------------------------------------------------------------------------------------------------------------------------
# creating a toy dataset to provide efficiency
np.random.seed(random_state)
toy_df = df[(df[all_columns].sum(axis=1)==df[sub_col_names].sum(axis=1))].sample(100, random_state=random_state)
# ------------------------------------------------------------------------------------------------------------------------------
X = toy_df['text']
y = toy_df[sub_col_names]
all_columns = sub_col_names # note: only for toy example
# ------------------------------------------------------------------------------------------------------------------------------
# reading from a pickle instead of applying vectorization
'''
X_num = X.apply(vectorize_data)
import pickle
with open('X_num.p', 'wb') as f:
    pickle.dump(X_num, f)     
'''
with open('X_num.p', 'rb') as f:
    X_num = pickle.load(f)

assert np.array_equal(X_num.index, X.index), 'read indexes doesn\'t match!'

# -----------------------------------------------------------------------------------------------------------------------------
# splitting train(labeled-unlabeled)-test
# X_num = X.apply(vectorize_data) 
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=random_state, stratify=y)
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X_train, y_train, test_size=0.9, 
                                                                  stratify=y_train, random_state=random_state)
# -----------------------------------------------------------------------------------------------------------------------------
# an initial classifier to see results before applying our method
classifier(np.vstack(X_labeled.values), y_labeled, np.vstack(X_test.values), y_test)
# -----------------------------------------------------------------------------------------------------------------------------
# calculation number of instances to balance dataset
balance_ratio = 0.5
num_of_new_instances = cal_balancing_num_instance_multiclass(y_labeled, balance_ratio)
# -----------------------------------------------------------------------------------------------------------------------------
# oversampling dataset using unlabeled data with the given ratios
validation, X_labeled, y_labeled, X_unlabeled, y_unlabeled = oversample_dataset(num_of_new_instances, 
                                                                    X_labeled, y_labeled, X_unlabeled, y_unlabeled)
# -----------------------------------------------------------------------------------------------------------------------------
# check if the result gets better
classifier(np.vstack(X_labeled.values), y_labeled, np.vstack(X_test.values), y_test)
# -----------------------------------------------------------------------------------------------------------------------------

[1mLinearSVM results: [0m
------------------------------
hamLoss: 0.03
Exact Match Ratio: 0.90
------------------------------
[1mClassification Report[0m
                                precision    recall  f1-score   support

                 Data Security       1.00      1.00      1.00         9
User Access, Edit and Deletion       1.00      0.80      0.89         5
                 Policy Change       1.00      0.83      0.91         6

                     micro avg       1.00      0.90      0.95        20
                     macro avg       1.00      0.88      0.93        20
                  weighted avg       1.00      0.90      0.94        20
                   samples avg       0.90      0.90      0.90        20



  _warn_prf(average, modifier, msg_start, len(result))


[1mLinearSVM results: [0m
------------------------------
hamLoss: 0.38
Exact Match Ratio: 0.30
------------------------------
[1mClassification Report[0m
                                precision    recall  f1-score   support

                 Data Security       0.60      1.00      0.75         9
User Access, Edit and Deletion       0.38      1.00      0.56         5
                 Policy Change       0.40      1.00      0.57         6

                     micro avg       0.47      1.00      0.63        20
                     macro avg       0.46      1.00      0.63        20
                  weighted avg       0.49      1.00      0.65        20
                   samples avg       0.57      1.00      0.69        20



In [19]:
stop

NameError: name 'stop' is not defined

In [None]:
compare_res = []
for _, value in validation.items():
    col, _, _, y_true, y_pred = value
    compare_res.append((list(y_true.values), list(y_pred.values())))

In [None]:
hamm_loss = 0
accuracy = 0
for t, p in compare_res:
    if t==p:
        hamm_loss += 1
    if t[0] == p[0]:
        accuracy +=1
    if t[1] == p[1]:
        accuracy +=1
    if t[2] == p[2]:
        accuracy +=1
print('Metrics for the proposed algorithm ')    
print(f'Hamming loss: {hamm_loss/len(compare_res):.2f} ')
print(f'Accuracy:     {accuracy/(len(compare_res)*3):.2f} ')

In [None]:
from nltk.corpus import words
all_words = words.words()

In [None]:
words_5len = [word.lower() for word in all_words if len(word)==5]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
data_vecs = vectorizer.fit_transform(data_words).toarray()

In [None]:
for word in words_5len:
    if 'h' in word and 'k' in word and 'n' in word:
        print(word)

In [None]:
from gensim.corpora.dictionary import Dictionary
common_dictionary = Dictionary(data_words)
common_corpus = [common_dictionary.doc2bow(text) for text in data_words]

In [None]:
common_corpus

In [None]:
data_vecs[0]

In [None]:
data_vecs

In [None]:
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from scipy.special import kl_div

In [None]:
for word in words_5len:
    if word.startswith('se') and 'r' in word:
        print(word)

In [None]:
letter_0 = ''
letter_1 = ''
letter_2 = 'a'
letter_3 = ''
letter_4 = ''

exist_letters = 'acs'
banned_letters = 'trdefou'

In [None]:
def KL_divergence(p, q):
        """ Compute KL divergence of two vectors, K(p || q)."""
        return sum(p[x] * log((p[x]) / (q[x])) for x in range(len(p)) if p[x] != 0.0 or p[x] != 0)

In [None]:
from numpy import zeros, array
from math import sqrt, log

In [None]:
jensenshannon(p, q)

In [None]:
filtered = [word for word in words_5len for e in exist_letters if e in word]
filtered

In [None]:
filtered = [word for word in filtered for b in banned_letters if b in word]
filtered = [word for word in filtered if letter_0 and word[0]==letter_0]
filtered = [word for word in filtered if letter_1 and word[0]==letter_1]
filtered = [word for word in filtered if letter_2 and word[0]==letter_2]
filtered = [word for word in filtered if letter_3 and word[0]==letter_3]
filtered = [word for word in filtered if letter_4 and word[0]==letter_4]
filtered

In [None]:
class JSD(object):
    def __init__(self):
        self.log2 = log(2)


    def KL_divergence(self, p, q):
        """ Compute KL divergence of two vectors, K(p || q)."""
        return sum(p[x] * log((p[x]) / (q[x])) for x in range(len(p)) if p[x] != 0.0 or p[x] != 0)

    def Jensen_Shannon_divergence(self, p, q):
        """ Returns the Jensen-Shannon divergence. """
        self.JSD = 0.0
        weight = 0.5
        average = zeros(len(p)) #Average
        for x in range(len(p)):
            average[x] = weight * p[x] + (1 - weight) * q[x]
            self.JSD = (weight * self.KL_divergence(array(p), average)) + ((1 - weight) * self.KL_divergence(array(q), average))
        return 1-(self.JSD/sqrt(2 * self.log2))

if __name__ == '__main__':
    J = JSD()
    p = [1.0/10, 9.0/10, 0]
    q = [0, 1.0/10, 9.0/10]
    p = data_vecs[0]
    q = data_vecs[1]
    print(J.Jensen_Shannon_divergence(p, q))

In [None]:
if letter_2:
    print('a')

In [None]:
for word in words_5len:
    if 'o' in word and 'u' in word and 'a' not in word and 'i' not in word and 'd' not in word:
        print(word)