In [21]:
import pandas as pd
import numpy as np
import utilities
import preprocess
from itertools import combinations

from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

In [22]:
# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune lives in this cell.

data = 'opp115'            # key into `data_paths` selecting the dataset to load
random_state = 1           # seed used for NumPy's global RNG
embedding_method = 'distilbert-base-nli-mean-tokens'  # sentence-transformers model name

# Seed only after `random_state` exists; seeding first raises NameError on a
# fresh kernel (Restart & Run All).
np.random.seed(random_state)

# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
data_paths = {'opp115'   : r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv',
              'ohsumed'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\ohsumed.csv',
              'reuters'  : r'C:\Users\IsmailKaraman\workspace\GitHub\thesis\data\Reuters21578.csv'}

In [23]:
# Load the selected dataset; it is expected to expose a 'text' column, and every
# remaining column is treated as a label column below.
df = utilities.read_data(data_paths[data])

# Label matrix: all columns except the raw text.
y = df.drop(columns=['text'])

# Cleaned text, one entry per row of `df` (same index as `y`).
X = df['text'].apply(preprocess.preprocess_text)

In [24]:
def calculate_between_class_similarities(col1, col2, X, y, sim_func=None):
    """Return the mean pairwise similarity between members of two classes.

    Every sample flagged in ``col1`` is compared with every sample flagged in
    ``col2`` and the similarities are averaged.

    Parameters
    ----------
    col1, col2 : hashable
        Column labels of ``y`` identifying the two classes.
    X : pandas.Series
        Per-sample representations, addressable with ``X.loc[label]`` using the
        index labels of ``y``.
    y : pandas.DataFrame
        Indicator matrix; a sample belongs to a class when its entry equals 1.
    sim_func : callable, optional
        ``sim_func(a, b)`` returning a numeric similarity. Defaults to
        ``similarities.vector_similarity`` from the project-local module.

    Returns
    -------
    float
        The average similarity, or NaN when either class has no members
        (instead of raising ``ZeroDivisionError``).
    """
    if sim_func is None:
        # The notebook never imports `similarities` at top level, so resolve it
        # here; otherwise the name lookup below raises NameError.
        import similarities
        sim_func = similarities.vector_similarity

    members1 = y.index[y[col1] == 1]
    members2 = y.index[y[col2] == 1]

    # NOTE(review): if a sample carries both labels it is also compared with
    # itself here — confirm that this is intended.
    sims = [sim_func(X.loc[idx1], X.loc[idx2])
            for idx1 in members1
            for idx2 in members2]

    if not sims:
        return float('nan')

    return sum(sims) / len(sims)

In [25]:
def calculate_similarity_matrix(X, y, sim_method='cosine'):
    """Build a class-by-class similarity matrix.

    The diagonal holds the within-class similarity of each label column of
    ``y``; each cell above the diagonal holds the mean between-class similarity
    of the corresponding pair of labels. Cells below the diagonal are left as
    NaN (each unordered pair is stored once).

    Parameters
    ----------
    X : pandas.Series
        Per-sample representations indexed like ``y``.
    y : pandas.DataFrame
        Indicator matrix; a sample belongs to a class when its entry equals 1.
    sim_method : str, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed and labelled by ``y.columns``.
    """
    import similarities

    # Allocate as float (NaN-filled) rather than the default object dtype, so
    # downstream numeric code (sums, fillna, plotting) does not depend on
    # pandas implicitly downcasting object columns.
    sim_df = pd.DataFrame(index=y.columns, columns=y.columns, dtype=float)

    # Diagonal: similarity among the members of each single class.
    for col in y.columns:
        indexes = y[y[col] == 1].index
        sim_df.loc[col, col] = similarities.calculate_within_class_similarity(X.loc[indexes])

    # Upper triangle: one entry per unordered pair of classes.
    for col1, col2 in combinations(y.columns, 2):
        sim_df.loc[col1, col2] = calculate_between_class_similarities(col1, col2, X, y)

    return sim_df

In [None]:
# Embed every document with the configured sentence-transformers model.
embeddings = utilities.vectorize_data(X, embedding_method)

# Keep the original index so that label-based lookups driven by `y`'s index
# (X.loc[idx]) hit the right rows even when `df` does not use a 0..n-1 index.
# Assumes vectorize_data returns one vector per row of X, in order — TODO confirm.
X_num = pd.Series([np.squeeze(i) for i in embeddings], index=X.index)
sim_df = calculate_similarity_matrix(X_num, y)

# Derive the normalisers from the matrix instead of hard-coding 12 classes and
# 66 (= 12 choose 2) class pairs, which only holds for a 12-label dataset.
sim_values = sim_df.to_numpy(dtype=float)
n_classes = sim_values.shape[0]
n_pairs = n_classes * (n_classes - 1) // 2

within_sum = np.trace(sim_values)                 # diagonal: within-class similarities
between_sum = np.nansum(sim_values) - within_sum  # filled off-diagonal cells (NaN cells skipped)

# NOTE(review): as in the original expression, the factor 100 scales only the
# within-class term — confirm this is intended rather than 100 * (within - between).
l_sum = 100*within_sum/n_classes - between_sum/n_pairs
print(f'{l_sum:.2f}')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [None]:
# The plotting libraries were never imported and `ax1` was never created, so
# this cell could not run on a fresh kernel; set both up here.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax1 = plt.subplots(figsize=(10, 8))

# Cast to float before filling so the frame is numeric regardless of how it was
# allocated; the empty cells below the diagonal are drawn as 0.
sns.heatmap(sim_df.astype(float).fillna(0), annot=True,
            xticklabels=sim_df.columns,
            yticklabels=sim_df.columns, cmap="rocket_r", ax=ax1)
ax1.set_title('Class similarity matrix')

plt.show()

In [20]:
# Build the fine-tuning set: every unordered pair of texts that share a label
# becomes a positive example.
# NOTE(review): the number of pairs grows quadratically with class size, and
# only positive pairs are generated (there are no dissimilar examples).
train_set = []

for col in y.columns:

    # Texts of all samples flagged with the current label.
    class_texts = X.loc[y.index[y[col] == 1]]

    # The label is a float: the cosine-similarity loss used for training
    # regresses against a similarity score, and an integer label produces an
    # integer label tensor.
    train_set.extend(
        InputExample(texts=list(pair), label=1.0)
        for pair in combinations(class_texts, 2)
    )

['information security has technical administrative and physical safeguards in place to help protect against unauthorized access to use or disclosure of user information we maintain under our security practices and policies access to personally identifiable information is authorized only for those who have business need for such access and sensitive records are to be retained only as long as necessary for business or legal needs and destroyed before disposal although we work hard to protect personal information that we collect and store no program is one hundred percent secure and we cannot guarantee that our safeguards will prevent every unauthorized attempt to access use or disclose personal information', 'data retention we will retain your information for the period necessary to fulfill the purposes outlined in this privacy policy unless longer retention period is required or allowed by applicable law']
['security we place priority on the security of personal information and we unde

In [None]:
# Load the pretrained encoder (alternative checkpoint: stsb-roberta-large).
# No device is forced: a hard-coded 'cuda' fails on machines without a GPU,
# whereas the library's default picks a GPU when one is usable.
model = SentenceTransformer(embedding_method)

# Shuffled mini-batches of the sentence pairs built above.
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=16)

# Loss that compares the cosine similarity of the two sentence embeddings with
# each pair's label.
train_loss = losses.CosineSimilarityLoss(model)

# One pass over the training pairs, with 100 warm-up steps.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)