In [1]:
import pandas as pd
import numpy as np
import os
import torch
import pickle
import utilities
import preprocess
import similarities
import parameters
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

In [2]:
# Dataset key: used below to look up the input file in parameters.data_paths.
data = 'opp115'

In [3]:
# Names of the sentence-embedding models to compare. Each name is handed to
# SentenceTransformer and doubles as the key under which results are stored.
embedding_methods = [
    'stsb-roberta-large',
    'paraphrase-MiniLM-L6-v2',
    'all-mpnet-base-v1',
    'all-mpnet-base-v2',
    'distiluse-base-multilingual-cased-v1',
]

In [4]:
# Seed every RNG the notebook relies on so runs are repeatable:
#   * NumPy's global state is what pandas' .sample() draws from by default;
#   * torch's generator drives DataLoader shuffling and model fine-tuning.
random_state = parameters.random_state
np.random.seed(random_state)
torch.manual_seed(random_state)  # assumes random_state is an int — TODO confirm

# Mapping from dataset key to the location read by utilities.read_data.
data_paths = parameters.data_paths

In [5]:
# Read the selected dataset, then split it into the label frame (every column
# except 'text') and the text column passed through preprocess.preprocess_text.
df = utilities.read_data(data_paths[data])
y = df.drop(columns=['text'])
X = df['text'].apply(preprocess.preprocess_text)

In [6]:
def calculate_between_class_similarities(col1, col2, X, y):
    """Return the mean pairwise similarity between the samples of two classes.

    Every sample flagged with ``1`` in ``y[col1]`` is compared with every
    sample flagged with ``1`` in ``y[col2]`` via
    ``similarities.vector_similarity``.

    Parameters
    ----------
    col1, col2
        Column labels of ``y`` naming the two classes.
    X
        Pandas object whose ``.loc`` lookup by the index labels of ``y``
        yields one vector per sample.
    y
        DataFrame of class indicator columns.

    Returns
    -------
    The arithmetic mean of all pairwise similarities, or ``nan`` when either
    class has no samples (so there is nothing to average).
    """
    # Resolve both index sets once instead of re-filtering ``y`` for the
    # second class on every outer-loop iteration.
    idxs1 = y[y[col1] == 1].index
    idxs2 = y[y[col2] == 1].index

    sims = [similarities.vector_similarity(X.loc[idx1], X.loc[idx2])
            for idx1 in idxs1
            for idx2 in idxs2]

    # An empty class would otherwise trigger a division by zero.
    if not sims:
        return np.nan

    return sum(sims) / len(sims)

In [7]:
def calculate_similarity_matrix(X, y, sim_method='cosine'):
    """Build the class-by-class similarity table for the label frame ``y``.

    The diagonal holds each class's within-class similarity, computed by
    ``similarities.calculate_within_class_similarity`` in ``'average'`` mode.
    Only the cells above the diagonal are filled with between-class
    similarities; cells below the diagonal stay NaN.

    Parameters
    ----------
    X
        Pandas object indexed like ``y`` holding one vector per sample.
    y
        DataFrame of class indicator columns (``1`` marks membership).
    sim_method
        Accepted for interface compatibility; this function does not read it.

    Returns
    -------
    DataFrame whose index and columns are both ``y.columns``.
    """
    # Relies on the notebook-level ``import similarities``.
    sim_df = pd.DataFrame(index=y.columns, columns=y.columns)

    # Diagonal: similarity among the members of each class.
    for col in y.columns:
        members = y[y[col] == 1].index
        sim_df.loc[col, col] = similarities.calculate_within_class_similarity(
            X.loc[members], 'average')

    # Upper triangle: one entry per unordered pair of classes.
    for col1, col2 in combinations(y.columns, 2):
        sim_df.loc[col1, col2] = calculate_between_class_similarities(col1, col2, X, y)

    return sim_df

In [8]:
def calculate_matrix_score(sim_df):
    """Score how far each class stands apart from the others.

    For every label in ``sim_df.columns`` the score is
    ``(within - strongest_other) / within``, where ``within`` is the diagonal
    entry for that label and ``strongest_other`` is the largest of the other
    values in the label's row.

    Returns a list with one score per column, in column order.
    """
    def _row_score(label):
        within = sim_df.loc[label, label]
        strongest_other = sim_df.loc[label].drop(label).max()
        return (within - strongest_other) / within

    return [_row_score(label) for label in sim_df.columns]

In [9]:
def vectorize_data(text, model):
    """Encode ``text`` with an already-loaded ``model`` and return the result.

    ``model.encode`` is called with ``convert_to_tensor=False`` and is pointed
    at the first CUDA device when one is available, otherwise at the CPU.
    """
    if torch.cuda.is_available():
        target_device = "cuda:0"
    else:
        target_device = "cpu"

    return model.encode(text, convert_to_tensor=False, device=target_device)

In [10]:
# Number of texts drawn from each category when create_train_data builds its
# same-class (label 1.0) training pairs.
sample_size_by_category = {
    'Data Retention': 50,
    'Data Security': 20,
    'Do Not Track': 10,
    'First Party Collection/Use': 50,
    'International and Specific Audiences': 80,
    'Introductory/Generic': 80,
    'Policy Change': 10,
    'Practice not covered': 80,
    'Privacy contact information': 60,
    'Third Party Sharing/Collection': 50,
    'User Access, Edit and Deletion': 60,
    'User Choice/Control': 60,
}

In [11]:
def create_train_data(X, y, sample_size):
    """Build ``InputExample`` sentence pairs for similarity fine-tuning.

    Positive pairs (label ``1.0``): for each class column, up to
    ``sample_size_by_category[col]`` texts of that class are sampled and every
    2-combination of the sample becomes one pair.

    Negative pairs (label ``0.0``): for each pair of class columns, up to
    ``sample_size`` texts that belong to the first class but not the second
    are zipped with up to ``sample_size`` texts that belong to the second but
    not the first; ``zip`` stops at the shorter sample.

    Requested sizes are capped at the number of available texts because
    ``Series.sample`` raises ``ValueError`` when asked for more items than
    exist (sampling is without replacement).

    Parameters
    ----------
    X
        Series of texts indexed like ``y``.
    y
        DataFrame of class indicator columns; every column label must be a key
        of the notebook-level ``sample_size_by_category`` dict.
    sample_size
        Maximum number of texts drawn per side for each negative class pair.

    Returns
    -------
    list of ``InputExample``; its length is also printed.
    """
    train_set = []

    # Same-class pairs.
    for col in y.columns:
        idxs = y[y[col] == 1].index
        n_positive = min(sample_size_by_category[col], len(idxs))
        sampled = X.loc[idxs].sample(n_positive)

        train_set.extend(InputExample(texts=list(pair), label=1.0)
                         for pair in combinations(sampled, 2))

    # Cross-class pairs, restricted to texts that carry exactly one of the two
    # labels so a pair can never share a class.
    for col1, col2 in combinations(y.columns, 2):
        idxs1 = y[(y[col1] == 1) & (y[col2] == 0)].index
        idxs2 = y[(y[col2] == 1) & (y[col1] == 0)].index
        sampled1 = X.loc[idxs1].sample(min(sample_size, len(idxs1)))
        sampled2 = X.loc[idxs2].sample(min(sample_size, len(idxs2)))

        train_set.extend(InputExample(texts=list(pair), label=0.0)
                         for pair in zip(sampled1, sampled2))

    print(len(train_set))

    return train_set

In [12]:
# Pickle file in which the per-model (before, after) similarity matrices are
# stored and from which earlier results are reloaded.
result_file_name = 'embedding_tuning_opp115-3.p'

In [13]:
# First run only: create the results file containing an empty dict so that
# the file can always be opened for reading afterwards.
if not os.path.exists(result_file_name):
    tuning_results = {}
    with open(result_file_name, 'wb') as f:
        pickle.dump(tuning_results, f)

In [14]:
# Load the results saved so far (an empty dict on the first run).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open result files that this notebook produced.
with open(result_file_name, 'rb') as f:
    tuning_results = pickle.load(f)

In [15]:
# Models still to process: skip every embedding that already has stored results.
iterate = [name for name in embedding_methods if name not in tuning_results]

In [16]:
# Use the GPU when present and fall back to the CPU otherwise, matching the
# device selection already done inside vectorize_data().
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for embedding_method in iterate:

    # Baseline: similarity matrix from the embeddings produced by
    # utilities.vectorize_data for this model name.
    X_num = utilities.vectorize_data(X, embedding_method)
    sim_df_before = calculate_similarity_matrix(X_num, y)

    # Fine-tune the model on same-class / different-class sentence pairs.
    train_set = create_train_data(X, y, sample_size=25)
    model = SentenceTransformer(embedding_method, device=device)

    train_dataloader = DataLoader(train_set, shuffle=True, batch_size=6)
    train_loss = losses.CosineSimilarityLoss(model)

    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=20, warmup_steps=50)

    # Re-embed with the tuned model. The vectors keep X's index so that the
    # label-based lookups (X.loc[...]) in calculate_similarity_matrix line up
    # with the rows of y even if the index is not 0..n-1.
    X_num_tuned = vectorize_data(X, model)
    X_num_tuned = pd.Series([np.squeeze(i) for i in X_num_tuned], index=X.index)
    sim_df_after = calculate_similarity_matrix(X_num_tuned, y)

    tuning_results[embedding_method] = (sim_df_before, sim_df_after)

    # Checkpoint after every model so an interrupted run can resume.
    with open(result_file_name, 'wb') as f:
        pickle.dump(tuning_results, f)

20395




Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

20395


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

20395


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

20395


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

20395


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3400 [00:00<?, ?it/s]

In [17]:
# NOTE(review): `stop` is not defined in any visible cell, so this line raises
# NameError. Presumably a deliberate guard that halts "Run All" before the
# exploratory cells below — confirm, and prefer an explicit mechanism.
stop

NameError: name 'stop' is not defined

In [None]:
def scoring(sim_df):
    """Print two separation summaries for a class-similarity matrix.

    Line 1: the max, min and mean of the per-class scores returned by
    ``calculate_matrix_score``, leaving out the last class.
    NOTE(review): the last class is presumably skipped because its row has no
    between-class entries when only the upper triangle is filled — confirm.

    Line 2: ``100 * diagonal_sum / n - off_diagonal_sum / pairs`` where ``n``
    is the number of classes in ``sim_df`` and ``pairs = n * (n - 1) / 2``.
    Both counts are derived from the matrix, so the summary stays correct for
    label sets that do not have exactly 12 classes (66 pairs).

    Returns ``None``.
    """
    scores = calculate_matrix_score(sim_df)[:-1]
    print(f'max: {max(scores):.2f}, min: {min(scores):.2f}, avg: {sum(scores)/len(scores):.2f}')

    n_classes = len(sim_df)
    n_pairs = n_classes * (n_classes - 1) // 2
    diag_total = np.diag(sim_df).sum()
    off_diag_total = sim_df.sum().sum() - diag_total

    l_sum = 100 * diag_total / n_classes - off_diag_total / n_pairs
    print(f'{l_sum:.2f}')

In [None]:
# For every stored model: print the scores before and after fine-tuning and
# draw the two similarity matrices side by side.
for model_name, (sim_before, sim_after) in tuning_results.items():
    print(model_name)
    print('before')
    scoring(sim_before)
    print('after')
    scoring(sim_after)

    fig, axes = plt.subplots(ncols=2, figsize=(15, 4))
    ax1, ax2 = axes

    # The matrices are assembled cell by cell, so cast to float explicitly:
    # the heatmap needs numeric data and should not depend on fillna
    # converting an object-dtype frame. Unfilled cells are drawn as 0.
    sns.heatmap(sim_before.astype(float).fillna(0), annot=True,
                xticklabels=sim_before.columns,
                yticklabels=sim_before.columns, ax=ax1, cmap="rocket_r")
    ax1.set_title(f'{model_name}: before')

    sns.heatmap(sim_after.astype(float).fillna(0), annot=True,
                xticklabels=sim_after.columns,
                yticklabels=sim_after.columns, ax=ax2, cmap="rocket_r")
    ax2.set_title(f'{model_name}: after')

    plt.show()

    print('*'*70)

In [None]:
def scoring(simdf):
    """Print two separation summaries for ``simdf``, prefixed by the model name.

    NOTE(review): this definition replaces the earlier ``scoring(sim_df)`` once
    its cell is run, and it reads the notebook-level variable ``embedding`` for
    the printed prefix, so ``embedding`` must be set before calling it.

    Line 1: ``embedding`` followed by the max, min and mean of the per-class
    scores from ``calculate_matrix_score``, leaving out the last class.
    Line 2: ``100 * diagonal_sum / n - off_diagonal_sum / pairs`` with ``n``
    the number of classes in ``simdf`` and ``pairs = n * (n - 1) / 2``.

    The statistics are computed from the ``simdf`` argument rather than from a
    notebook-level ``sim_df``.

    Returns ``None``.
    """
    scores = calculate_matrix_score(simdf)[:-1]
    print(f'{embedding} --- max: {max(scores):.2f}, min: {min(scores):.2f}, avg: {sum(scores)/len(scores):.2f}')

    n_classes = len(simdf)
    n_pairs = n_classes * (n_classes - 1) // 2
    diag_total = np.diag(simdf).sum()
    off_diag_total = simdf.sum().sum() - diag_total

    l_sum = 100 * diag_total / n_classes - off_diag_total / n_pairs
    print(f'{l_sum:.2f}')

In [None]:
# Annotated heatmap of `sim_df`; cells without a value are drawn as 0.
# NOTE(review): no visible cell assigns `sim_df` at notebook level — this cell
# depends on a variable left over in the kernel.
plt.figure(figsize=(7, 5))

sns.heatmap(
    sim_df.fillna(0),
    annot=True,
    xticklabels=sim_df.columns,
    yticklabels=sim_df.columns,
    cmap="rocket_r",
)

plt.show()

#### sample_size_by_category used for opp-115
sample_size_by_category = {'Data Retention':40, 'Data Security':20, 'Do Not Track':10,
                            'First Party Collection/Use':80, 'International and Specific Audiences':80,
                            'Introductory/Generic':80, 'Policy Change':10, 'Practice not covered':80,
                            'Privacy contact information':60, 'Third Party Sharing/Collection':60,
                            'User Access, Edit and Deletion':60, 'User Choice/Control':60}

In [None]:
# Separation summary of the tuned matrix:
#   100 * diagonal_sum / n_classes - off_diagonal_sum / n_pairs
# The class and pair counts come from the matrix itself instead of being fixed
# at 12 and 66, so the figure stays valid for other label sets.
n_classes = len(sim_df_after)
n_pairs = n_classes * (n_classes - 1) // 2
diag_total = np.diag(sim_df_after).sum()
sum_after = 100 * diag_total / n_classes - (sim_df_after.sum().sum() - diag_total) / n_pairs
print(f'{sum_after:.2f}')

In [None]:
# Annotated heatmap of the tuned similarity matrix. Tick labels are taken from
# the plotted frame itself rather than from a separate `sim_df` variable.
plt.figure(figsize=(7, 5))

# Cast to float so the heatmap receives numeric data; unfilled cells become 0.
sns.heatmap(sim_df_after.astype(float).fillna(0), annot=True,
            xticklabels=sim_df_after.columns,
            yticklabels=sim_df_after.columns, cmap="rocket_r")

plt.show()

In [None]:
# NOTE(review): `ss` is not defined in any visible cell, so running this cell
# raises NameError. Looks like a stray scratch entry — confirm it can go.
ss

In [None]:
# Column-wise sums of the label frame (per-category sample counts, assuming
# the labels are 0/1 indicators as the filters above treat them).
y.sum()

plt.figure(figsize=(7,5))

sns.heatmap(sim_df_after.fillna(0), annot=True,
xticklabels=sim_df.columns,
yticklabels=sim_df.columns, cmap="rocket_r")

plt.show()

model = SentenceTransformer(embedding_method, device='cuda') 

train_dataloader = DataLoader(train_set, shuffle=True, batch_size=6)
train_loss = losses.CosineSimilarityLoss(model)


model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=50)

X_num_tuned = vectorize_data(X, model)
X_num_tuned = pd.Series([np.squeeze(i) for i in X_num_tuned])
sim_df_after = calculate_similarity_matrix(X_num_tuned, y)

sum_after = 100*np.diag(sim_df_after).sum()/12 - (sim_df_after.sum().sum()-np.diag(sim_df_after).sum())/66
print(f'{sum_after:.2f}')  

plt.figure(figsize=(7,5))

sns.heatmap(sim_df_after.fillna(0), annot=True,
xticklabels=sim_df.columns,
yticklabels=sim_df.columns, cmap="rocket_r")

plt.show()

In [None]:
result = {}

# Use the GPU when present and fall back to the CPU otherwise, matching the
# device selection inside vectorize_data().
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): `sentence_embeddings` is not assigned in any visible cell;
# `embedding_methods` may be the intended list — confirm before running.
# `train_set` is likewise reused from an earlier cell's state.
for embedding in sentence_embeddings:

    # Load the model named by the loop variable, so each iteration tunes a
    # different model instead of reusing a leftover `embedding_method`.
    model = SentenceTransformer(embedding, device=device)

    train_dataloader = DataLoader(train_set, shuffle=True, batch_size=6)
    train_loss = losses.CosineSimilarityLoss(model)

    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=7, warmup_steps=50)

    # Keep X's index so label-based lookups in calculate_similarity_matrix
    # line up with the rows of y.
    X_num_tuned = vectorize_data(X, model)
    X_num_tuned = pd.Series([np.squeeze(i) for i in X_num_tuned], index=X.index)
    sim_df_after = calculate_similarity_matrix(X_num_tuned, y)

    result[embedding] = sim_df_after

In [None]:
# For every tuned model in `result`: print its separation summary and draw its
# similarity matrix.
for sentence, sim_df_after in result.items():
    # 100 * diagonal_sum / n_classes - off_diagonal_sum / n_pairs, with both
    # counts derived from the matrix instead of being fixed at 12 and 66.
    n_classes = len(sim_df_after)
    n_pairs = n_classes * (n_classes - 1) // 2
    diag_total = np.diag(sim_df_after).sum()
    sum_after = 100 * diag_total / n_classes - (sim_df_after.sum().sum() - diag_total) / n_pairs

    print('-'*30)
    print(sentence)

    print(f'{sum_after:.2f}')

    plt.figure(figsize=(8, 6))

    # Tick labels come from the plotted frame itself; cast to float so the
    # heatmap receives numeric data, with unfilled cells drawn as 0.
    sns.heatmap(sim_df_after.astype(float).fillna(0), annot=True,
                xticklabels=sim_df_after.columns,
                yticklabels=sim_df_after.columns, cmap="rocket_r")

    plt.show()

In [None]:
import pickle

# Persist `result`. The context manager closes the handle (flushing the data
# to disk) even if pickling raises.
# NOTE(review): the file name says "reuters" while this notebook sets
# data = 'opp115' — confirm the intended output name.
with open(b"emb_tuning_reuters.p", "wb") as filehandler:
    pickle.dump(result, filehandler)

In [None]:
# Fine-tune 'bert-base-nli-mean-tokens' on the current `train_set`.
# Use the GPU when present and fall back to the CPU otherwise, matching the
# device selection inside vectorize_data().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('bert-base-nli-mean-tokens', device=device)

train_dataloader = DataLoader(train_set, shuffle=True, batch_size=6)
train_loss = losses.CosineSimilarityLoss(model)


model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=50)

In [None]:
# Re-embed the corpus with the current `model`. The vectors keep X's index so
# label-based lookups in calculate_similarity_matrix line up with the rows of y.
X_num_tuned = vectorize_data(X, model)
X_num_tuned = pd.Series([np.squeeze(i) for i in X_num_tuned], index=X.index)
sim_df_after = calculate_similarity_matrix(X_num_tuned, y)

# 100 * diagonal_sum / n_classes - off_diagonal_sum / n_pairs, with both
# counts derived from the matrix instead of being fixed at 12 and 66.
n_classes = len(sim_df_after)
n_pairs = n_classes * (n_classes - 1) // 2
diag_total = np.diag(sim_df_after).sum()
sum_after = 100 * diag_total / n_classes - (sim_df_after.sum().sum() - diag_total) / n_pairs
print(f'{sum_after:.2f}')

In [None]:
# Annotated heatmap of the tuned similarity matrix. Tick labels are taken from
# the plotted frame itself rather than from a separate `sim_df` variable.
plt.figure(figsize=(7, 5))

# Cast to float so the heatmap receives numeric data; unfilled cells become 0.
sns.heatmap(sim_df_after.astype(float).fillna(0), annot=True,
            xticklabels=sim_df_after.columns,
            yticklabels=sim_df_after.columns, cmap="rocket_r")

plt.show()