Reference articles:
https://www.sbert.net/

In [1]:
import pandas as pd
import numpy as np
import preprocess
import utilities
import similarities
import parameters
import seaborn as sns
from numpy import dot
from numpy.linalg import norm
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Settings read from the project-local `parameters` module.
# `data_paths` is indexed by dataset name further down (data_paths['reuters']).
data_paths = parameters.data_paths
# Passed through to the similarity helpers in the `similarities` module.
sim_calculation_type = parameters.sim_calculation_type
# Every embedding method to evaluate: the three collections are joined with `+`
# (presumably lists of model names -- confirm in parameters.py).
all_sentence_embeddings =  parameters.huggingface_embeddings + parameters.openai_embeddings + parameters.google_embeddings

In [4]:
def calculate_within_class(X, y, sim_calculation_type, sim_df):
    """Fill the diagonal of ``sim_df`` with within-class similarities using Ray.

    One Ray task is submitted per column of ``y``. Each task receives the rows
    of ``X`` whose label in that column equals 1 and calls
    ``similarities.calculate_similarity_within_classes`` on them.

    NOTE: a later cell defines another function with the same name (a batched,
    non-Ray variant); whichever cell ran last is the one actually used.

    Args:
        X: Vectors indexed like ``y`` (rows are selected with ``X.loc``).
        y: DataFrame with one indicator column per class (1 marks membership).
        sim_calculation_type: Forwarded unchanged to the similarity helper.
        sim_df: Class-by-class DataFrame; updated in place.

    Returns:
        The same ``sim_df`` object with ``sim_df.loc[col, col]`` populated.
    """
    import ray

    ray.init(num_cpus=3, ignore_reinit_error=True)
    try:
        @ray.remote
        def run(vecs, sim_calculation_type):
            return similarities.calculate_similarity_within_classes(vecs, sim_calculation_type)

        futures = [
            run.remote(X.loc[y[y[col] == 1].index], sim_calculation_type)
            for col in y.columns
        ]
        results = ray.get(futures)
    finally:
        # Always tear Ray down, even when a task fails, so that worker
        # processes are not left running behind a failed cell.
        ray.shutdown()

    for col, sim in zip(y.columns, results):
        sim_df.loc[col, col] = sim

    return sim_df

In [5]:
def calculate_within_class(X, y, sim_calculation_type, sim_df):
    """Fill the diagonal of ``sim_df`` with the mean within-class similarity.

    For each column of ``y`` the rows labelled 1 are split into batches of at
    most 2,000 rows. ``similarities.calculate_similarity_within_classes`` is
    called on every batch, the returned values are pooled, and their mean is
    stored in ``sim_df.loc[col, col]``.

    Because each batch is passed to the helper on its own, similarities between
    rows that land in different batches are not part of the mean when a class
    has more than 2,000 rows.

    Args:
        X: Vectors indexed like ``y`` (rows are selected with ``X.loc``).
        y: DataFrame with one indicator column per class (1 marks membership).
        sim_calculation_type: Accepted for signature compatibility but not
            used: the helper is always called with ``sim_calculation_type=None``
            (presumably so it returns raw values that can be pooled -- verify
            in the ``similarities`` module).
        sim_df: Class-by-class DataFrame; updated in place.

    Returns:
        The same ``sim_df`` object. Classes without any positive row get NaN.
    """
    max_batch_size = 2_000

    for col in y.columns:

        col_indexes = y[y[col] == 1].index

        # An empty class would ask np.array_split for 0 sections, which raises
        # ValueError; record it as "no similarity available" instead.
        if len(col_indexes) == 0:
            sim_df.loc[col, col] = np.nan
            continue

        # Ceiling division: number of batches needed to hold every row.
        batch_num = (len(col_indexes) + max_batch_size - 1) // max_batch_size

        all_sim = []
        for batch in np.array_split(col_indexes, batch_num):
            all_sim.extend(similarities.calculate_similarity_within_classes(X.loc[batch], sim_calculation_type=None))

        sim_df.loc[col, col] = np.mean(all_sim)

    return sim_df

In [6]:
def calculate_between_class(X, y, sim_df):
    """Fill the upper triangle of ``sim_df`` with between-class similarities using Ray.

    One Ray task is submitted per unordered pair of columns of ``y``. Each task
    receives the rows of ``X`` belonging to the two classes and calls
    ``similarities.calculate_similarity_between_classes`` with the module-level
    ``sim_calculation_type``.

    NOTE: a later cell defines another function with the same name (a batched,
    non-Ray variant); whichever cell ran last is the one actually used.

    Args:
        X: Vectors indexed like ``y`` (rows are selected with ``X.loc``).
        y: DataFrame with one indicator column per class (1 marks membership).
        sim_df: Class-by-class DataFrame; updated in place.

    Returns:
        The same ``sim_df`` object with ``sim_df.loc[col1, col2]`` populated for
        every pair produced by ``combinations(y.columns, 2)``. The mirrored
        cells ``[col2, col1]`` are not written.
    """
    import ray

    # Built once so task submission and result assignment share one ordering.
    class_pairs = list(combinations(y.columns, 2))
    # Row labels of each class, computed once instead of once per pair.
    class_indexes = {col: y[y[col] == 1].index for col in y.columns}

    ray.init(num_cpus=3, ignore_reinit_error=True)
    try:
        @ray.remote
        def run(vecs1, vecs2):
            return similarities.calculate_similarity_between_classes(vecs1, vecs2, sim_calculation_type)

        # Select rows with .loc, like the other similarity functions in this
        # notebook; plain X[index] would select columns if X is a DataFrame.
        futures = [
            run.remote(X.loc[class_indexes[col1]], X.loc[class_indexes[col2]])
            for col1, col2 in class_pairs
        ]
        results = ray.get(futures)
    finally:
        # Always tear Ray down, even when a task fails.
        ray.shutdown()

    for (col1, col2), sim in zip(class_pairs, results):
        sim_df.loc[col1, col2] = sim

    return sim_df

In [7]:
def calculate_between_class(X, y, sim_df):
    """Fill the upper triangle of ``sim_df`` with the mean between-class similarity.

    For every unordered pair of columns of ``y`` the rows of each class are
    split into batches of at most 2,000 rows.
    ``similarities.calculate_similarity_between_classes`` is called on every
    (batch of class 1, batch of class 2) combination, the returned values are
    pooled, and their mean is stored in ``sim_df.loc[col1, col2]``.

    Args:
        X: Vectors indexed like ``y`` (rows are selected with ``X.loc``).
        y: DataFrame with one indicator column per class (1 marks membership).
        sim_df: Class-by-class DataFrame; updated in place.

    Returns:
        The same ``sim_df`` object. Only cells ``[col1, col2]`` for pairs from
        ``combinations(y.columns, 2)`` are written; the mirrored cells are left
        untouched. Pairs where either class has no positive row get NaN.
    """
    max_batch_size = 2_000

    # Row labels of each class, computed once instead of once per pair.
    class_indexes = {col: y[y[col] == 1].index for col in y.columns}

    for col1, col2 in combinations(y.columns, 2):

        col1_indexes = class_indexes[col1]
        col2_indexes = class_indexes[col2]

        # An empty class would ask np.array_split for 0 sections, which raises
        # ValueError; record the pair as "no similarity available" instead.
        if len(col1_indexes) == 0 or len(col2_indexes) == 0:
            sim_df.loc[col1, col2] = np.nan
            continue

        # Ceiling division: number of batches needed to hold every row.
        batch_num1 = (len(col1_indexes) + max_batch_size - 1) // max_batch_size
        batch_num2 = (len(col2_indexes) + max_batch_size - 1) // max_batch_size

        # The second class's batches do not depend on the outer loop.
        batches2 = np.array_split(col2_indexes, batch_num2)

        all_sim = []
        for batch1 in np.array_split(col1_indexes, batch_num1):
            for batch2 in batches2:
                all_sim.extend(similarities.calculate_similarity_between_classes(X.loc[batch1], X.loc[batch2], sim_calculation_type=None))

        sim_df.loc[col1, col2] = np.mean(all_sim)

    return sim_df

In [8]:
def calculate_similarity_matrix(X, y, sim_method='cosine'):
    """Build the class-by-class similarity matrix for one embedding.

    An empty DataFrame labelled by ``y.columns`` on both axes is created, its
    diagonal is filled by ``calculate_within_class`` and its off-diagonal cells
    by ``calculate_between_class``.

    Args:
        X: Vectors indexed like ``y``.
        y: DataFrame with one indicator column per class.
        sim_method: Not read anywhere in this function; the module-level
            ``sim_calculation_type`` is what gets passed on.

    Returns:
        The populated similarity DataFrame.
    """
    class_labels = y.columns
    empty_matrix = pd.DataFrame(index=class_labels, columns=class_labels)

    with_diagonal = calculate_within_class(X, y, sim_calculation_type, empty_matrix)
    return calculate_between_class(X, y, with_diagonal)

In [9]:
# Checkpoint file that the evaluation loop reads from and writes back to.
res_file = 'embedding_results_reuters.p'

import pickle
# NOTE(review): opening with 'wb' replaces the file with an empty dict on every
# run of this cell, so any previously saved results are discarded and the
# "skip what is already in results" logic of the next cell starts from scratch.
# Run this only to start a fresh evaluation -- confirm that is the intent.
results = {}
with open(res_file, 'wb') as f:
    pickle.dump(results, f)

In [10]:
# Imports are done once up front instead of on every loop iteration.
import gc
import pickle

import torch

data = 'reuters'
path = data_paths[data]

# Load the dataset: the 'text' column is the input, every other column is
# treated as a label column.
df = utilities.read_data(path)
X = df['text']
y = df.drop(['text'], axis=1)
X = X.apply(preprocess.preprocess_text)

# Resume from the checkpoint: embeddings that already have an entry are skipped.
# NOTE: pickle.load can execute arbitrary code; only load checkpoint files
# produced by this notebook.
with open(res_file, 'rb') as f:
    results = pickle.load(f)

iterate = [i for i in all_sentence_embeddings if i not in results]
# The failure list is rebuilt on every run; methods that failed earlier have no
# entry in `results`, so they are part of `iterate` and get retried.
results['failed_embedings'] = []

for embedding_method in tqdm(iterate):

    try:
        X_num = utilities.vectorize_data(X, embedding_method)
        sim_df = calculate_similarity_matrix(X_num, y)
        results[embedding_method] = sim_df
        print(embedding_method, ' completed ... ')
    except Exception as e:
        # Best effort: one failing embedding must not abort the whole sweep.
        print(f'! {embedding_method} failed due to {e}... ')
        results['failed_embedings'].append(embedding_method)

    # Checkpoint after every embedding so an interrupted run can be resumed.
    with open(res_file, 'wb') as f:
        pickle.dump(results, f)

    # Drop the reference to the embedding matrix before collecting; while the
    # name is still bound, neither gc nor the CUDA cache can release it.
    X_num = None
    gc.collect()
    torch.cuda.empty_cache()

  0%|                                                                                           | 0/31 [00:05<?, ?it/s]

KeyboardInterrupt



In [None]:
# `stop` is not defined anywhere in this notebook, so evaluating it raises
# NameError -- presumably a deliberate barrier that halts "Run All" before the
# analysis cells below (confirm intent).
stop

In [None]:
# Show the embedding methods that the evaluation loop recorded as failed.
results['failed_embedings']

In [None]:
import pickle

# Reload the checkpoint written by the evaluation loop.
# NOTE: pickle.load can execute arbitrary code; only load checkpoint files
# produced by this notebook.
with open(res_file, 'rb') as f:
    results_opp115 = pickle.load(f)

# The analysis cells below read `results`, so take it from the checkpoint that
# was just loaded instead of relying on whatever is still in memory. A shallow
# copy keeps `results_opp115` exactly as loaded.
results = dict(results_opp115)

# Drop the bookkeeping entry so only embedding -> similarity-matrix pairs
# remain. The default avoids a KeyError when the key is absent.
results.pop('failed_embedings', None)

In [None]:
def calculate_matrix_score(sim_df):
    """Score each class by how far its own similarity exceeds its closest rival.

    For every column label the score is
    ``(diagonal - max of the other cells in that row) / diagonal``.

    Args:
        sim_df: Square similarity DataFrame whose index and columns share labels.

    Returns:
        List of scores, one per column, in column order.
    """
    def _relative_margin(label):
        own_similarity = sim_df.loc[label, label]
        closest_rival = sim_df.loc[label].drop(label).max()
        return (own_similarity - closest_rival) / own_similarity

    return [_relative_margin(label) for label in sim_df.columns]

best:
'paraphrase-MiniLM-L6-v2'

In [None]:
# Embedding names to leave out of the final comparison: the filtering cell
# below keeps only the entries of `results` whose key is not in this list.
removed = ['stsb-roberta-large',
           'albert-base-v2',
           'bert-base-nli-mean-tokens',
           'bert-base-uncased',
           'distilbert-base-nli-mean-tokens',
           'nlpaueb/legal-bert-base-uncased',
           'saibo/legal-roberta-base',
           'sentence-t5-large',
           'sentence-transformers/average_word_embeddings_glove.6B.300d',
           'sentence-transformers/average_word_embeddings_glove.840B.300d',
           'distiluse-base-multilingual-cased-v1',
           'multi-qa-mpnet-base-dot-v1',
           'paraphrase-mpnet-base-v2',
           'paraphrase-multilingual-MiniLM-L12-v2',
           'paraphrase-MiniLM-L6-v2',
           'paraphrase-xlm-r-multilingual-v1',
           'universal-sentence-encoder'
          ]

In [None]:
# Work on a shallow copy so `results` itself is left as it is.
res = results.copy()
# Keep only the embeddings that are not listed in `removed`.
finel_res = {
    embedding_name: similarity_matrix
    for embedding_name, similarity_matrix in res.items()
    if embedding_name not in removed
}

In [None]:
# Print max / min / average separation score for every retained embedding.
for embedding, sim_df in finel_res.items():
    # The last class is dropped: the between-class step writes only cells
    # [col1, col2] for pairs from combinations(), so the last row has no
    # off-diagonal value and its score comes out as NaN.
    scores = calculate_matrix_score(sim_df)[:-1]
    print(f'{embedding} --- max: {max(scores):.2f}, min: {min(scores):.2f}, avg: {sum(scores)/len(scores):.2f}')

In [None]:
# Draw one annotated heatmap per retained embedding, each in its own figure.
for i, (embedding, sim_df) in enumerate(finel_res.items()):

    plt.figure(i, figsize=(8, 5))
    plt.title(f'{embedding}')

    class_labels = sim_df.columns
    # Cells that were never filled are shown as 0.
    filled = sim_df.fillna(0)
    sns.heatmap(
        filled,
        annot=True,
        xticklabels=class_labels,
        yticklabels=class_labels,
        cmap="rocket_r",
    )

    plt.show()

In [None]:
# List the embedding names that remain after filtering out `removed`.
finel_res.keys()