In [1]:
import pandas as pd
import numpy as np
import preprocess
import utilities
import similarities
import parameters
import seaborn as sns
from numpy import dot
from numpy.linalg import norm
from itertools import combinations
import matplotlib.pyplot as plt

In [2]:
# Dataset name -> CSV path mapping, read from the project-local `parameters` module.
data_paths = parameters.data_paths

In [None]:
def vectorize_data(text, model_name='stsb-roberta-large'):
    """Embed a collection of texts with a sentence-embedding model.

    Parameters
    ----------
    text : iterable of str or pandas.Series
        Texts to embed. It is materialised with ``list(text)`` before being
        handed to the backend.
    model_name : str, default 'stsb-roberta-large'
        Must be listed in ``parameters.huggingface_embeddings`` (encoded
        locally with ``SentenceTransformer``) or in
        ``parameters.openai_embeddings`` (encoded through
        ``openai.Embedding.create``).

    Returns
    -------
    pandas.Series or sequence
        When ``text`` is a Series, a Series of per-text vectors that shares
        its index. Otherwise the backend result is returned as-is (the
        ``encode`` output, or a list of embeddings for the OpenAI branch).

    Raises
    ------
    ValueError
        If ``model_name`` appears in neither list.
    """
    if model_name in parameters.huggingface_embeddings:

        # Imported here so the function does not rely on a later notebook cell
        # having already executed `import torch`.
        import torch
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(model_name)
        # Prefer the first CUDA device when one is available.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        vectors = model.encode(list(text), convert_to_tensor=False, device=device)

    elif model_name in parameters.openai_embeddings:

        import openai
        import config

        # NOTE(review): `openai.Embedding.create` looks like the pre-1.0 client
        # interface -- confirm against the installed openai version.
        openai.api_key = config.openai_api_key
        vectors = openai.Embedding.create(input=list(text), engine=model_name)
        vectors = [vec['embedding'] for vec in vectors['data']]

    else:
        raise ValueError('Undefined embedding type!')

    # isinstance (rather than an exact type comparison) also re-indexes the
    # result for Series subclasses.
    if isinstance(text, pd.Series):
        vectors = pd.Series([np.squeeze(i) for i in vectors], index=text.index)

    return vectors

In [14]:
import torch

In [16]:
# Encode the texts in `X` with the "nlpaueb/legal-bert-base-uncased" checkpoint
# wrapped as a SentenceTransformer.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
# Prefer the first CUDA device when one is available, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): `X` is assigned in a cell that appears further down the notebook,
# so this cell only works after that cell has been executed.
vectors = model.encode(list(X), convert_to_tensor=False, device=device)

Some weights of the model checkpoint at C:\Users\IsmailKaraman/.cache\torch\sentence_transformers\nlpaueb_legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Show the dimensions of the encoded output.
vectors.shape

(100, 768)

In [3]:
# Display the configured dataset paths.
# NOTE(review): the rendered output contains absolute, user-specific local paths.
data_paths

{'opp115': 'C:\\Users\\IsmailKaraman\\workspace\\data\\privacy_policy_data\\OPP-115_v2\\majority.csv',
 'ohsumed': 'C:\\Users\\IsmailKaraman\\workspace\\GitHub\\thesis\\data\\ohsumed.csv',
 'reuters': 'C:\\Users\\IsmailKaraman\\workspace\\GitHub\\thesis\\data\\Reuters21578.csv'}

In [3]:
# Load the 'ohsumed' dataset and keep a 100-row random subset.
df = utilities.read_data(data_paths['ohsumed'])
# A fixed seed makes the sampled rows (and everything derived from them)
# identical on every run of the notebook.
df = df.sample(100, random_state=42)
# `text` holds the documents; every remaining column is kept in `y`
# (presumably one indicator column per class -- confirm against the CSV).
X = df['text']
y = df.drop(['text'], axis=1)
X = X.apply(preprocess.preprocess_text)

In [None]:
%%time
# Calls the project helper `utilities.vectorize_data`, not the notebook-level
# `vectorize_data` defined in this notebook.
utilities.vectorize_data(X, 'stsb-roberta-large')

In [4]:
%%time
# Same project helper as the previous timing cell, with a different model name.
utilities.vectorize_data(X, 'text-similarity-babbage-001')

12346    ralitoline reevaluation of anticonvulsant prof...
2476     delayed cyclo oxygenase blockade reduces the n...
5824     achalasia in the elderly effects of aging on c...
1959     serologic diagnosis of human brucellosis analy...
17213    vascular proliferation as an unusual cause of ...
                               ...                        
107      hyperplastic gastric polyps associated with pe...
1879     failure to deliver hepatitis vaccine confessio...
6491     globus pharyngeus long term follow up and prog...
17284    effect of hemoglobin concentration on maximal ...
20565    cost effectiveness of incremental programmes f...
Name: text, Length: 100, dtype: object

In [5]:
import tensorflow as tf
import tensorflow_hub as hub

# TensorFlow Hub handle for the Universal Sentence Encoder, version 4.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# NOTE(review): the recorded run of this cell failed with a ConnectionResetError,
# so loading presumably needs network access -- confirm.
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    """Return ``model(input)``: the loaded module applied to ``input``.

    ``model`` is looked up in the notebook globals at call time, and other
    cells in this notebook rebind ``model`` to different objects.
    """
    return model(input)

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [None]:
# Apply the loaded module to the preprocessed texts in `X`.
embed(X)

In [None]:
# TODO: find more embeddings to evaluate, e.g.:
#   - Universal Sentence Encoder + Dense NN
#   - Universal Sentence Encoder + ElasticNet

In [None]:
# NOTE(review): `download_once_pretrained_transformers` is not defined or imported
# anywhere in the visible notebook, and `AutoTokenizer` / `AutoModel` are only
# imported in a later cell -- confirm where these come from before running.
PRETRAINED_BERT_WEIGHTS = download_once_pretrained_transformers(
    "google/bert_uncased_L-4_H-256_A-4")
# The tokenizer and the model are both built from the same value.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_BERT_WEIGHTS)
model = AutoModel.from_pretrained(PRETRAINED_BERT_WEIGHTS)

In [3]:
# Every embedding model name to evaluate: the Hugging Face names followed by the
# OpenAI names (assumes both `parameters` attributes are lists -- confirm).
all_sentence_embeddings =  parameters.huggingface_embeddings + parameters.openai_embeddings

In [4]:
def calculate_between_class_similarities(col1, col2, X, y):
    """Average pairwise similarity between the members of two classes.

    Parameters
    ----------
    col1, col2 : label
        Columns of ``y``. A row belongs to a class when its value in that
        column equals 1.
    X : pandas object indexed like ``y``
        Holds one vector per row; vectors are looked up with ``X.loc``.
    y : pandas.DataFrame
        Class-membership table.

    Returns
    -------
    float
        Mean of ``similarities.vector_similarity`` over every
        (member of ``col1``, member of ``col2``) pair. NaN is returned when
        either class has no members, instead of dividing by zero.
    """
    # Resolve both member lists once instead of rebuilding the second one on
    # every iteration of the outer loop.
    members1 = y[y[col1] == 1].index
    members2 = y[y[col2] == 1].index

    # With no pairs there is nothing to average.
    if len(members1) == 0 or len(members2) == 0:
        return float('nan')

    sims = [
        similarities.vector_similarity(X.loc[idx1], X.loc[idx2])
        for idx1 in members1
        for idx2 in members2
    ]

    return sum(sims) / len(sims)

In [5]:
def calculate_similarity_matrix(X, y, sim_method='cosine', sim_calculation_type=None):
    """Build a class-by-class similarity table.

    The diagonal holds the within-class similarity of each class and the cells
    above the diagonal hold the between-class similarity of each pair. Cells
    below the diagonal are left unset (NaN).

    Parameters
    ----------
    X : pandas object indexed like ``y``
        One vector per row, looked up with ``X.loc``.
    y : pandas.DataFrame
        Class-membership table; a row belongs to a class when the value in
        that class column equals 1.
    sim_method : str, default 'cosine'
        Accepted for interface compatibility; it is not used by this function.
    sim_calculation_type : optional
        Forwarded as the second positional argument of
        ``similarities.calculate_within_class_similarity``. The traceback
        recorded in this notebook shows that the helper requires this
        argument, so callers should supply it. When left as ``None`` the
        helper is called with the vectors only, exactly as before.
        NOTE(review): the accepted values are defined by the ``similarities``
        module, which is not visible here -- confirm.

    Returns
    -------
    pandas.DataFrame
        Table indexed and labelled by ``y.columns``.
    """
    sim_df = pd.DataFrame(index=y.columns, columns=y.columns)

    for col in y.columns:

        members = y[y[col] == 1].index

        if sim_calculation_type is None:
            within = similarities.calculate_within_class_similarity(X.loc[members])
        else:
            within = similarities.calculate_within_class_similarity(
                X.loc[members], sim_calculation_type)

        sim_df.loc[col, col] = within

    # Each unordered pair is visited once, so only the upper triangle is filled.
    for col1, col2 in combinations(y.columns, 2):
        sim_df.loc[col1, col2] = calculate_between_class_similarities(col1, col2, X, y)

    return sim_df

In [6]:
# Collects one (dataset name, embedding name, similarity table) tuple per
# dataset/embedding combination.
# NOTE(review): the recorded run of this cell stopped with
# "calculate_within_class_similarity() missing 1 required positional argument:
# 'sim_calculation_type'", so the list was not fully populated.
embedding_similarities = [] 
    
for data, path in data_paths.items():    

    df = utilities.read_data(path)

    # `text` holds the documents; all remaining columns are kept in `y`.
    X = df['text']
    y = df.drop(['text'], axis=1)
    X = X.apply(preprocess.preprocess_text)

    for embedding_method in all_sentence_embeddings:

        # The whole dataset is re-embedded for every embedding name.
        X_num = utilities.vectorize_data(X, embedding_method)
        sim_df = calculate_similarity_matrix(X_num, y)

        embedding_similarities.append((data, embedding_method, sim_df))

TypeError: calculate_within_class_similarity() missing 1 required positional argument: 'sim_calculation_type'

In [None]:
# Re-key the collected results by (dataset name, embedding name) for direct lookup.
graph_dict = {}
for data, embedding_method, sim_df in embedding_similarities:
    graph_dict[(data, embedding_method)] = sim_df

In [None]:
def calculate_matrix_score(sim_df):
    """Score how far each class sits from its most similar other class.

    For every class ``c`` the score is ``(within - closest) / within``, where
    ``within`` is the diagonal entry ``sim_df.loc[c, c]`` and ``closest`` is
    the largest off-diagonal similarity that involves ``c``.

    Off-diagonal values are read from both the row and the column of ``c``.
    ``calculate_similarity_matrix`` fills only the cells above the diagonal,
    so a row-only lookup ignores every class listed before ``c`` and yields
    NaN for the last class. For a fully symmetric table the result equals the
    row-only lookup.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Square table whose index and columns carry the same labels. Values
        must be convertible to float; missing cells may be NaN.

    Returns
    -------
    list
        One score per column, in column order. A score is NaN when the class
        has no off-diagonal values at all; a zero diagonal is not guarded
        against.
    """
    # The table is typically created empty (object dtype); work on floats.
    matrix = sim_df.astype(float)

    scores = []
    for col in matrix.columns:
        within = matrix.loc[col, col]
        # Row and column together cover every pair that involves `col`;
        # dropping `col` removes the diagonal entry from both.
        related = pd.concat([matrix.loc[col], matrix[col]]).drop(col)
        scores.append((within - related.max()) / within)
    return scores

In [None]:
# Print max / min / mean separation score for every dataset and embedding.
for data in data_paths.keys():
    print(data)
    # `graph_dict` is keyed by the names in `all_sentence_embeddings`; the
    # previously used name `sentence_embeddings` is not defined in this notebook.
    for embedding_method in all_sentence_embeddings:
        sim_df = graph_dict[(data, embedding_method)]

        scores = calculate_matrix_score(sim_df)

        print(embedding_method)
        print(f'max: {max(scores)}, min: {min(scores)}, avg: {sum(scores)/len(scores)}')
    print('*'*50)

In [None]:
# Draw one annotated heatmap per (dataset, embedding) similarity table.
for data in data_paths.keys():
    # `graph_dict` is keyed by the names in `all_sentence_embeddings`; the
    # previously used name `sentence_embeddings` is not defined in this notebook.
    for embedding_method in all_sentence_embeddings:

        sim_df = graph_dict[(data, embedding_method)]

        # Create the axes explicitly: the earlier version referenced `ax1` and
        # `ax`, neither of which was ever assigned.
        fig, ax = plt.subplots()

        # The table is created empty (object dtype), so cast to float before
        # plotting; unset cells are drawn as 0.
        sns.heatmap(sim_df.astype(float).fillna(0), annot=True,
                    xticklabels=sim_df.columns,
                    yticklabels=sim_df.columns, cmap="rocket_r", ax=ax)
        ax.set_title(f'{data}, {embedding_method}')

        plt.show()
        # Release the figure so a long loop does not accumulate open figures.
        plt.close(fig)

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError -- presumably a deliberate barrier that halts "Run All" before
# the scratch cells below. Confirm the intent.
stop

In [None]:
# Imported here because the notebook's only other import of these names sits in
# a later cell.
from transformers import AutoModel, AutoTokenizer

# The stray backslash before the opening quote (a syntax error) has been removed.
# NOTE(review): the first checkpoint is loaded and then immediately replaced by
# the second assignment; only the legal-roberta model remains bound to `model`.
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("saibo/legal-roberta-base")

# The tokenizer matches the checkpoint that `model` ends up holding.
tokenizer = AutoTokenizer.from_pretrained('saibo/legal-roberta-base')

In [None]:
# Two example sentences used by the similarity experiments in the following cells.
sentence1 = 'I really love to play football'
sentence2 = 'Playing football is my passion.'

In [None]:
def transform_sentence(sentence1, sentence2, model, preprocessing=False):
    """Cosine similarity of two sentences under a SentenceTransformer model.

    Parameters
    ----------
    sentence1, sentence2 : str
        Sentences to compare.
    model : str
        Value handed to ``SentenceTransformer(...)`` to build the encoder.
    preprocessing : bool, default False
        When true, both sentences are passed through
        ``preprocess.preprocess_text`` before encoding.

    Returns
    -------
    The value returned by ``util.pytorch_cos_sim`` for the two embeddings.
    """
    # Imported here because no cell in the notebook brings `util` into scope,
    # and `SentenceTransformer` is only imported inside an unrelated cell.
    from sentence_transformers import SentenceTransformer, util

    encoder = SentenceTransformer(model)

    if preprocessing:
        # Local import on purpose: a later cell defines a function that is also
        # called `preprocess`, which replaces the module at notebook scope.
        import preprocess
        sentence1 = preprocess.preprocess_text(sentence1)
        sentence2 = preprocess.preprocess_text(sentence2)

    embedding1 = encoder.encode(sentence1, convert_to_tensor=True)
    embedding2 = encoder.encode(sentence2, convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

    return cosine_scores

In [None]:
from transformers import AutoTokenizer, AutoModel

import torch


#Mean Pooling - Take attention mask into account for correct averaging

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, weighted by the mask.

    Parameters
    ----------
    model_output : sequence
        Its first element holds the token embeddings.
    attention_mask : torch.Tensor
        Per-token weights; it is given a trailing axis and expanded to the
        shape of the token embeddings before use.

    Returns
    -------
    torch.Tensor
        Mask-weighted sum over axis 1 divided by the summed mask, with the
        divisor clamped to at least 1e-9 so an all-zero mask does not divide
        by zero.
    """
    hidden_states = model_output[0]

    # Expand the mask so it can weight every embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_total = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)

    return weighted_total / weight_total

In [None]:
# `util` is not imported by any other cell of the notebook.
from sentence_transformers import util

# Checkpoints considered for this comparison; only `model_name` is loaded below.
# NOTE(review): 'ALBERT-xlarge' and 'ALBERT-xxlarg' are copied verbatim from the
# original notes and may not be valid model identifiers -- confirm before use.
candidate_model_names = [
    'saibo/legal-roberta-base',
    'nlpaueb/legal-bert-base-uncased',
    'nlpaueb/legal-bert-small-uncased',
    'albert-base-v2',
    'ALBERT-xlarge',
    'ALBERT-xxlarg',
]
model_name = candidate_model_names[0]

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Calling the tokenizer returns the token ids together with the attention
# mask; the mask (not the ids) is what mean pooling must be weighted by.
encoded1 = tokenizer(sentence1, padding=True, truncation=True, return_tensors='pt')
encoded2 = tokenizer(sentence2, padding=True, truncation=True, return_tensors='pt')
# Kept under their original names; the second one used to overwrite the first,
# which left `text_tensor2` undefined.
text_tensor1 = encoded1['input_ids']
text_tensor2 = encoded2['input_ids']

model = AutoModel.from_pretrained(model_name)

# Inference only: no gradients are needed.
with torch.no_grad():

    output1 = model(**encoded1)
    output2 = model(**encoded2)

sentence_embeddings1 = mean_pooling(output1, encoded1['attention_mask'])
sentence_embeddings2 = mean_pooling(output2, encoded2['attention_mask'])

cosine_scores = util.pytorch_cos_sim(sentence_embeddings1, sentence_embeddings2)

print(cosine_scores.item())

print(sentence_embeddings1.shape, sentence_embeddings2.shape)


In [None]:
# Rebinds the notebook-level `model` to a SentenceTransformer built from 'stsb-roberta-large'.
model = SentenceTransformer('stsb-roberta-large')

In [None]:
# Replaces the earlier example pair; only `sentence1` differs from the previous definition.
sentence1 = 'I love to play football because I am a player'
sentence2 = 'Playing football is my passion.'

In [None]:
# `util` is not imported by any other cell of the notebook.
from sentence_transformers import util

# Similarity of the raw sentences.
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print(cosine_scores.item())

# Similarity of the same sentences after `preprocess.preprocess_text`.
# NOTE(review): a later cell defines a function named `preprocess`; once that
# cell has run, `preprocess.preprocess_text` no longer resolves to the module.
embedding1 = model.encode(preprocess.preprocess_text(sentence1), convert_to_tensor=True)
embedding2 = model.encode(preprocess.preprocess_text(sentence2), convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print(cosine_scores.item())

In [None]:
# Reference: https://medium.com/nlplanet/two-minutes-nlp-11-word-embeddings-models-you-should-know-a0581763b9a9

In [None]:
"http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-55462016000400647"

In [None]:
from re import sub
from gensim.utils import simple_preprocess
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [None]:
# Toy query and corpus for the soft-cosine example.
query_string = 'fruit and vegetables'
documents = ['cars drive on the road', 'tomatoes are actually fruit']

# Tokens that `preprocess` filters out after tokenisation.
stopwords = ['the', 'and', 'are', 'a']

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    """Clean a raw document string and split it into tokens.

    Image tags become ``image_token``, other tags and ``[img_assist ...]``
    blocks become spaces, and URLs become ``url_token``. The cleaned text is
    tokenised with ``simple_preprocess`` and tokens found in the notebook-level
    ``stopwords`` list are dropped.

    NOTE(review): this function shares its name with the ``preprocess`` module
    imported at the top of the notebook, so defining it replaces that module
    at notebook scope.
    """
    # Applied in this order: the image rule must run before the generic tag rule.
    substitutions = (
        (r'<img[^<>]+(>|$)', " image_token "),
        (r'<[^<>]+(>|$)', " "),
        (r'\[img_assist[^]]*?\]', " "),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
    )
    for pattern, replacement in substitutions:
        doc = sub(pattern, replacement, doc)

    tokens = simple_preprocess(doc, min_len=0, max_len=float("inf"))
    return [token for token in tokens if token not in stopwords]

In [None]:
# Re-assigns the same toy query and corpus that an earlier cell already defines.
query_string = 'fruit and vegetables'
documents = ['cars drive on the road', 'tomatoes are actually fruit']

In [None]:
# Alternative example: replaces the `documents` / `query_string` values assigned
# in the earlier cells.
documents = ["I like Python because I can build AI applications",
          "I like Python because I can do data analytics",
          "The cat sits on the ground",
         "The cat walks on the sidewalk"]

query_string = "I like Javascript because I can build web applications"

In [None]:
# Preprocess the documents, including the query string
# `preprocess` here is the tokenising function defined in this notebook, which
# shadows the `preprocess` module imported in the first cell.
corpus = [preprocess(document) for document in documents]
query = preprocess(query_string)

In [None]:
# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
# The term-to-term similarity index is built from the vectors loaded above.
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
# The query tokens are appended so that its terms are part of the dictionary too.
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

In [None]:
# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
# TF-IDF representation of the query's bag of words.
query_tf = tfidf[dictionary.doc2bow(query)]

# Index over the TF-IDF representations of all corpus documents.
index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

# Presumably one score per corpus document, in corpus order -- the loop below
# uses the same position to look up `documents`.
doc_similarity_scores = index[query_tf]

# Output the sorted similarity scores and documents
# argsort is ascending, so reversing it lists the highest score first.
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
for idx in sorted_indexes:
    print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')


In [None]:
# Reference: http://www.scielo.org.mx/scielo.php?script=sci_arttext&pid=S1405-55462016000400647