# Setup

In [9]:
import pandas as pd
import numpy as np
from google.colab import files

In [10]:
#uploaded = files.upload()

In [11]:
import warnings
warnings.filterwarnings("ignore")
import io

# Load Data

In [236]:
df=pd.read_csv('txt_adidas.csv')
df['patterns'] = df.txt
#df['patterns'] = df.MsgBody

In [237]:
len(df.groupby(["Celebrity","author","patterns"]).size())

17489

In [238]:
df.shape

(54070, 10)

In [239]:
df.Celebrity.unique()

array(['BlackPink', 'Naeun Son', 'Kerwin Frost', 'Beyonce',
       'Karlie Kloss', 'Yara Shahidi', 'Pharrell Williams',
       'Adriene Mishler', 'Ninjas Hyper', 'Bad Bunny', 'Jerry Lorenzo',
       'Chinae Alexander', 'Ally Love', 'Zoe Saldana'], dtype=object)

In [240]:
df.Celebrity.value_counts()

BlackPink            11507
Naeun Son             7751
Bad Bunny             5338
Ninjas Hyper          4742
Karlie Kloss          4587
Beyonce               3491
Yara Shahidi          3269
Kerwin Frost          2817
Pharrell Williams     2395
Jerry Lorenzo         2325
Chinae Alexander      1912
Zoe Saldana           1840
Ally Love             1776
Adriene Mishler        320
Name: Celebrity, dtype: int64

In [241]:
df.isnull().sum()

Unnamed: 0       1840
Celebrity           0
author              0
txt                 0
id              52230
subreddit       52230
Date            52230
Score           52230
num_comments    52230
patterns            0
dtype: int64

# Clean data

In [242]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
import string
from textblob import Word
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [243]:
#df=df.groupby(["Celebrity","txt"]).size().reset_index(name="freq")
#df['patterns'] = df.txt

In [244]:
stop = stopwords.words('english')
spam_msg = ['bot', 'action', 'performed', 'automatically','please', 'contact', 'moderator']
df['patterns'] = df['patterns'].apply(lambda x:' '.join(x.lower() for x in x.split()))
df['patterns'] = df['patterns'].apply(lambda x:' '.join(x for x in x.split() if x not in string.punctuation)) #remove punctuations
df['patterns'] = df['patterns'].str.replace('https*\S+','')  #remove url
df['patterns'] = df['patterns'].str.replace('\'\w+','')      #remove ticks
df['patterns'] = df['patterns'].str.replace('[^\w\s]','')    
df['patterns'] = df['patterns'].str.replace('@\S+','')       #remove email
df['patterns'] = df['patterns'].str.encode('ascii', 'ignore').str.decode("utf-8")  #remove unicode
df['patterns'] = df['patterns'].str.replace('\w*\d+\w*','')  #remove digits
df['patterns'] = df['patterns'].str.replace('#\S+','')  #remove hashtag
df['patterns'] = df['patterns'].str.replace('_','')  #remove underscore
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if  not x.isdigit()))
df['patterns'] = df['patterns'].apply(lambda x:' '.join(x for x in x.split() if not x in stop)) #remove stop words
df['patterns'] = df['patterns'].apply(lambda x:' '.join(x for x in x.split() if not x in spam_msg)) #remove stop words
df['patterns'] = df['patterns'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [245]:
col=["Celebrity","author","patterns"]
df_merge = df[col].groupby('Celebrity')['patterns'].apply(lambda x:x.str.cat(sep=" "))
df_merge

Celebrity
Adriene Mishler      hii im vicky let workout buddy stay motivated ...
Ally Love            would dismiss appropriation claim people take ...
Bad Bunny            think alexa bliss nikki cross bigger feud rand...
Beyonce              beyonce new soda pop obnoxious way write note ...
BlackPink            deleted camera suck noticed since album dday v...
Chinae Alexander     thats compliment thats compliment thats compli...
Jerry Lorenzo        podcast uploaded come music fashion hard see k...
Karlie Kloss         event year love shit  linger like bad perfume ...
Kerwin Frost         people say design really mean innovation one t...
Naeun Son            bully compare irene scandal allegation true lo...
Ninjas Hyper           thoguht hyper came thouht glider already cam...
Pharrell Williams    working hard say selfish want want loud clear ...
Yara Shahidi         would pretty cool set prequel hook would prett...
Zoe Saldana          first half worried bit everything open back st

In [246]:
for index, value in df_merge.items():
    print(f"Index : {index} \ {len(value.split())}")

Index : Adriene Mishler \ 2732
Index : Ally Love \ 22588
Index : Bad Bunny \ 63270
Index : Beyonce \ 33453
Index : BlackPink \ 122500
Index : Chinae Alexander \ 53803
Index : Jerry Lorenzo \ 26475
Index : Karlie Kloss \ 41476
Index : Kerwin Frost \ 17262
Index : Naeun Son \ 114808
Index : Ninjas Hyper \ 66346
Index : Pharrell Williams \ 24827
Index : Yara Shahidi \ 35389
Index : Zoe Saldana \ 1401576


In [247]:
documents_df=pd.DataFrame(df_merge.values,columns=['documents'])
documents_df.index = df_merge.index

# Define Cosine Similarity

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [25]:
def most_similar(doc_id,similarity_matrix,matrix,topn=5):
  #the default value of topn is 5. It has to be in the range of the dataframe's length
    n=topn+1
    print (f'Document: {documents_df.iloc[doc_id]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[0:n]:
        if ix==doc_id:
          continue
        print (f'Celebrity: {documents_df.index[ix]}',":",similarity_matrix[doc_id][ix])

# Tfidf with cosine similarity

https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630

In [300]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [301]:
numwords = 0
for i in documents_df.documents:
  numwords += len(i.split())

numwords

2026505

In [302]:
feature = []
for i in documents_df.documents:
  feature.append(i.split())
flat_list = [item for items in feature for item in items]
feature=len(set(flat_list))
feature

38027

In [303]:
# removing special characters and stop words from the text
#stop_words_l=stopwords.words('english')
import math
max_feature = math.ceil(feature*0.1)
tfidfvectoriser=TfidfVectorizer(strip_accents="ascii",max_features=max_feature,max_df = 0.2)
tfidfvectoriser.fit(documents_df.documents)
tfidf_vectors=tfidfvectoriser.transform(documents_df.documents)

In [304]:
tfidf_vectors.shape

(14, 3803)

In [305]:
tfidf_vectors=tfidf_vectors.toarray()

In [306]:
pairwise_similarities1=np.dot(tfidf_vectors,tfidf_vectors.T)
pairwise_differences1=euclidean_distances(tfidf_vectors)

In [307]:
pairwise_similarities1.shape

(14, 14)

In [308]:
for i in range(len(df_merge)):
    print(i,":",df_merge.index[i])

0 : Adriene Mishler
1 : Ally Love
2 : Bad Bunny
3 : Beyonce
4 : BlackPink
5 : Chinae Alexander
6 : Jerry Lorenzo
7 : Karlie Kloss
8 : Kerwin Frost
9 : Naeun Son
10 : Ninjas Hyper
11 : Pharrell Williams
12 : Yara Shahidi
13 : Zoe Saldana


In [309]:
most_similar(1,pairwise_similarities1,'Cosine Similarity',2)

Document: documents    would dismiss appropriation claim people take ...
Name: Ally Love, dtype: object


Similar Documents:
Celebrity: Zoe Saldana : 0.0685005633868196
Celebrity: BlackPink : 0.06543147367402181


In [310]:
most_similar(5,pairwise_similarities1,'Cosine Similarity',2)

Document: documents    thats compliment thats compliment thats compli...
Name: Chinae Alexander, dtype: object


Similar Documents:
Celebrity: Zoe Saldana : 0.0929840719531326
Celebrity: Ninjas Hyper : 0.05695826462396872


# Word2vec + TFIDF with cosine similarity

In [311]:
pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


In [312]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim
from gensim.models import Word2Vec

In [313]:
# tokenize and pad every document to make them of the same size
tokenizer=Tokenizer()
tokenizer.fit_on_texts(documents_df.documents)
tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=max_feature,padding='post')
vocab_size=len(tokenizer.word_index)+1

In [314]:
#### train word2vec
data=[]
for i in documents_df.documents:
    li = i.split()
    data.append(li)

This Google Developers blog post says:

Well, the following "formula" provides a general rule of thumb about the number of embedding dimensions:

embedding_dimensions =  number_of_categories**0.25

That is, the embedding vector dimension should be the 4th root of the number of categories.

Interestingly, the Word2vec Wikipedia article says (emphasis mine):

Nevertheless, for skip-gram models trained in medium size corpora, with 50 dimensions, a window size of 15 and 10 negative samples seems to be a good parameter setting.

Assuming a standard-ish sized vocabulary of 1.5 million words, this rule of thumb comes surprisingly close:

50 == 1.5e6 ** 0.2751

Parameters:
https://radimrehurek.com/gensim/models/word2vec.html

In [315]:
### calculate the vector_size
def flatten(data):
    return " ".join([str(item) for var in data for item in var])

num_words = 0
for item in data:
    num_words += len(item)
num_words

2026505

In [316]:
import math
dim_size = num_words**0.25
vector_size = math.ceil(float(dim_size))
vector_size

38

In [317]:
from gensim.models.callbacks import CallbackAny2Vec

class MonitorCallback(CallbackAny2Vec):
    def __init__(self, test_words):
        self.epoch = 1
        self._test_words = test_words
        self.loss_to_be_subed = 0

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_to_be_subed
        self.loss_to_be_subed = loss
        print("Model loss:", loss_now)  # print loss
        #for word in self._test_words:  # show wv logic changes
        #print(model.wv.most_similar(self._test_words,topn=1))
        self.epoch += 1

In [318]:
# Create skip-gram model
model = gensim.models.Word2Vec(data,
                                min_count = 10, 
                                vector_size = vector_size, 
                                window = 15, 
                                negative= 10,
                                sg=1,
                                ns_exponent=-0.5,
                                sample = 1e-5,
                                compute_loss=True)
callback = MonitorCallback(["adidas"])
#model1.build_vocab(data)
model.train(data, total_examples=model.corpus_count,epochs=130,callbacks=[callback],compute_loss=True)

Epoch: 1	Model loss: 1059346.625
Epoch: 2	Model loss: 961139.875
Epoch: 3	Model loss: 880501.25
Epoch: 4	Model loss: 773903.75
Epoch: 5	Model loss: 714547.0
Epoch: 6	Model loss: 664804.0
Epoch: 7	Model loss: 686457.0
Epoch: 8	Model loss: 650222.5
Epoch: 9	Model loss: 649544.0
Epoch: 10	Model loss: 625038.0
Epoch: 11	Model loss: 631661.5
Epoch: 12	Model loss: 605808.5
Epoch: 13	Model loss: 570105.0
Epoch: 14	Model loss: 588233.0
Epoch: 15	Model loss: 574117.0
Epoch: 16	Model loss: 550492.0
Epoch: 17	Model loss: 549920.0
Epoch: 18	Model loss: 544657.0
Epoch: 19	Model loss: 567465.0
Epoch: 20	Model loss: 611049.0
Epoch: 21	Model loss: 545428.0
Epoch: 22	Model loss: 564248.0
Epoch: 23	Model loss: 569189.0
Epoch: 24	Model loss: 568573.0
Epoch: 25	Model loss: 533160.0
Epoch: 26	Model loss: 561214.0
Epoch: 27	Model loss: 475196.0
Epoch: 28	Model loss: 446794.0
Epoch: 29	Model loss: 475322.0
Epoch: 30	Model loss: 469020.0
Epoch: 31	Model loss: 444544.0
Epoch: 32	Model loss: 452870.0
Epoch: 33	

(15735658, 263445650)

In [319]:
model_w2v = model.wv

In [320]:
model_w2v.most_similar("kpop",topn=10)

[('blackpink', 0.7609332799911499),
 ('debut', 0.7472858428955078),
 ('variable', 0.7254600524902344),
 ('lisas', 0.7237074375152588),
 ('solo', 0.7141395807266235),
 ('rat', 0.7126165628433228),
 ('auckland', 0.707947313785553),
 ('checking', 0.706265926361084),
 ('bp', 0.7056664228439331),
 ('syncing', 0.7026147842407227)]

In [321]:
model_w2v.most_similar("adidas",topn=10)

[('synths', 0.7627847790718079),
 ('spinning', 0.7265263199806213),
 ('hoodie', 0.714285671710968),
 ('dirty', 0.6974126100540161),
 ('till', 0.6881709098815918),
 ('loll', 0.6861075162887573),
 ('pair', 0.6833866238594055),
 ('dumbass', 0.6822615265846252),
 ('pink', 0.6735073924064636),
 ('brand', 0.6670190691947937)]

In [322]:
model_w2v.most_similar("album",topn=10)

[('song', 0.8406934142112732),
 ('full', 0.7770010232925415),
 ('definitely', 0.7717189192771912),
 ('life', 0.7579711079597473),
 ('fan', 0.7412502765655518),
 ('well', 0.7357615232467651),
 ('sound', 0.7349567413330078),
 ('favorite', 0.7331215739250183),
 ('year', 0.7320114970207214),
 ('first', 0.7297888398170471)]

In [323]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,vector_size))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i]=model_w2v[word]
# creating document-word embeddings
document_word_embeddings=np.zeros((len(tokenized_paded_documents),max_feature,vector_size))
for i in range(len(tokenized_paded_documents)):
    for j in range(len(tokenized_paded_documents[0])):
        document_word_embeddings[i][j]=embedding_matrix[tokenized_paded_documents[i][j]]
document_word_embeddings.shape

(14, 3803, 38)

In [324]:
embedding_matrix[tokenizer.word_index['kpop']]

array([ 0.42408568, -0.636186  , -0.02173865,  0.40918148, -0.16908389,
        0.07086756,  0.07810916, -0.48092079, -0.86170381,  0.14952232,
        0.94514346,  0.19336647,  0.07170593,  1.3275795 ,  0.52260041,
        0.54587066, -0.40937307,  0.78211588,  0.21297148,  0.03895288,
        0.44188151, -0.5929814 , -0.05441341, -0.05959946,  0.66997254,
        0.28445223, -0.78456414, -0.28653705, -0.87987596, -0.43999198,
       -0.33982849, -0.98056018, -0.32607019,  0.63868064, -0.90411234,
       -0.25713512, -0.23389682, -0.35958678])

In [325]:
# tf-idf vectors do not keep the original sequence of words, converting them into actual word sequences from the documents
document_embeddings=np.zeros((len(tokenized_paded_documents),vector_size))
words=tfidfvectoriser.get_feature_names()

In [326]:
for i in range(len(document_word_embeddings)):
  for j in range(len(words)):
      document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
document_embeddings=document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)

In [327]:
pairwise_similarities2=cosine_similarity(document_embeddings)
pairwise_differences2=euclidean_distances(document_embeddings)

In [328]:
for i in range(len(df_merge)):
    print(i,":",df_merge.index[i])

0 : Adriene Mishler
1 : Ally Love
2 : Bad Bunny
3 : Beyonce
4 : BlackPink
5 : Chinae Alexander
6 : Jerry Lorenzo
7 : Karlie Kloss
8 : Kerwin Frost
9 : Naeun Son
10 : Ninjas Hyper
11 : Pharrell Williams
12 : Yara Shahidi
13 : Zoe Saldana


In [329]:
most_similar(3,pairwise_similarities2,'Cosine Similarity',topn=5)

Document: documents    beyonce new soda pop obnoxious way write note ...
Name: Beyonce, dtype: object


Similar Documents:
Celebrity: Zoe Saldana : 0.8770287067841157
Celebrity: BlackPink : 0.8523215448691734
Celebrity: Bad Bunny : 0.8246935174193453
Celebrity: Pharrell Williams : 0.8201593105735584
Celebrity: Jerry Lorenzo : 0.8095129852538354


In [330]:
most_similar(4,pairwise_similarities2,'Cosine Similarity',9)

Document: documents    deleted camera suck noticed since album dday v...
Name: BlackPink, dtype: object


Similar Documents:
Celebrity: Zoe Saldana : 0.8720907953094005
Celebrity: Naeun Son : 0.8638473730029163
Celebrity: Beyonce : 0.8523215448691734
Celebrity: Ninjas Hyper : 0.8513153737462954
Celebrity: Ally Love : 0.8313145733446775
Celebrity: Bad Bunny : 0.811518546732444
Celebrity: Yara Shahidi : 0.790294186904051
Celebrity: Jerry Lorenzo : 0.7834732926000468
Celebrity: Karlie Kloss : 0.7823273395043979


In [331]:
most_similar(9,pairwise_similarities2,'Cosine Similarity',13) #most_similar(5,pairwise_differences2,'Euclidean Distance')

Document: documents    bully compare irene scandal allegation true lo...
Name: Naeun Son, dtype: object


Similar Documents:
Celebrity: BlackPink : 0.8638473730029163
Celebrity: Yara Shahidi : 0.8375228309239479
Celebrity: Zoe Saldana : 0.8142787155763014
Celebrity: Ally Love : 0.8033789179762868
Celebrity: Beyonce : 0.801742611270665
Celebrity: Bad Bunny : 0.7895482528601664
Celebrity: Chinae Alexander : 0.7692683821695445
Celebrity: Jerry Lorenzo : 0.7614928518951466
Celebrity: Ninjas Hyper : 0.7492894109621924
Celebrity: Pharrell Williams : 0.7352073440339298
Celebrity: Adriene Mishler : 0.7130013775561989
Celebrity: Kerwin Frost : 0.6835916437818846
Celebrity: Karlie Kloss : 0.681870998883518


In [332]:
most_similar(0,pairwise_similarities2,'Cosine Similarity') #most_similar(5,pairwise_differences2,'Euclidean Distance')

Document: documents    hii im vicky let workout buddy stay motivated ...
Name: Adriene Mishler, dtype: object


Similar Documents:
Celebrity: Naeun Son : 0.7130013775561989
Celebrity: Ally Love : 0.6983490487361188
Celebrity: Yara Shahidi : 0.6901856435045982
Celebrity: Ninjas Hyper : 0.6710691624307652
Celebrity: BlackPink : 0.6622485836682845


# Tfidf + GloVe with cosine similarity

Get the pre-trained GloVe model from Stanford, at: https://figshare.com/articles/dataset/Twitter_pre-trained_word_vectors/11640300

In [None]:
# reading Glove word embeddings into a dictionary with "word" as key and values as word vectors

embeddings_index = dict()

with open("glove.twitter.27B.200d.txt") as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
len(embeddings_index.keys())

In [None]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 

embedding_matrix=np.zeros((vocab_size,len(embeddings_index["key"])))

for word,i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# tf-idf vectors do not keep the original sequence of words, converting them into actual word sequences from the documents

document_embeddings=np.zeros((len(tokenized_paded_documents),len(embeddings_index["key"])))
words=tfidfvectoriser.get_feature_names()

for i in range(documents_df.shape[0]):
    for j in range(len(words)):
        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        
document_embeddings=document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)

In [None]:
document_embeddings.shape

In [None]:
document_embeddings

In [None]:
pairwise_similarities3=cosine_similarity(document_embeddings)
pairwise_differences3=euclidean_distances(document_embeddings)

In [None]:
most_similar(12,pairwise_similarities3,'Cosine Similarity')

In [None]:
most_similar(5,pairwise_differences3,'Euclidean Distance')

# Doc2vec with cosine similarity




In [281]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [282]:
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[index]) for index, doc in enumerate(documents_df.documents)]

In [283]:
model_d2v = Doc2Vec(vector_size=200,alpha=0.025, 
                    min_count=5,
                    window=15, 
                    negative=10
                    )
  
model_d2v.build_vocab(tagged_data)


model_d2v.train(tagged_data,total_examples=model_d2v.corpus_count,epochs=20)

In [284]:
document_embeddings=np.zeros((documents_df.shape[0],200))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]

In [285]:
pairwise_similarities4=cosine_similarity(document_embeddings)
pairwise_differences4=euclidean_distances(document_embeddings)

In [286]:
most_similar(5,pairwise_similarities4,'Cosine Similarity')

Document: documents    thats compliment thats compliment thats compli...
Name: Chinae Alexander, dtype: object


Similar Documents:
Celebrity: Adriene Mishler : 0.416259493209572
Celebrity: Yara Shahidi : 0.355317501078371
Celebrity: Ninjas Hyper : 0.30180110105181673
Celebrity: Ally Love : 0.22825447662881118
Celebrity: Naeun Son : 0.19406690738210192


In [287]:
most_similar(1,pairwise_similarities4,'Cosine Similarity')

Document: documents    would dismiss appropriation claim people take ...
Name: Ally Love, dtype: object


Similar Documents:
Celebrity: Adriene Mishler : 0.3417566260514602
Celebrity: Ninjas Hyper : 0.28731826007422784
Celebrity: Yara Shahidi : 0.2582524493002825
Celebrity: Kerwin Frost : 0.23588460627503618
Celebrity: Chinae Alexander : 0.22825447662881118


In [288]:
most_similar(4,pairwise_similarities4,'Cosine Similarity')

Document: documents    deleted camera suck noticed since album dday v...
Name: BlackPink, dtype: object


Similar Documents:
Celebrity: Adriene Mishler : 0.399896444571846
Celebrity: Pharrell Williams : 0.346123153470443
Celebrity: Naeun Son : 0.311758632233048
Celebrity: Kerwin Frost : 0.3029020315345088
Celebrity: Karlie Kloss : 0.18936372907837934


# BERT Sentence Embeddings with Sentence_Transformers

In [333]:
pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (1.1.0)


In [334]:
from sentence_transformers import SentenceTransformer

Pre-trained model: https://www.sbert.net/docs/pretrained_models.html

##### Paraphrase Identification
The following models are recommended for various applications, as they were trained on Millions of paraphrase examples. They create extremely good results for various similarity and retrieval tasks. They are currently under development, better versions and more details will be released in future. But they many tasks they work better than the NLI / STSb models.

**paraphrase-distilroberta-base-v1 - Trained on large scale paraphrase data.**

**paraphrase-xlm-r-multilingual-v1 - Multilingual version of** 

**paraphrase-distilroberta-base-v1, trained on parallel data for 50+ languages.** 

(Teacher: paraphrase-distilroberta-base-v1, Student: xlm-r-base)

##### Semantic Textual Similarity
The following models were optimized for Semantic Textual Similarity (STS). They were trained on SNLI+MultiNLI and then fine-tuned on the STS benchmark train set.

The best available models for STS are:

**stsb-roberta-large - STSb performance: 86.39**

**stsb-roberta-base - STSb performance: 85.44**

**stsb-bert-large - STSb performance: 85.29**

**stsb-distilbert-base - STSb performance: 85.16**

» Full List of STS Models

##### Duplicate Questions Detection
The following models were trained for duplicate questions mining and duplicate questions retrieval. You can use them to detect duplicate questions in a large corpus (see paraphrase mining) or to search for similar questions (see semantic search).

Available models:

**quora-distilbert-base - Model first tuned on NLI+STSb data, then fine-tune* for Quora Duplicate Questions detection retrieval.**

**quora-distilbert-multilingual - Multilingual version of quora-distilbert-base. Fine-tuned with parallel data for 50+ languages.**

##### Question-Answer Retrieval - MSMARCO
The following models were trained on MSMARCO Passage Ranking, a dataset with 500k real queries from Bing search. Given a search query, find the relevant passages.

**msmarco-distilbert-base-v3: MRR@10: 33.13 on MS MARCO dev set**

**msmarco-roberta-base-ance-fristp: MRR@10: 33.03 on MS MARCO dev set**

In [335]:
sbert_model = SentenceTransformer('stsb-bert-large', 
                                  device='cuda'
                                  )

In [336]:
document_embeddings = sbert_model.encode(documents_df['documents'])

In [337]:
len(document_embeddings)

14

In [338]:
pairwise_similarities5=cosine_similarity(document_embeddings)
pairwise_differences5=euclidean_distances(document_embeddings)

In [339]:
for i in range(len(df_merge.index)):
  print(i,":",df_merge.index[i])

0 : Adriene Mishler
1 : Ally Love
2 : Bad Bunny
3 : Beyonce
4 : BlackPink
5 : Chinae Alexander
6 : Jerry Lorenzo
7 : Karlie Kloss
8 : Kerwin Frost
9 : Naeun Son
10 : Ninjas Hyper
11 : Pharrell Williams
12 : Yara Shahidi
13 : Zoe Saldana


In [342]:
most_similar(4,pairwise_similarities5,'Cosine Similarity',9)

Document: documents    deleted camera suck noticed since album dday v...
Name: BlackPink, dtype: object


Similar Documents:
Celebrity: Naeun Son : 0.69564617
Celebrity: Bad Bunny : 0.67971146
Celebrity: Karlie Kloss : 0.6724578
Celebrity: Pharrell Williams : 0.6397086
Celebrity: Yara Shahidi : 0.6257952
Celebrity: Beyonce : 0.611153
Celebrity: Zoe Saldana : 0.54505795
Celebrity: Jerry Lorenzo : 0.5433645
Celebrity: Kerwin Frost : 0.5085624


In [343]:
most_similar(9,pairwise_similarities5,'Cosine Similarity',9)

Document: documents    bully compare irene scandal allegation true lo...
Name: Naeun Son, dtype: object


Similar Documents:
Celebrity: BlackPink : 0.69564617
Celebrity: Karlie Kloss : 0.6646825
Celebrity: Pharrell Williams : 0.60054475
Celebrity: Bad Bunny : 0.59413934
Celebrity: Chinae Alexander : 0.56174713
Celebrity: Beyonce : 0.5557746
Celebrity: Jerry Lorenzo : 0.55543506
Celebrity: Zoe Saldana : 0.546686
Celebrity: Yara Shahidi : 0.53791356


# BERT Sentence Embeddings with HuggingFace Transformers

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask



#Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
model = AutoModel.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

Document: documents    expected shes fucked idk else say level public...
Name: Naeun Son, dtype: object


Similar Documents:
Celebrity: Chinae Alexander : 0.9429753
Celebrity: Zoe Saldana : 0.91698265
Celebrity: Kerwin Frost : 0.9116169
Celebrity: Pharrell Williams : 0.90966386
Celebrity: Karlie Kloss : 0.86605984
Celebrity: Ninjas Hyper : 0.84975
Celebrity: Ally Love : 0.8271163
Celebrity: Beyonce : 0.815284
Celebrity: Yara Shahidi : 0.7773283
Celebrity: Jerry Lorenzo : 0.7764535
Celebrity: Adriene Mishler : 0.736783
Celebrity: BlackPink : 0.6076021
Celebrity: Bad Bunny : 0.5791051


# Sent2vec

In [None]:
pip install sent2vec

Collecting sent2vec
  Downloading https://files.pythonhosted.org/packages/4e/c6/1f57065edbc772d9529e4a5f75cb812f29bcc2bf59b8e4c34c8ecfd83fe3/sent2vec-0.2.0-py3-none-any.whl
Installing collected packages: sent2vec
Successfully installed sent2vec-0.2.0


In [None]:
from scipy import spatial
from sent2vec.vectorizer import Vectorizer

In [None]:
string = []
for i in df.patterns.to_list():
  string.append(str(i))

print(len(string))

6245


In [None]:
sentences = str(documents_df.documents.to_list())

vectorizer = Vectorizer()
vectorizer.bert(sentences)
vectors_bert = vectorizer.vectors

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [None]:
dist_1 = spatial.distance.cosine(vectors_bert[5], vectors_bert[12])
dist_2 = spatial.distance.cosine(vectors_bert[13], vectors_bert[12])
print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
# dist_1: 0.043, dist_2: 0.192

In [None]:
a=[]
b=[]
for i in range(len(df.patterns)):
  if "scrapped" in df.patterns[i]:
    a.append(i)
  if "adidas" in df.patterns[i]:
    b.append(i)

print(set.intersection(set(a), set(b)))
print(len(b)/len(df.Celebrity))

{6598, 5932, 26893, 27129, 26224, 6833, 6169, 26461}
0.0023857961901239135


In [None]:
c=set.intersection(set(a), set(b))
for i in c:
  print(df.txt[i])
  print("------------------------------")


['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------
['Yeah his adidas collection scrapped now']
------------------------------


In [None]:
import collections
from nltk import ngrams
def top5_words(text,ngram,n):
    counts = collections.Counter(ngrams(text.split(),ngram))
    return counts.most_common(n)

In [None]:
# Naeun Son
top5_words(str(df_merge[9]),3,20)

[(('best', 'new', 'actor'), 100),
 (('seo', 'ji', 'hye'), 88),
 (('watermarkfree', 'english', 'hardsubbed'), 88),
 (('english', 'hardsubbed', 'stream'), 88),
 (('hardsubbed', 'stream', 'episode'), 88),
 (('stream', 'episode', 'available'), 88),
 (('note', 'recommend', 'ublock'), 88),
 (('recommend', 'ublock', 'origin'), 88),
 (('ublock', 'origin', 'protect'), 88),
 (('origin', 'protect', 'adscryptomining'), 88),
 (('kim', 'hee', 'ae'), 88),
 (('kang', 'ha', 'neul'), 84),
 (('yoo', 'jae', 'suk'), 76),
 (('please', 'contact', 'moderator'), 73),
 (('protect', 'adscryptomining', 'openloadstreamangostreamcherry'), 72),
 (('adscryptomining', 'openloadstreamangostreamcherry', 'chrome'), 72),
 (('openloadstreamangostreamcherry', 'chrome', 'firefox'), 72),
 (('bot', 'action', 'performed'), 69),
 (('action', 'performed', 'automatically'), 69),
 (('performed', 'automatically', 'please'), 69)]

In [None]:
# Karlie Kloss
top5_words(str(df_merge[7]),3,20)

[(('bot', 'action', 'performed'), 191),
 (('action', 'performed', 'automatically'), 191),
 (('performed', 'automatically', 'please'), 109),
 (('automatically', 'please', 'contact'), 109),
 (('please', 'contact', 'moderator'), 109),
 (('im', 'bot', 'action'), 82),
 (('removed', 'removed', 'removed'), 74),
 (('season', 'million', 'viewer'), 66),
 (('sure', 'follow', 'community'), 57),
 (('follow', 'community', 'guideline'), 57),
 (('community', 'guideline', 'get'), 57),
 (('guideline', 'get', 'user'), 57),
 (('get', 'user', 'flair'), 57),
 (('user', 'flair', 'check'), 57),
 (('flair', 'check', 'instagram'), 57),
 (('check', 'instagram', 'page'), 57),
 (('instagram', 'page', 'celebrity'), 57),
 (('page', 'celebrity', 'content'), 57),
 (('celebrity', 'content', 'bot'), 57),
 (('content', 'bot', 'action'), 57)]

In [None]:
temp_txt=str(df_merge.tolist())
top5_words(temp_txt,4,5)

[(('bot', 'action', 'performed', 'automatically'), 4186),
 (('action', 'performed', 'automatically', 'please'), 3454),
 (('performed', 'automatically', 'please', 'contact'), 3454),
 (('automatically', 'please', 'contact', 'moderator'), 3454),
 (('halo', 'halo', 'halo', 'halo'), 981)]