In [503]:
#Import the necessary libraries
import os
import re
import time

import numpy as np
import pandas as pd
from datasketch import MinHash, MinHashLSHForest

In [504]:
#Preprocess turns a string of text into lowercase word tokens/shingles.
def preprocess(text):
    """Strip punctuation from ``text``, lowercase it, and split it on whitespace.

    Every character that is neither a word character nor whitespace is removed
    (not replaced by a space), so "All-new" becomes the single token "allnew".

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lowercase tokens in their original order; empty when ``text`` holds no
        word characters.
    """
    without_punctuation = re.sub(r'[^\w\s]','',text)
    return without_punctuation.lower().split()

# First Layer: Locality-Sensitive Hashing
https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/

In [505]:
#Number of permutations used for every MinHash signature.
#The same value is passed as `perms` to get_forest (index build) and to predict (query).
permutations = 20

In [506]:
def get_forest(data, perms):
    """Build and index a MinHash LSH forest over the ``'text'`` column of ``data``.

    Each text is tokenized with ``preprocess`` and every token is fed into a
    MinHash signature. Signatures are stored in the forest under their
    0-based position in ``data['text']``, so a query hit can be mapped back to
    its row by position.

    Parameters
    ----------
    data : mapping with a ``'text'`` entry (e.g. a pandas DataFrame)
        Source of the texts to index.
    perms : int
        ``num_perm`` used for both the MinHash signatures and the forest.

    Returns
    -------
    MinHashLSHForest
        The indexed forest, ready to be queried.

    Side effects
    ------------
    Prints the elapsed build time.
    """
    start_time = time.time()

    # One signature per text, kept in the same order as the input rows.
    signatures = []
    for text in data['text']:
        tokens = preprocess(text)
        signature = MinHash(num_perm=perms)
        for token in tokens:
            signature.update(token.encode('utf8'))
        signatures.append(signature)

    forest = MinHashLSHForest(num_perm=perms)
    for position, signature in enumerate(signatures):
        # The key is the row position; predict() relies on this when it uses .iloc.
        forest.add(position, signature)

    # Called once after all signatures have been added, before any query.
    forest.index()

    print('It took %s seconds to build forest.' %(time.time()-start_time))

    return forest

In [507]:
def predict(text, database, perms, num_results, forest, column='SpamTerms'):
    """Return the rows of ``database`` whose indexed text is closest to ``text``.

    ``text`` is tokenized with ``preprocess`` and hashed into a MinHash
    signature in the same way ``get_forest`` hashes the indexed texts. The
    forest is then asked for up to ``num_results`` keys. Those keys are the
    row positions assigned by ``get_forest``, so they are resolved with
    ``.iloc``.

    Parameters
    ----------
    text : str
        Query text.
    database : pandas.DataFrame
        The frame the forest was built from (same row order).
    perms : int
        ``num_perm`` for the query signature; should be the value that was
        passed to ``get_forest``.
    num_results : int
        Maximum number of candidates to request from the forest.
    forest : MinHashLSHForest
        Indexed forest returned by ``get_forest``.
    column : str, optional
        Column of ``database`` to return for the matched rows. Defaults to
        ``'SpamTerms'``, the column that was previously hard-coded.

    Returns
    -------
    pandas.Series or None
        ``database[column]`` restricted to the matched rows, or ``None`` when
        the forest returns no candidate. Callers must handle the ``None`` case.

    Side effects
    ------------
    Prints the elapsed query time when at least one candidate is found.
    """
    start_time = time.time()

    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))

    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # no candidate in the forest for this query

    # Keys are row positions (see get_forest), hence positional indexing.
    result = database.iloc[idx_array][column]

    print('It took %s seconds to query forest.' %(time.time()-start_time))

    return result

# Load the Dataset

In [508]:
#Location of the spam-terms CSV. Set the SPAM_TERMS_CSV environment variable to
#point at your own copy instead of editing this cell; the original path is kept
#as the default.
SPAM_TERMS_CSV = os.environ.get(
    'SPAM_TERMS_CSV',
    '/Users/ibrahim/Desktop/TCN/FinalProject/Dataset/SpamTerms.csv',
)
db = pd.read_csv(SPAM_TERMS_CSV)
#get_forest reads the 'text' column, so expose the spam terms under that name too
db['text'] = db['SpamTerms']

In [509]:
#Keep at most the first 573 rows of the spam-term table.
#The recorded output below shows 571 rows, so no row is dropped for this file.
db=db.head(573)
#Show the table that will be indexed
print(db)

                   SpamTerms                     text
0                    All-new                  All-new
1                       Boss                     Boss
2               Don’t delete             Don’t delete
3        Drastically reduced      Drastically reduced
4             Exclusive deal           Exclusive deal
..                       ...                      ...
566   You have been selected   You have been selected
567  You have been selected!  You have been selected!
568         You’re a Winner!         You’re a Winner!
569    You’ve been selected!    You’ve been selected!
570              Your income              Your income

[571 rows x 2 columns]


In [510]:
#Build the LSH forest over db['text'], using `permutations` permutations per MinHash signature
forest = get_forest(db, permutations)

It took 0.2173469066619873 seconds to build forest.


In [576]:
num_recommendations = 100 #Request up to 100 candidates from the LSH forest
#Example SMS message to classify
query = "SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"
#First layer: predict returns the matching 'SpamTerms' entries, or None when the forest has no match
result = predict(query, db, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.0021009445190429688 seconds to query forest.

 Top Recommendation(s) is(are) 
 555        Win
556    Win big
Name: SpamTerms, dtype: object


In [577]:
#Collect the candidates to be taken to the next layer, which is Cosine Similarity.
#predict() returns None when the LSH forest finds no match, so fall back to an
#empty array instead of failing with AttributeError on `None.values`.
candidates = result.values if result is not None else np.array([], dtype=object)

In [578]:
#Show the spam terms that survived the LSH layer
print(candidates)

['Win' 'Win big']


# Second Layer: BERT Vectors + Cosine Similarity
https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/

In [579]:
#Tokenize the lowercased candidates into word tokens.
#NOTE(review): tokenized_sent is not referenced by the later cells visible here;
#the BERT model is given the raw `candidates` strings instead.
import nltk
from nltk.tokenize import word_tokenize

#word_tokenize needs the 'punkt' tokenizer data. Download it before the first
#call so this cell also runs on a machine where the data was never installed.
nltk.download('punkt')

tokenized_sent = [word_tokenize(s.lower()) for s in candidates]


In [580]:
#nltk and word_tokenize were already imported by the tokenizing cell, and numpy by
#the first cell; repeating the imports here is harmless.
import nltk
#Download the NLTK 'punkt' package (the recorded output reports it as already up-to-date)
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to /Users/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [581]:
def cosine(u, v):
    """Cosine similarity between two vectors.

    Computed as ``dot(u, v) / (||u|| * ||v||)``.

    Parameters
    ----------
    u, v : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        Similarity in ``[-1, 1]``. When either vector has zero norm the
        direction is undefined; ``0.0`` is returned rather than dividing by
        zero, which would produce NaN and make later sorting and threshold
        checks unreliable.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

In [582]:
#Load the 'bert-base-nli-mean-tokens' SentenceTransformer model; it is used below
#to encode both the candidates and the query.
from sentence_transformers import SentenceTransformer #Vectorize the sentences using bert
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [583]:
#Encode all LSH candidates with the BERT model in a single call.
#NOTE(review): the similarity loop further down calls sbert_model.encode on each
#candidate again and does not read this variable in the cells visible here.
sentence_embeddings = sbert_model.encode(candidates)

#print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
#print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])


In [584]:
#Encode the query as a one-item batch and take its single embedding vector
query_vec = sbert_model.encode([query])[0]

In [585]:
#Map every candidate spam term to its cosine similarity with the query.
#Each value is stored as a one-element list [similarity]; the sorting and
#threshold cells below rely on that shape.
Cosine_Candidates = {}
for sent in candidates:
    sent_vec = sbert_model.encode([sent])[0]
    sim = cosine(query_vec, sent_vec)
    Cosine_Candidates[sent] = [sim]

In [586]:
#Order the candidates from the highest to the lowest similarity
import operator
ranked_pairs = sorted(
    Cosine_Candidates.items(),
    key=operator.itemgetter(1),  # sort on the stored [similarity] value
    reverse=True,
)
sorted_d = dict(ranked_pairs)
print(sorted_d)

{'Win big': [0.29142788], 'Win': [0.1522886]}


In [587]:
#Keep only the spam terms whose cosine similarity reaches MIN_SIMILARITY.
#The earlier comment mentioned 85%, but the threshold actually applied is 0.0001;
#raise MIN_SIMILARITY to make the filter stricter.
#sorted_d maps each term to a one-element list [similarity], so the score is at index 0.
MIN_SIMILARITY = 0.0001
NW_Candidates2 = [term for term in sorted_d if sorted_d[term][0] >= MIN_SIMILARITY]

In [588]:
#NW_Candidates2 was filled by iterating sorted_d, which is ordered from highest to
#lowest similarity, so this slice keeps at most the 20 most similar spam terms.
NW_Candidates=NW_Candidates2[:20]

In [589]:
#Only announce a spam classification when at least one spam term survived the
#similarity filter; an empty list means nothing was detected.
if len(NW_Candidates) > 0:
    print("The SMS Text is classified as spam and the following spam terms detected\n",NW_Candidates)
else:
    print("The SMS Text is not classified as spam: no spam terms detected")

The SMS Text is classified as spam and the following spam terms detected
 ['Win big', 'Win']
