In [1]:
# In this file we will classify spam or not spam using BERT model
# The idea is that we will feed the entire sentence to the BERT model and the model will convert it into word embedding
# and then that embedding can be fed into neural network to define spam or not spam

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
import pandas as pd
import os

In [4]:
url = "https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/master/47_BERT_text_classification/spam.csv"
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
df.tail(3)

Unnamed: 0,Category,Message
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [9]:
df.shape

(5572, 2)

In [10]:
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [11]:
df["Category"].unique()

array(['ham', 'spam'], dtype=object)

In [12]:
df.groupby("Category").count()

Unnamed: 0_level_0,Message
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [13]:
# As we can see that the data is imbalanced so we will use the downsampling approach to balance our dataset
# Lets seperate the ham and spam emails

ham = df[df["Category"]=="ham"]

In [14]:
ham[:3]

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...


In [15]:
spam = df[df["Category"]=="spam"]
spam[:3]

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...


In [16]:
print(len(ham)==len(spam))

False


In [17]:
print("Length of ham",len(ham))
print("Length of spam",len(spam))



Length of ham 4825
Length of spam 747


In [18]:
# Lets dowmsize the ham samples
# .sample picks up random samples of defined size
downsized_ham = ham.sample(len(spam))

In [19]:
len(downsized_ham)

747

In [20]:
# now lets concate the dataframes into one dataframe

df_balanced = pd.concat([downsized_ham,spam])

In [21]:
df_balanced.shape[0]

1494

In [22]:
df_balanced.groupby("Category").count()

Unnamed: 0_level_0,Message
Category,Unnamed: 1_level_1
ham,747
spam,747


In [23]:
# Now as we can see above that our dataset is balanced completely and we can now work upon it

In [24]:
# now lets define the spam and not spam into one and zero by adding a new column in the dataframe

df_balanced["spam"] = df_balanced["Category"].apply(lambda x: 1 if x=="spam" else 0)

In [25]:
df_balanced.head(3)

Unnamed: 0,Category,Message,spam
3555,ham,am up to my eyes in philosophy,0
3462,ham,K.. I yan jiu liao... Sat we can go 4 bugis vi...,0
142,ham,"Sir, Waiting for your mail.",0


In [26]:
df_balanced["spam"].unique()

array([0, 1])

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced["Category"], df_balanced["spam"], stratify=df_balanced["spam"] )

In [29]:
print("Xtrain size",len(X_train))
print("Xtest size",len(X_test))
print("ytrain size",len(y_train))
print("ytest size",len(y_test))

Xtrain size 1120
Xtest size 374
ytrain size 1120
ytest size 374


In [30]:
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# This bert model by tensorflow is a variation that is trained on uncased(lower) large data having 768 hidden layer and attention as 12

preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
# The above model does not take a regualar text input it needs a input in a certain way

In [31]:
import tensorflow_hub as hub
import tensorflow_text as text

In [32]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [33]:
sentence_list = ["this is good","this is not good","when will i get a job"]

In [34]:
def pre_process(sentence):
  '''
  takes a sentence preprocesses it then encode it using bert
  Args:
    sentence
  Returns:
    encoded sentence
    '''
  pre = bert_preprocess(sentence)
  bert_ans = bert_encoder(pre)
  return bert_ans["pooled_output"]


In [35]:
embedding = pre_process(sentence_list)

In [36]:
# This is how you can access doc string
help(pre_process)

Help on function pre_process in module __main__:

pre_process(sentence)
    takes a sentence preprocesses it then encode it using bert
    Args:
      sentence
    Returns:
      encoded sentence



In [37]:
print(embedding)

tf.Tensor(
[[-0.883171   -0.20125957  0.44979236 ...  0.2878238  -0.53117675
   0.87313503]
 [-0.877158   -0.23722361  0.33415693 ...  0.23475447 -0.5366823
   0.90517753]
 [-0.8975002  -0.2825184  -0.01428926 ...  0.02635407 -0.59242195
   0.92442954]], shape=(3, 768), dtype=float32)


In [38]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435172 , -0.51327294, -0.88845766, ..., -0.7474892 ,
        -0.7531475 ,  0.91964495],
       [-0.87208366, -0.5054398 , -0.94446695, ..., -0.85847527,
        -0.71745366,  0.8808299 ]], dtype=float32)>

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
word_list =["day","donald trump","monday","pizza","table","sugar"]

In [41]:
word_embeddings = pre_process(word_list)

In [42]:
print(word_embeddings)

tf.Tensor(
[[-0.8391369  -0.18010639  0.5011485  ...  0.53323644 -0.64019215
   0.8799521 ]
 [-0.88634974 -0.30751267 -0.16326094 ...  0.2571022  -0.5674306
   0.81172764]
 [-0.9329612  -0.392646   -0.55494726 ... -0.02638581 -0.712349
   0.9419164 ]
 [-0.7695413  -0.22803241  0.44442275 ...  0.39122653 -0.5410951
   0.81326383]
 [-0.78727424 -0.27071986  0.07494732 ...  0.07324858 -0.58368754
   0.8320589 ]
 [-0.8160595  -0.17812513  0.6109353  ...  0.464158   -0.5998999
   0.80679697]], shape=(6, 768), dtype=float32)


In [43]:
# Now lets experiment with cosine similarity
# which is if two words are closely related then the angle between them will be less or closer will tend to 1 and vice versa


In [44]:
def explore_cosine_similarity(arr):
  """
    takes word embeddings of words to calculate cosine similarity between two words

    Args
      embedding of words

    Return
      cosine similarity score between 2 words

  """

  for i in range(len(arr)-1):
    similarity = cosine_similarity([arr[i]],[arr[i+1]])
    print(f"Cosine similarity between {i} and {i+1} is {similarity}")

In [45]:
 explore_cosine_similarity(word_embeddings)

Cosine similarity between 0 and 1 is [[0.90730566]]
Cosine similarity between 1 and 2 is [[0.90644276]]
Cosine similarity between 2 and 3 is [[0.85045254]]
Cosine similarity between 3 and 4 is [[0.9652059]]
Cosine similarity between 4 and 5 is [[0.9395255]]
