In [2]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 31.7 MB/s 
Collecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 6.2 kB/s 
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 45.6 MB/s 
[?25hCollecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 63.3 MB/s 
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 45.8 MB/s 
Collecting flatbuffe

In [1]:
import tensorflow_hub as hub
import pandas as pd
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [4]:
from google.colab import files
uploaded = files.upload()

Saving spam.csv to spam.csv


In [6]:
import chardet
with open("spam.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'confidence': 0.7270322499829184, 'encoding': 'Windows-1252', 'language': ''}

In [7]:
# load data
df = pd.read_csv('spam.csv',encoding='Windows-1252')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
df1=df[['v1','v2']]
df1.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#rename columns to original names
df1.rename(columns={'v1':'Category','v2':'Message'},inplace=True)
df1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df1.columns

Index(['Category', 'Message'], dtype='object')

In [12]:
#distinct values in category
df1['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [14]:
'''
Clearly, the data is imbalanced and there are more good emails(ham) than spam emails. 
This may lead to a problem as a model may learn all the features of the ham emails over spam emails
 and thus always predict all emails as ham(OVERFITTIN!).
 So before proceeding, we need to take care of that.
'''

'\nClearly, the data is imbalanced and there are more good emails(ham) than spam emails. \nThis may lead to a problem as a model may learn all the features of the ham emails over spam emails\n and thus always predict all emails as ham(OVERFITTIN!).\n So before proceeding, we need to take care of that.\n'

In [15]:
#Downsample our data-Downsampling is a technique where the majority class is downsampled to match the minority class. 
#Since our data has only one column(feature) it ok to use it.
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


In [17]:
# creating 2 new dataframe as df_ham , df_spam

df_spam = df1[df1['Category']=='spam']
df_ham = df1[df1['Category']=='ham']
print("Ham Dataset Shape:", df_ham.shape)
print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 2)
Spam Dataset Shape: (747, 2)


In [18]:
# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [19]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])

In [20]:
df_balanced

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
3635,ham,\ME 2 BABE I FEEL THE SAME LETS JUST 4GET ABOU...
3939,ham,Does daddy have a bb now.
675,ham,"I dont knw pa, i just drink milk.."
3958,ham,Have a nice day my dear.


In [21]:
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [22]:
df_balanced.sample(10)

Unnamed: 0,Category,Message
4124,ham,May b approve panalam...but it should have mor...
535,ham,"Good afternoon, my love! How goes that day ? I..."
4461,ham,"Sorry I flaked last night, shit's seriously go..."
780,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...
562,ham,Geeeee ... I love you so much I can barely sta...
1486,ham,Hello lover! How goes that new job? Are you th...
885,ham,Gibbs unsold.mike hussey
4820,ham,Im good! I have been thinking about you...
4084,spam,Orange brings you ringtones from all time Char...
543,ham,4 oclock at mine. Just to bash out a flat plan.


In [23]:
#preprocess data
'''
As can be seen, we have only text as categorical data, and the model doesn’t understand them. So instead of text,
 we can just assign integer labels to
 our class ham and spam as 0 and 1 respectively, and store it in new column spam. This is called- Hot-Encoding
'''

'\nAs can be seen, we have only text as categorical data, and the model doesn’t understand them. So instead of text,\n we can just assign integer labels to\n our class ham and spam as 0 and 1 respectively, and store it in new column spam. This is called- Hot-Encoding\n'

In [24]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [26]:
df_balanced.sample(10)

Unnamed: 0,Category,Message,spam
262,ham,MY NO. IN LUTON 0125698789 RING ME IF UR AROUN...,0
3672,ham,Moji just informed me that you saved our lives...,0
4690,ham,Surly ill give it to you:-) while coming to re...,0
2134,ham,If he started searching he will get job in few...,0
583,ham,my ex-wife was not able to have kids. Do you w...,0
5427,spam,Santa Calling! Would your little ones like a c...,1
4295,spam,Please CALL 08712402578 immediately as there i...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
2118,spam,U can WIN å£100 of Music Gift Vouchers every w...,1
3562,spam,Auction round 4. The highest bid is now å£54. ...,1


In [27]:
#perform train test split

In [28]:
# loading train test split
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

In [29]:
#stratify-ensures that both the train and test sets have the 
#proportion of examples in each class that is present in the provided “y” array

In [None]:
'''
This marks the end of the pre-processing part and now our model is ready for training. But before that, we need to
 generate word embedding and that’s what we are going to see in the next section:
'''

In [30]:
# downloading preprocessing files and model
bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [31]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)

In [32]:
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [33]:
# check the summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['Inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [34]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]

In [35]:
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [36]:
history = model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# Evaluating performance
model.evaluate(X_test,y_test)



[0.3221336007118225,
 0.8823529481887817,
 0.8592965006828308,
 0.9144384860992432]

In [None]:
'''
It is similar to training results which may lead to the wrong interpretation of the model. 
So we need a better way to understand how our model is performing and usually, 
classification reports and confusion matrices are the way to go.
'''

In [38]:
# getting y_pred by predicting over X_text and flattening it
y_pred = model.predict(X_test)
y_pred = y_pred.flatten() # require to be in one-dimensional array , for easy manipulation



In [42]:
predict_text = [
                # Spam
                'We’d all like to get a $10,000 deposit on our bank accounts out of the blue, but winning a prize—especially if you’ve never entered a contest', 
                'Netflix is sending you a refund of $12.99. Please reply with your bank account and routing number to verify and get your refund', 
                'Your account is temporarily frozen. Please log in to to secure your account ', 
                #ham
                'The article was published on 18th August itself',
                'Although we are unable to give you an exact time-frame at the moment, I would request you to stay tuned for any updates.',
                'The image you sent is a UI bug, I can check that your article is marked as regular and is not in the monetization program.'
]

In [43]:
test_results = model.predict(predict_text)




In [44]:
output = np.where(test_results>0.5,'spam', 'ham')

In [45]:
output

array([['spam'],
       ['spam'],
       ['spam'],
       ['ham'],
       ['spam'],
       ['spam']], dtype='<U4')