*AT&T spam detector*

### *Imports & Installations*

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 36.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 49.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
!pip install -q -U "tensorflow-text==2.9.*"

[K     |████████████████████████████████| 4.6 MB 4.7 MB/s 
[?25h

In [None]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM
import tensorflow_text

import pathlib 

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.offline as py
import plotly.tools as tls

# Shared colour palette; the plotly figures below pick their trace colours from it.
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

import os
import io
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Dark theme for both plotly and matplotlib figures.
pio.templates.default = "plotly_dark"
matplotlib.style.use('dark_background')

In [None]:
# Display the TensorFlow version used for this run.
tf.__version__

'2.9.2'

### *EDA & Data cleaning*

In [None]:
# Load the SMS spam dataset with pandas, decoding the file as ISO-8859-1 (Latin-1).
# `on_bad_lines="skip"` is the supported replacement (pandas >= 1.3) for the
# deprecated `error_bad_lines=False`: malformed rows are skipped instead of raising.
dataset = pd.read_csv(
    "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/project/spam.csv",
    on_bad_lines="skip",
    encoding="iso-8859-1",
)
dataset.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Discard the three "Unnamed" columns; the label (v1) and the message (v2) remain.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
dataset = dataset.drop(columns=unused_columns)
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Binary target: 1 when the label is "spam", 0 for any other value (i.e. ham).
dataset["type"] = (dataset["v1"] == "spam").astype("int64")

In [None]:
# Preview the frame, now including the numeric "type" column.
dataset.head()

Unnamed: 0,v1,v2,type
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# TODO(review): unfinished placeholder, presumably meant to become a bar chart of the class balance.
# fig = px.bar

In [None]:
# Number of messages per label (ham vs. spam).
dataset["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# How often each exact message text occurs (most frequent first).
dataset["v2"].value_counts()

Sorry, I'll call later                                                                                                                                                 30
I cant pick the phone right now. Pls send a message                                                                                                                    12
Ok...                                                                                                                                                                  10
7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st \Ur Lovely Friendship\"... good morning dear"               4
Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...     4
                                                                                                                                                      

*We notice that many messages contain numbers and punctuation.*

### *Preprocessing*

In [None]:
# Download the spaCy `en_core_web_md` English pipeline, which is loaded in the next cell.
!python -m spacy download en_core_web_md -q

2023-01-02 15:20:29.623994: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 42.8 MB 4.9 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Load the downloaded spaCy pipeline; `nlp` is used for lemmatization in the cleaning step.
import en_core_web_md
nlp = en_core_web_md.load()

In [None]:
# Import Stop words 
from spacy.lang.en.stop_words import STOP_WORDS

*We will now have to clean our texts in order to prepare them for training.*
*Let's do this in three different steps :*
 - using the command `str.isalnum` remove all characters from your strings that are not alphanumeric except for whitespaces.
 - using `str.replace`, `str.lower` and `str.strip` replace double whitespaces with single whitespaces, convert all characters to lowercase and trim starting and finishing whitespaces.
 - using spacy, replace all tokens in your texts with `lemma_` and remove all the stop words.

In [None]:
# Step 1: keep only alphanumeric characters, spaces and apostrophes.
dataset["msg_clean"] = dataset["v2"].apply(
    lambda text: "".join(ch for ch in text if ch.isalnum() or ch in (" ", "'"))
)

# Step 2: collapse runs of spaces, lowercase, and trim both ends.
# Python's built-in str.replace(" +", " ") only matches the literal two characters
# " +", so the pandas regex replace is used to actually collapse repeated spaces.
dataset["msg_clean"] = (
    dataset["msg_clean"]
    .str.replace(r" +", " ", regex=True)
    .str.lower()
    .str.strip()
)

# Step 3: replace each token by its lemma and drop stop words
# (a token is dropped if either its lemma or its surface form is a stop word).
dataset["msg_clean"] = dataset["msg_clean"].apply(
    lambda text: " ".join(
        token.lemma_
        for token in nlp(text)
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )
)
dataset.head()

Unnamed: 0,v1,v2,type,msg_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live


In [None]:
# Sanity check: count how many cleaned messages are plain `str` objects.
mask = dataset["msg_clean"].map(lambda value: type(value) is str)
mask.value_counts()

True    5572
Name: msg_clean, dtype: int64

*Let's instantiate the tokenizer; we set it up to keep only the 1000 most common words.*

In [None]:
# Instantiate the tokenizer with num_words=1000 and an explicit out-of-vocabulary token,
# fit it on the cleaned messages, then encode each message as a list of integer ids.
tokenizer = Tokenizer(num_words=1000, oov_token="out_of_vocab")
tokenizer.fit_on_texts(dataset["msg_clean"])
dataset["msg_encoded"] = tokenizer.texts_to_sequences(dataset["msg_clean"])

In [None]:
# Preview the frame, now including the integer-encoded messages.
dataset.head()

Unnamed: 0,v1,v2,type,msg_clean,msg_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...,"[1, 230, 445, 462, 941, 32, 49, 204, 942, 77, ..."
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni,"[8, 194, 463, 290, 2, 1]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...,"[11, 302, 3, 531, 659, 30, 1, 845, 421, 1, 1, ..."
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c,"[2, 125, 150, 1, 2, 79]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live,"[701, 20, 660, 131]"


In [None]:
# Show only the first 20 index -> word entries instead of dumping the whole vocabulary.
# Index 1 is the out-of-vocabulary token configured above.
dict(list(tokenizer.index_word.items())[:20])

{1: 'out_of_vocab',
 2: 'u',
 3: '2',
 4: 'ur',
 5: 'come',
 6: '4',
 7: 'know',
 8: 'ok',
 9: 'ltgt',
 10: 'good',
 11: 'free',
 12: 'send',
 13: 'like',
 14: 'want',
 15: 'day',
 16: 'time',
 17: 'love',
 18: 'text',
 19: 'tell',
 20: 'think',
 21: 'need',
 22: 'today',
 23: 'txt',
 24: 'home',
 25: 'lor',
 26: 'r',
 27: 'reply',
 28: 'stop',
 29: 'sorry',
 30: 'win',
 31: 'mobile',
 32: 'n',
 33: 'phone',
 34: 'new',
 35: 'week',
 36: 'later',
 37: 'work',
 38: 'da',
 39: 'hi',
 40: 'ask',
 41: 'd',
 42: 'miss',
 43: 'ì',
 44: 'hope',
 45: 'night',
 46: 'claim',
 47: 's',
 48: 'thing',
 49: 'great',
 50: 'try',
 51: 'wait',
 52: 'oh',
 53: 'hey',
 54: 'leave',
 55: 'meet',
 56: 'dear',
 57: 'pls',
 58: 'happy',
 59: 'message',
 60: 'number',
 61: 'wat',
 62: 'friend',
 63: 'm',
 64: 'thank',
 65: 'feel',
 66: 'way',
 67: 'late',
 68: 'prize',
 69: 'right',
 70: 'find',
 71: 'pick',
 72: 'tomorrow',
 73: 'yes',
 74: 'yeah',
 75: '1',
 76: 'min',
 77: 'e',
 78: 'msg',
 79: 'c',
 80: '

### *Train test split*

*TensorFlow cannot currently build a tensor dataset from lists of different lengths, so we first store all the encoded texts in a single NumPy array before creating the TensorFlow dataset.
Since our sequences do not all have the same length, `tf.keras.preprocessing.sequence.pad_sequences` comes in handy: it adds zero padding at the beginning (`padding="pre"`) or at the end (`padding="post"`) of the sequences so that they all have equal length.
Let's pad the sequences.*

In [None]:
# Append zeros to the end of each encoded message so that all rows have the same length.
msg_pad = pad_sequences(dataset["msg_encoded"], padding="post")

In [None]:
# Train / validation split (70 % / 30 %).
# - stratify keeps the ham/spam ratio the same in both subsets (the classes are imbalanced);
# - random_state makes the split, and therefore the reported metrics, reproducible.
xtrain, xval, ytrain, yval = train_test_split(
    msg_pad,
    dataset["type"],
    test_size=0.3,
    stratify=dataset["type"],
    random_state=42,
)

In [None]:
# Wrap each (features, labels) pair in a tf.data.Dataset.
train, val = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((xtrain, ytrain), (xval, yval))
)

In [None]:
# Shuffle each dataset with a buffer as large as the dataset, then group into batches of 64.
batch_size = 64
train_batch = train.shuffle(buffer_size=len(train)).batch(batch_size)
val_batch = val.shuffle(buffer_size=len(val)).batch(batch_size)

In [None]:
# Pull a single batch from the training set and print its features and labels.
# `msg` and `msg_type` stay defined at notebook level afterwards.
msg, msg_type = next(iter(train_batch.take(1)))
print(msg, msg_type)

tf.Tensor(
[[ 52   1 696 ...   0   0   0]
 [ 92 199  87 ...   0   0   0]
 [  1   1 508 ...   0   0   0]
 ...
 [ 64   1 199 ...   0   0   0]
 [172  32 654 ...   0   0   0]
 [  3   1   1 ...   0   0   0]], shape=(64, 74), dtype=int32) tf.Tensor(
[0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0], shape=(64,), dtype=int64)


### *CLASSIFICATION MODELING* ###

### *SimpleRNN (baseline: embedding + global average pooling, no recurrent layer)*

In [None]:
vocab_size = tokenizer.num_words
model = tf.keras.Sequential([
    # Input layer: word embedding with 8 dimensions per token.
    # The sequence length is read from the padded array itself rather than from the
    # `msg` batch variable left over from an earlier cell, so this cell does not
    # depend on that cell having been run.
    tf.keras.layers.Embedding(vocab_size + 1, 8, input_shape=[msg_pad.shape[1]], name="embedding"),
    # Global average pooling over the sequence axis.
    tf.keras.layers.GlobalAveragePooling1D(),
    # Standard fully connected hidden layer.
    tf.keras.layers.Dense(16, activation="relu"),
    # Output layer: a single sigmoid unit (binary classification, 1 = spam).
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

In [None]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 74, 8)             8008      
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                144       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 8,169
Trainable params: 8,169
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Adam optimizer with default settings, binary cross-entropy loss,
# and binary accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam()
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=optimizer,
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train for 50 epochs, validating on the validation batches after each epoch.
history = model.fit(train_batch, validation_data=val_batch, epochs=50)

# evaluate() returns the loss followed by the compiled metric.
val_loss_simple_RNN, val_acc_simple_RNN = model.evaluate(xval, yval)
print('Val accuracy simple_RNN:', val_acc_simple_RNN)
print('Val loss simple_RNN:', val_loss_simple_RNN)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Val accuracy simple_RNN: 0.9832535982131958
Val loss simple_RNN: 0.08369994163513184


In [None]:
# Training vs. validation loss of the first model, one point per epoch.
loss_traces = [
    go.Scatter(
        y=history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("loss", "Training loss", color_chart[0]),
        ("val_loss", "Validation loss", color_chart[1]),
    )
]
fig = go.Figure(data=loss_traces)
fig.update_layout(
    title="Training and validation on the loss function",
    xaxis_title="epochs",
    yaxis_title="Cross Entropy",
)
fig.show()

*The results from this first model are good. The model continuously learns from the training examples and starts overfitting after about epoch 20. This is a textbook example of model training.*

*In addition, the validation loss (binary cross-entropy, not MSE) is around 0.08.*

In [None]:
# Training vs. validation accuracy of the first model, one point per epoch.
# The metric keys are "binary_accuracy" / "val_binary_accuracy" because this model
# was compiled with tf.keras.metrics.BinaryAccuracy().
accuracy_traces = [
    go.Scatter(
        y=history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("binary_accuracy", "Training accuracy", color_chart[0]),
        ("val_binary_accuracy", "Validation accuracy", color_chart[1]),
    )
]
fig = go.Figure(data=accuracy_traces)
fig.update_layout(
    title="Training and Validation Accuracy",
    xaxis_title="epochs",
    yaxis_title="Accuracy",
)
fig.show()

### *GRU*

In [None]:
vocab_size = tokenizer.num_words
model_gru = tf.keras.Sequential([
    # mask_zero=True marks index 0 (the value used for padding) as "no token", so the
    # recurrent layers skip the padded time steps instead of processing a long run of
    # zeros at the end of every message.
    # The sequence length is read from the padded array rather than from the `msg`
    # batch variable left over from an earlier cell.
    Embedding(vocab_size + 1, 8, input_shape=[msg_pad.shape[1]], mask_zero=True, name="embedding"),
    GRU(units=64, return_sequences=True),   # one output per time step, fed to the next GRU
    GRU(units=32, return_sequences=False),  # only the last output
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    # Single sigmoid unit (binary classification, 1 = spam).
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print the GRU model's layers with their output shapes and parameter counts.
model_gru.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 74, 8)             8008      
                                                                 
 gru_2 (GRU)                 (None, 74, 64)            14208     
                                                                 
 gru_3 (GRU)                 (None, 32)                9408      
                                                                 
 dense_5 (Dense)             (None, 16)                528       
                                                                 
 dense_6 (Dense)             (None, 8)                 136       
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
Total params: 32,297
Trainable params: 32,297
Non-trai

In [None]:
# Same training setup as the first model, given through Keras string shortcuts.
model_gru.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the GRU model for 20 epochs, validating after each epoch.
gru_history = model_gru.fit(train_batch, validation_data=val_batch, epochs=20)

# evaluate() returns the loss followed by the compiled metric (accuracy).
val_loss_gru, val_acc_gru = model_gru.evaluate(xval, yval)
print("Val accuracy gru:", val_acc_gru)
print("Val loss gru:", val_loss_gru)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Val accuracy gru: 0.8630383014678955
Val loss gru: 0.3994116187095642


In [None]:
# Training vs. validation loss of the GRU model, one point per epoch.
gru_loss_traces = [
    go.Scatter(
        y=gru_history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("loss", "Training loss", color_chart[0]),
        ("val_loss", "Validation loss", color_chart[1]),
    )
]
fig = go.Figure(data=gru_loss_traces)
fig.update_layout(
    title="Training and validation on the loss function",
    xaxis_title="epochs",
    yaxis_title="Cross Entropy",
)
fig.show()

In [None]:
# Training vs. validation accuracy of the GRU model, one point per epoch.
gru_accuracy_traces = [
    go.Scatter(
        y=gru_history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("accuracy", "Training accuracy", color_chart[0]),
        ("val_accuracy", "Validation accuracy", color_chart[1]),
    )
]
fig = go.Figure(data=gru_accuracy_traces)
fig.update_layout(
    title="Training and Validation Accuracy",
    xaxis_title="epochs",
    yaxis_title="Accuracy",
)
fig.show()

### *LSTM*

In [None]:
vocab_size = tokenizer.num_words
model_lstm = tf.keras.Sequential([
    # mask_zero=True marks index 0 (the value used for padding) as "no token", so the
    # recurrent layers skip the padded time steps instead of processing a long run of
    # zeros at the end of every message.
    # The sequence length is read from the padded array rather than from the `msg`
    # batch variable left over from an earlier cell.
    Embedding(vocab_size + 1, 8, input_shape=[msg_pad.shape[1]], mask_zero=True, name="embedding"),
    LSTM(units=64, return_sequences=True),   # one output per time step, fed to the next LSTM
    LSTM(units=32, return_sequences=False),  # only the last output
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    # Single sigmoid unit (binary classification, 1 = spam).
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print the LSTM model's layers with their output shapes and parameter counts.
model_lstm.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 74, 8)             8008      
                                                                 
 lstm_4 (LSTM)               (None, 74, 64)            18688     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_14 (Dense)            (None, 16)                528       
                                                                 
 dense_15 (Dense)            (None, 8)                 136       
                                                                 
 dense_16 (Dense)            (None, 1)                 9         
                                                                 
Total params: 39,785
Trainable params: 39,785
Non-trai

In [None]:
# Same training setup as the GRU model, given through Keras string shortcuts.
model_lstm.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the LSTM model for 20 epochs, validating after each epoch.
lstm_history = model_lstm.fit(train_batch, validation_data=val_batch, epochs=20)

# evaluate() returns the loss followed by the compiled metric (accuracy).
val_loss_lstm, val_acc_lstm = model_lstm.evaluate(xval, yval)
print("Val accuracy lstm:", val_acc_lstm)
print("Val loss lstm:", val_loss_lstm)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Val accuracy lstm: 0.9246411323547363
Val loss lstm: 0.2627447247505188


In [None]:
# Training vs. validation loss of the LSTM model, one point per epoch.
lstm_loss_traces = [
    go.Scatter(
        y=lstm_history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("loss", "Training loss", color_chart[0]),
        ("val_loss", "Validation loss", color_chart[1]),
    )
]
fig = go.Figure(data=lstm_loss_traces)
fig.update_layout(
    title="Training and validation on the loss function",
    xaxis_title="epochs",
    yaxis_title="Cross Entropy",
)
fig.show()

In [None]:
# Training vs. validation accuracy of the LSTM model, one point per epoch.
lstm_accuracy_traces = [
    go.Scatter(
        y=lstm_history.history[key],
        name=label,
        mode="lines",
        marker=dict(color=colour),
    )
    for key, label, colour in (
        ("accuracy", "Training accuracy", color_chart[0]),
        ("val_accuracy", "Validation accuracy", color_chart[1]),
    )
]
fig = go.Figure(data=lstm_accuracy_traces)
fig.update_layout(
    title="Training and Validation Accuracy",
    xaxis_title="epochs",
    yaxis_title="Accuracy",
)
fig.show()

### *Model comparison*

In [None]:
# First model (stored under the "simple_RNN" names): validation metrics from evaluate().
print('Val accuracy simple_RNN:', val_acc_simple_RNN)
print('Val loss simple_RNN:', val_loss_simple_RNN)

Val accuracy simple_RNN: 0.9832535982131958
Val loss simple_RNN: 0.08369994163513184


In [None]:
# GRU model: validation metrics from evaluate().
print("Val accuracy gru:", val_acc_gru)
print("Val loss gru:", val_loss_gru)

Val accuracy gru: 0.8630383014678955
Val loss gru: 0.3996928334236145


In [None]:
# LSTM model: validation metrics from evaluate().
# The labels name the LSTM so they match the variables being printed.
print("Val accuracy lstm:", val_acc_lstm)
print("Val loss lstm:", val_loss_lstm)

Val accuracy gru: 0.9246411323547363
Val loss gru: 0.2627447247505188


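*In these runs, we detect spam better with the first model (named "simple RNN", although it is an embedding followed by average pooling) than with the GRU and LSTM models. Note that the GRU's validation accuracy (about 0.86) is close to the share of ham messages in the dataset (4825 / 5572 ≈ 0.87), so that model appears to do little better than always predicting "ham".*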
