In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Getting the data files
!wget "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv"
!wget "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv"

--2020-11-23 20:59:58--  https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/plain]
Saving to: ‘train-data.tsv’


2020-11-23 20:59:58 (49.9 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2020-11-23 20:59:59--  https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/plain]
Saving to: ‘valid-data.tsv’


2020-11-23 20:59:59 (41.1 

In [3]:
# Importing tsv data to pd dataframes
def csv_to_df():
  df_train = pd.read_csv("train-data.tsv", sep = "\t", header = None, names = ['class', 'message'])
  df_test = pd.read_csv("valid-data.tsv", sep = "\t", header = None, names = ['class','message'])
  return df_train, df_test

In [4]:
# Cleaning the data
def clean_data(df_train, df_test):
  # Handling categorical values
  class_train = df_train['class'].astype('category').cat.codes
  class_test = df_test['class'].astype('category').cat.codes
  return class_train, class_test

In [5]:
# Cleaning the messages
def clean_text(msg):
  # Text preprocessing
  english_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  msg = re.sub(r'([^\s\w])+', ' ', msg)
  msg = " ".join([lemmatizer.lemmatize(words) for words in msg.split() if not words in english_words])
  msg = msg.lower()
  return msg

In [6]:
# Preprocessing data
def preprocessing_data(msg_series):
  # Vectorization of data
  seq = tknzr.texts_to_sequences(msg_series)
  # Padding the data
  seq_matrix = sequence.pad_sequences(seq, maxlen = 500)
  return seq_matrix

In [7]:
# Building the model 
def create_model():
  model = tf.keras.models.Sequential([tf.keras.layers.Input(shape=[500]),tf.keras.layers.Embedding(1000, 50, input_length=500), tf.keras.layers.LSTM(64),tf.keras.layers.Dense(256, activation='relu'), tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(1, activation='relu') ])
  model.compile(loss='binary_crossentropy',optimizer='RMSprop',metrics=['accuracy'])
  print(model.summary())
  return model

In [8]:
# Training the model
def train_model(model, seq_matrix_train, class_train):
  # Monitoring validation loss to keep it at a minimum and stop training the model if it reaches a minimum
  model.fit(seq_matrix_train, class_train, batch_size=128, epochs=10, validation_split=0.2, callbacks= [tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001)] , verbose = 1)

In [9]:
# Evaluating the model
def evaluate_model(model, seq_matrix_test, class_test):
  eval = model.evaluate(seq_matrix_test, class_test)
  print('Loss: {:.3f}, Accuracy: {:.3f}'.format(eval[0], eval[1]))

In [10]:
# Predict messages based on model
def predict_message(model, pred_text):
  pred_text = clean_text(pred_text)
  p = model.predict(preprocessing_data(pd.Series([pred_text])))[0]
  if p<0.5:
    class_label = "ham"
  else:
    class_label = "spam" 

  return (p[0], class_label)

In [11]:
# Testing the model
def test_predictions(model):
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(model, msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

In [12]:
if __name__ == '__main__':
  df_train, df_test = csv_to_df()
  class_train, class_test = clean_data(df_train, df_test)
  msg_train = df_train['message'].apply(lambda message : clean_text(message))
  
  # Tokenizer fits on cleaned message training data and keeps 1000 frequently occuring words
  tknzr = Tokenizer(num_words = 1000)
  tknzr.fit_on_texts(msg_train)

  seq_matrix_train = preprocessing_data(msg_train)
  msg_test = df_test['message'].apply(lambda message : clean_text(message))
  seq_matrix_test = preprocessing_data(msg_test)
  model = create_model()
  print("Training the model....")
  train_model(model, seq_matrix_train, class_train)
  print("Evaluating the model....")
  evaluate_model(model, seq_matrix_test, class_test)
  print("Testing the model....")
  pred_text = "how are you doing today?"
  prediction = predict_message(model, pred_text)
  print("Prediction of text {} is : ".format(pred_text),prediction)
  predict_message(model, pred_text)
  test_predictions(model)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           50000     
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense (Dense)                (None, 256)               16640     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 96,337
Trainable params: 96,337
Non-trainable params: 0
_________________________________________________________________
None
Training the model....
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Evaluating the model....
Loss: 0.114, Accuracy