# Import the necessary packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


## Get all 3 models

In [None]:
# Load the three saved Keras classifiers, in order:
# bag-of-words, word-embedding, and word-embedding + LSTM.
modelBoW, modelWE, modelLSTM = map(
    keras.models.load_model,
    ('h5df/model.BOW', 'h5df/model.WE', 'h5df/model.WE_LSTM'),
)

### Test with one or more examples!
* Put each message you want to classify inside the **generated_text** list, wrapped in **double quotes**
* Then call the predict function, passing the model you want to use, to get the predictions
* Three models are available: modelBoW, modelWE, modelLSTM

In [3]:
# Example messages to classify.
generated_text = [
    "You just won free tickets ! call us FREE",
    "I don't feel that good today, do you think we can report our call ?",
    "Do you think you will be able to come to my party tonight ?",
    "gz! congratulation! sms YES on 54233 to win, free",
    "Your 2004 account for 07XXXXXXXXX shows 786",
    "complimentary c UR awarded a trip to EuroDisin Trav Urgent",
]

In [4]:
def predict(model, generated_text = generated_text, maxlen=150, threshold=0.5, with_labels=False):
    """Score one or more messages with one of the loaded models.

    Parameters
    ----------
    model : one of ``modelBoW``, ``modelWE`` or ``modelLSTM``
        Selects both the network and the matching text preprocessing.
    generated_text : list of str, or a single str
        Messages to score. A bare string is treated as one message.
        Note the default is bound when this cell is executed: re-assigning
        the global ``generated_text`` afterwards does not change it unless
        this cell is re-run, so prefer passing the texts explicitly.
    maxlen : int
        Padded sequence length for the tokenizer-based models (WE / LSTM).
        Ignored for the bag-of-words model.
    threshold : float
        Score above which a message is labelled "SPAM" (only used when
        ``with_labels`` is true).
    with_labels : bool
        When true, insert a "SPAM"/"HAM" column between score and text.

    Returns
    -------
    numpy.ndarray
        One row per message: ``[score, text]``, or ``[score, label, text]``
        when ``with_labels`` is true. Every model, including the LSTM,
        returns the array (nothing is printed).

    Raises
    ------
    ValueError
        If ``model`` is not one of the three known models.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(generated_text, str):
        generated_text = [generated_text]

    # Identity comparison: we are asking "which loaded model object is this".
    uses_sequences = model is modelLSTM or model is modelWE
    if not uses_sequences and model is not modelBoW:
        raise ValueError("model must be one of modelBoW, modelWE or modelLSTM")

    # NOTE(security): pickle.load can execute arbitrary code; only use a
    # 'pickle/XY.data' file that you produced yourself.
    with open('pickle/XY.data', 'rb') as f:
        sentences_train, _sentences_test, _y_train, _y_test = pickle.load(f)

    # The preprocessing is re-fitted on the training sentences at every call.
    # Presumably this mirrors what was done at training time -- confirm
    # against the training notebook.
    if uses_sequences:
        tokenizer = Tokenizer(num_words=1000,
                              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                              lower=True,
                              split=' ',
                              char_level=False)
        tokenizer.fit_on_texts(sentences_train)
        features = pad_sequences(tokenizer.texts_to_sequences(generated_text),
                                 maxlen=maxlen)
    else:
        vectorizer = CountVectorizer(stop_words = 'english', lowercase = True)
        vectorizer.fit(sentences_train)
        features = vectorizer.transform(generated_text)

    preds = model.predict(features)

    # Turn the texts into a column so they can sit next to the scores.
    texts = np.array(generated_text)[:, None]

    columns = [preds]
    if with_labels:
        columns.append(np.where(preds > threshold, "SPAM", "HAM"))
    columns.append(texts)
    return np.concatenate(columns, axis=1)
        

In [5]:
# Classify the example messages with the LSTM model.
predict(model=modelLSTM, generated_text=generated_text)

[['0.8749161' 'You just won free tickets ! call us FREE']
 ['0.0007192903'
  "I don't feel that good today, do you think we can report our call ?"]
 ['0.00044878424'
  'Do you think you will be able to come to my party tonight ?']
 ['0.611833' 'gz! congratulation! sms YES on 54233 to win, free']
 ['0.057145834' 'Your 2004 account for 07XXXXXXXXX shows 786']
 ['0.9603834'
  'complimentary c UR awarded a trip to EuroDisin Trav Urgent']]


### Results
Judging by the predictions on our generated examples, the model does the job **pretty well**.
This is encouraging, given that we had only 5000 observations to train our model.

A possible extension would be to use a pre-trained model (BERT, Word2Vec, etc.) to get better results, or to generate more training data ourselves.