In [1]:
import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn import metrics

from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [2]:
## keras-tuner does hyper parameter tuning for Keras models (CNN)
!pip install keras-tuner

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |█████▏                          | 10kB 16.4MB/s eta 0:00:01[K     |██████████▍                     | 20kB 18.1MB/s eta 0:00:01[K     |███████████████▋                | 30kB 14.8MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 13.9MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 11.6MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 13.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 5.7MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-an

In [3]:
from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

In [4]:
# Mount Google Drive inside the Colab VM: every dataset, embedding file and saved model
# referenced below lives under /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


# Reading in Data Sources

In [5]:
## NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file -- only
## load pickles that were produced by this project's own cleaning pipeline.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/'

## Full features
X_train_full = pd.read_pickle(DATA_DIR + 'X_train_fSP.pkl')
X_test_full = pd.read_pickle(DATA_DIR + 'X_test_fSP.pkl')

## Extract text for embedding.
## Missing entries become empty strings so the tokenizer only ever receives str values;
## fillna covers NaN as well as None / pd.NA, which the previous `x != x` test did not.
X_train = X_train_full['cleaned_text'].fillna('').values
X_test = X_test_full['cleaned_text'].fillna('').values

## target label
y_train = pd.read_pickle(DATA_DIR + 'y_train_fSP.pkl')
y_test = pd.read_pickle(DATA_DIR + 'y_test_fSP.pkl')

# Tokenize Text

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word -> integer index from the training texts only
t = Tokenizer()
t.fit_on_texts(X_train)
vocab_map = t.word_index
# one extra slot on top of the indexed words (the embedding matrices below are sized with it)
vocab_size = len(vocab_map) + 1

# Convert every document of both splits into its sequence of word indices
encoded_docs_train, encoded_docs_test = (
    t.texts_to_sequences(docs) for docs in (X_train, X_test)
)

# The longest document across train and test fixes the common sequence length
max_length = max(
    max(len(seq) for seq in encoded_docs_train),
    max(len(seq) for seq in encoded_docs_test),
)
print("vocab_size: %s" % vocab_size)
print("max_length: %s" % max_length)

# Pad at the end ('post') so that every sequence has exactly max_length entries
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')


print("len of padded_docs: %s" % len(padded_docs_train))

vocab_size: 23586
max_length: 159
len of padded_docs: 22521


In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print an evaluation report for a binary classifier.

  Args:
    y_actual: ground-truth labels of the evaluated split.
    y_pred: predicted class labels, aligned with ``y_actual``.
    y_prob: predicted scores for the positive class; only used for the ROC AUC.

  Prints the classification report, the confusion matrix, accuracy, ROC AUC, F1,
  false-positive rate, sensitivity and specificity. Returns None.
  """
  # Computed once from the labels that were passed in. The rates below used to be derived
  # from the global y_test, which was wrong for any other split (e.g. train/validation).
  cm = confusion_matrix(y_actual, y_pred)

  print(classification_report(y_actual, y_pred, digits=3))
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))

  # ravel() of the 2x2 matrix yields TN, FP, FN, TP (rows = actual, columns = predicted)
  TN, FP, FN, TP = cm.ravel()
  FPR = FP/(FP+TN)
  sensitivity = TP/ (TP + FN)
  specificity = TN/ (TN + FP)
  print("False Postive Rate: " + str(FPR) )
  print("Sensitivity: " + str(sensitivity) )
  print("Specificity: " + str(specificity) + "\n")

from keras import backend as K

def recall_m(y_true, y_pred):
    """Keras-backend recall: rounded true positives over rounded actual positives.

    K.epsilon() is added to the denominator so a batch without positives does not
    divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Keras-backend precision: rounded true positives over rounded predicted positives.

    K.epsilon() is added to the denominator so a batch without positive predictions
    does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """Keras-backend F1: harmonic mean of precision_m and recall_m, epsilon-stabilised."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

# Creating Embedding Matrix for GLOVE

In [None]:
# load the whole Glove embedding into memory
# Each line is split on whitespace: the first token is the word, the remaining tokens
# are parsed as its float32 vector.
embeddings_index = dict()
# `with` closes the file even if a line fails to parse; the explicit encoding keeps
# decoding independent of the platform's default locale.
with open("/content/gdrive/My Drive/BT4222/Codes/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
##create Glove embedding matrix for the CNN layer
## Row k holds the GloVe vector of the word whose tokenizer index is k; words without a
## GloVe entry keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in t.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

print(embedding_matrix.shape)

(23586, 300)


# Hyperparameter Tuning for GLOVE CNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model(hp):
  """Build and compile one candidate GloVe CNN for the keras-tuner search.

  Tunable through ``hp``: number of convolution filters (32..256, step 16), kernel
  size (3 or 5), width of the hidden dense layer (3..15, step 3) and the Adam
  learning rate (1e-2 or 1e-3). The embedding layer is initialised from
  ``embedding_matrix`` and frozen (trainable=False).

  Returns the compiled Sequential model (binary cross-entropy loss, sigmoid output).
  """
  model = Sequential()

  # Layers are created in the order they are stacked, so the hyperparameters are
  # registered with `hp` in the order: conv filters, conv kernel, dense units.
  stack = [
      Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False),
      Conv1D(
          filters = hp.Int('conv_1_filter', min_value = 32, max_value = 256, step = 16),
          kernel_size = hp.Choice('conv_1_kernel', values = [3,5]),
          activation = 'relu'),
      GlobalMaxPooling1D(),
      Flatten(),
      Dense(units = hp.Int('dense_1_units', min_value=3, max_value = 15, step = 3),
            activation = 'relu'),
      Dense(1, activation='sigmoid'),
  ]
  for layer in stack:
    model.add(layer)

  # compile the model
  optimizer = Adam( hp.Choice('learning_rate', values = [1e-2, 1e-3]))
  model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy',f1_m,precision_m, recall_m])

  return model

# Training Best Model for GLOVE CNN

In [None]:
# Random search over build_model's hyperparameters: 50 trials ranked by validation accuracy
tuner_search = RandomSearch(
    build_model,
    objective = 'val_accuracy',
    max_trials = 50,
    directory = 'my_dir',
    project_name = 'CNN_glove',
    seed = 1,
)

# Every trial holds out 25% of the training data for validation and stops early once
# val_loss has failed to improve for 5 epochs (hard cap: 200 epochs)
tuner_search.search(
    padded_docs_train,
    y_train,
    epochs = 200,
    validation_split = 0.25,
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)],
)

## Tuning took 20min 05s

Trial 50 Complete [00h 00m 22s]
val_accuracy: 0.9344698786735535

Best val_accuracy So Far: 0.9445924162864685
Total elapsed time: 00h 20m 05s
INFO:tensorflow:Oracle triggered exit


In [None]:
## Observe best hyperparameters
## get_best_models returns a list; with num_models=1 its single entry is the top-ranked model.
## NOTE(review): presumably the model comes back with the best trial's trained weights
## rather than being retrained here -- confirm against the keras-tuner documentation.
model = tuner_search.get_best_models(num_models=1)[0]
tuner_search.results_summary()

Results summary
Results in my_dir/CNN_glove
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
conv_1_filter: 240
conv_1_kernel: 3
dense_1_units: 12
learning_rate: 0.001
Score: 0.9445924162864685
Trial summary
Hyperparameters:
conv_1_filter: 224
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.001
Score: 0.9444148540496826
Trial summary
Hyperparameters:
conv_1_filter: 144
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.001
Score: 0.9440596699714661
Trial summary
Hyperparameters:
conv_1_filter: 208
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0.001
Score: 0.9438821077346802
Trial summary
Hyperparameters:
conv_1_filter: 160
conv_1_kernel: 5
dense_1_units: 6
learning_rate: 0.001
Score: 0.942461371421814
Trial summary
Hyperparameters:
conv_1_filter: 128
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.001
Score: 0.9422838091850281
Trial summary
Hyperparameters:
conv_1_filter: 96
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0

In [None]:
## Test Results for CNN-Glove
## Sequential.predict_classes / predict_proba are deprecated (and removed in later
## TensorFlow releases) and each ran its own forward pass. Run predict() once and threshold
## the sigmoid output at 0.5, which is what predict_classes did for a single-unit output.
y_test_prob = model.predict(padded_docs_test)
y_test_pred = (y_test_prob > 0.5).astype("int32")
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)




Test Results:
              precision    recall  f1-score   support

           0      0.952     0.956     0.954      6138
           1      0.946     0.940     0.943      4955

    accuracy                          0.949     11093
   macro avg      0.949     0.948     0.949     11093
weighted avg      0.949     0.949     0.949     11093

[[5870  268]
 [ 295 4660]]
Accuracy: 0.9492472730550798
AUC_ROC: 0.9887513854734974
f1 score: 0.9430334918546999
False Postive Rate: 0.04366243075920495
Sensitivity: 0.9404641775983855
Specificity: 0.956337569240795



In [None]:
# Persist the tuned GloVe CNN to Drive; the "Loaded Models Performance" section reloads it from this path.
model.save('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_glove_spam')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_glove_spam/assets


# Creating Embedding Matrix for FastText

In [None]:
# loading the whole FastText embedding
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
# limit=400000 caps how many vectors are read from the .vec file
cc = KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/BT4222/Codes/crawl-300d-2M.vec', limit=400000)
# NOTE(review): `.vocab` looks like the gensim 3.x attribute -- confirm the installed gensim version
vocab = cc.vocab

In [None]:
##create Fasttext embedding matrix for the CNN layer
## Row k holds the FastText vector of the word whose tokenizer index is k; words missing
## from the loaded vectors are printed and keep their all-zero row.
cc_word_vector_matrix = np.zeros((len(vocab_map)+1 , 300))

print(cc_word_vector_matrix.shape)

for word, index in vocab_map.items():
    try:
        vector = cc.word_vec(word)
    except KeyError:
        # Only the "word not in vocabulary" lookup failure is expected here. The previous
        # bare `except:` also hid real errors (and KeyboardInterrupt).
        print(word)
    else:
        cc_word_vector_matrix[index] = vector

print(cc_word_vector_matrix.shape)

(23586, 300)
tammie
causey
mouton
ponton
pathed
jacoby
cornhusker
slone
pickering
vallum
quickset
nonsensitive
marcello
titman
goddard
choate
schoolcraft
crandall
scotia
osseously
doubletree
branner
affricate
martel
peaker
ordinator
haymarket
chorally
dauphine
medianly
gaddi
befog
althea
bedim
bitterroot
delphine
tawney
crowfoot
visagraph
reseda
byroad
checkerberry
covid
bereave
bronchiole
coercible
armload
adz
linder
auspex
beplaster
cosec
konstantin
kingwood
orillion
inaccordance
alliterate
afforest
seraglio
brawner
applicate
actinolite
almagest
officialize
cannel
diesis
skel
somers
areaway
nasalization
duplicable
bimetallism
monocotyledon
affiance
disgustful
destinate
aniseikonic
woodside
portman
goff
culver
basidiomycete
cowman
colza
balletomane
canvasback
linecut
brandywine
allemand
contradistinction
accentual
crossbill
francisca
petrie
acock
confiscable
therese
graywacke
turtleback
cotman
depredate
impend
mantrap
chorine
cholesterin
chrysolite
belove
agee
embrittle
sproat
conflag

# Hyperparameter Tuning for FastText CNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model_cc(hp):
  """Build and compile one candidate FastText CNN for the keras-tuner search.

  Same search space as the GloVe variant: number of convolution filters (32..256,
  step 16), kernel size (3 or 5), width of the hidden dense layer (3..15, step 3)
  and the Adam learning rate (1e-2 or 1e-3). The embedding layer is initialised
  from ``cc_word_vector_matrix`` and frozen (trainable=False).

  Returns the compiled Sequential model (binary cross-entropy loss, sigmoid output).
  """
  model = Sequential()

  # Layers are created in the order they are stacked, so the hyperparameters are
  # registered with `hp` in the order: conv filters, conv kernel, dense units.
  stack = [
      Embedding(vocab_size, 300, weights=[cc_word_vector_matrix], input_length=max_length, trainable=False),
      Conv1D(
          filters = hp.Int('conv_1_filter', min_value = 32, max_value = 256, step = 16),
          kernel_size = hp.Choice('conv_1_kernel', values = [3,5]),
          activation = 'relu'),
      GlobalMaxPooling1D(),
      Flatten(),
      Dense(units = hp.Int('dense_1_units', min_value=3, max_value = 15, step = 3),
            activation = 'relu'),
      Dense(1, activation='sigmoid'),
  ]
  for layer in stack:
    model.add(layer)

  # compile the model
  optimizer = Adam( hp.Choice('learning_rate', values = [1e-2, 1e-3]))
  model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy',f1_m,precision_m, recall_m])

  return model

# Training Best Model for FastText CNN

In [None]:
# Random search over build_model_cc's hyperparameters: 50 trials ranked by validation accuracy
tuner_search2 = RandomSearch(
    build_model_cc,
    objective = 'val_accuracy',
    max_trials = 50,
    directory = 'my_dir',
    project_name = 'CNN_cc',
    seed = 1,
)

# Every trial holds out 25% of the training data for validation and stops early once
# val_loss has failed to improve for 5 epochs (hard cap: 200 epochs)
tuner_search2.search(
    padded_docs_train,
    y_train,
    epochs = 200,
    validation_split = 0.25,
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)],
)



Trial 50 Complete [00h 00m 25s]
val_accuracy: 0.9474338293075562

Best val_accuracy So Far: 0.9561356902122498
Total elapsed time: 00h 19m 35s
INFO:tensorflow:Oracle triggered exit


In [None]:
## Observe best hyperparameters
## get_best_models returns a list; with num_models=1 its single entry is the top-ranked model.
## NOTE(review): presumably the model comes back with the best trial's trained weights
## rather than being retrained here -- confirm against the keras-tuner documentation.
model2 = tuner_search2.get_best_models(num_models=1)[0]
tuner_search2.results_summary()

Results summary
Results in my_dir/CNN_cc
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
conv_1_filter: 256
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.001
Score: 0.9561356902122498
Trial summary
Hyperparameters:
conv_1_filter: 208
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0.001
Score: 0.9548925757408142
Trial summary
Hyperparameters:
conv_1_filter: 96
conv_1_kernel: 3
dense_1_units: 15
learning_rate: 0.001
Score: 0.9538270235061646
Trial summary
Hyperparameters:
conv_1_filter: 144
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.001
Score: 0.9536494612693787
Trial summary
Hyperparameters:
conv_1_filter: 240
conv_1_kernel: 3
dense_1_units: 12
learning_rate: 0.001
Score: 0.9536494612693787
Trial summary
Hyperparameters:
conv_1_filter: 128
conv_1_kernel: 3
dense_1_units: 3
learning_rate: 0.001
Score: 0.9527614712715149
Trial summary
Hyperparameters:
conv_1_filter: 224
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.

In [None]:
## Test Results for CNN-FastText
## Sequential.predict_classes / predict_proba are deprecated (and removed in later
## TensorFlow releases) and each ran its own forward pass. Run predict() once and threshold
## the sigmoid output at 0.5, which is what predict_classes did for a single-unit output.
y_test_prob2 = model2.predict(padded_docs_test)
y_test_pred2 = (y_test_prob2 > 0.5).astype("int32")
print("Test Results:")
show_results(y_test, y_test_pred2, y_test_prob2)




Test Results:
              precision    recall  f1-score   support

           0      0.956     0.965     0.961      6138
           1      0.957     0.945     0.951      4955

    accuracy                          0.956     11093
   macro avg      0.956     0.955     0.956     11093
weighted avg      0.956     0.956     0.956     11093

[[5926  212]
 [ 275 4680]]
Accuracy: 0.9560984404579465
AUC_ROC: 0.9902113646474182
f1 score: 0.9505433126840661
False Postive Rate: 0.03453893776474422
Sensitivity: 0.9445005045408678
Specificity: 0.9654610622352557



In [None]:
# Persist the tuned FastText CNN to Drive; the "Loaded Models Performance" section reloads it from this path.
model2.save('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_fasttext_spam')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_fasttext_spam/assets


# Loaded Models Performance
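(To evaluate the saved models only: run every cell up to and including the "Tokenize Text" section, plus the cell that defines `show_results` and the custom metrics `recall_m`, `precision_m` and `f1_m`; skip everything from the embedding sections onwards, then continue from here to load and score the tuned models.)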

In [12]:
# Custom metric functions the saved models were compiled with; passed to load_model as
# custom_objects, keyed by function name.
dependencies = dict(
    recall_m=recall_m,
    precision_m=precision_m,
    f1_m=f1_m,
)

In [13]:
from tensorflow.keras.models import load_model
# Reload both tuned CNNs from Drive. custom_objects supplies the metric functions
# (recall_m, precision_m, f1_m) that the models were compiled with.
loaded_cc_spam = load_model('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_fasttext_spam', custom_objects = dependencies)
loaded_glove_spam = load_model('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Spam Models/Saved Models/CNN_glove_spam', custom_objects = dependencies)

In [15]:
%time loaded_y_test_pred_cc = loaded_cc_spam.predict_classes(padded_docs_test)
%time loaded_y_test_prob_cc = loaded_cc_spam.predict_proba(padded_docs_test)
print("Test Results for CC:")
show_results(y_test, loaded_y_test_pred_cc, loaded_y_test_prob_cc)



CPU times: user 969 ms, sys: 136 ms, total: 1.1 s
Wall time: 933 ms




CPU times: user 983 ms, sys: 131 ms, total: 1.11 s
Wall time: 925 ms
Test Results for CC:
              precision    recall  f1-score   support

           0      0.956     0.965     0.961      6138
           1      0.957     0.945     0.951      4955

    accuracy                          0.956     11093
   macro avg      0.956     0.955     0.956     11093
weighted avg      0.956     0.956     0.956     11093

[[5926  212]
 [ 275 4680]]
Accuracy: 0.9560984404579465
AUC_ROC: 0.9902113646474182
f1 score: 0.9505433126840661
False Postive Rate: 0.03453893776474422
Sensitivity: 0.9445005045408678
Specificity: 0.9654610622352557



In [16]:
%time loaded_y_test_pred_glove = loaded_glove_spam.predict_classes(padded_docs_test)
%time loaded_y_test_prob_glove = loaded_glove_spam.predict_proba(padded_docs_test)
print("Test Results for CC:")
show_results(y_test, loaded_y_test_pred_glove, loaded_y_test_prob_glove)



CPU times: user 1.05 s, sys: 151 ms, total: 1.2 s
Wall time: 1 s




CPU times: user 993 ms, sys: 124 ms, total: 1.12 s
Wall time: 941 ms
Test Results for CC:
              precision    recall  f1-score   support

           0      0.952     0.956     0.954      6138
           1      0.946     0.940     0.943      4955

    accuracy                          0.949     11093
   macro avg      0.949     0.948     0.949     11093
weighted avg      0.949     0.949     0.949     11093

[[5870  268]
 [ 295 4660]]
Accuracy: 0.9492472730550798
AUC_ROC: 0.9887513854734974
f1 score: 0.9430334918546999
False Postive Rate: 0.04366243075920495
Sensitivity: 0.9404641775983855
Specificity: 0.956337569240795

