In [None]:
import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn import metrics

from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
## keras-tuner does hyper parameter tuning for Keras models (CNN)
!pip install keras-tuner

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |█████▏                          | 10kB 14.1MB/s eta 0:00:01[K     |██████████▍                     | 20kB 14.3MB/s eta 0:00:01[K     |███████████████▋                | 30kB 9.5MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 8.1MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 5.2MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 5.8MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.4MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.wh

In [None]:
from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

In [None]:
# Mount Google Drive at /content/gdrive; every data, embedding and model path below lives under it (Colab only).
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


# Reading in Data Sources

In [None]:
## Directory holding the pre-split, pre-cleaned scam dataset pickles
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM'

## NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load files from a trusted source.

## Full features
X_train_full = pd.read_pickle(f'{DATA_DIR}/X_train_fSC.pkl')
X_test_full = pd.read_pickle(f'{DATA_DIR}/X_test_fSC.pkl')

## Extract text for embedding; any missing text (NaN / None) becomes an empty string
## so the tokenizer only ever receives str values.
X_train = X_train_full['cleaned_text'].fillna('').values
X_test = X_test_full['cleaned_text'].fillna('').values

## target label
y_train = pd.read_pickle(f'{DATA_DIR}/y_train_fSC.pkl')
y_test = pd.read_pickle(f'{DATA_DIR}/y_test_fSC.pkl')


# Tokenize Text

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# build the word -> integer index from the training text only
t = Tokenizer()
t.fit_on_texts(X_train)
vocab_map = t.word_index
# +1 because the embedding matrices below are sized with one extra row beyond the word indices
vocab_size = len(vocab_map) + 1

# convert each document into its sequence of word indices
encoded_docs_train = t.texts_to_sequences(X_train)
encoded_docs_test = t.texts_to_sequences(X_test)

# the longest document across both splits fixes the padded sequence length
longest_train = max(len(seq) for seq in encoded_docs_train)
longest_test = max(len(seq) for seq in encoded_docs_test)
max_length = max(longest_train, longest_test)
print("vocab_size: " + str(vocab_size))
print("max_length: " + str(max_length))

# pad at the end ('post') so every sequence has exactly max_length entries
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')


print("len of padded_docs: " + str(len(padded_docs_train)))

vocab_size: 4632
max_length: 157
len of padded_docs: 1519


In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a binary-classification evaluation report.

  Prints the sklearn classification report, the confusion matrix, accuracy,
  ROC AUC, F1 score, false positive rate, sensitivity and specificity.

  Args:
    y_actual: ground-truth binary labels.
    y_pred: predicted class labels, aligned with y_actual.
    y_prob: predicted positive-class scores, used only for the ROC AUC.

  Raises:
    ValueError: if the confusion matrix is not 2x2 (the four-way unpacking
      below needs both classes to be present).
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Built once, and from y_actual: previously the rates below were computed against
  # the global y_test, which was wrong whenever another split was passed in.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # sklearn orders the flattened 2x2 matrix as TN, FP, FN, TP
  TN, FP, FN, TP = cm.ravel()
  FPR = FP/(FP+TN)
  sensitivity = TP/ (TP + FN)
  specificity = TN/ (TN + FP)
  print("False Postive Rate: " + str(FPR) )
  print("Sensitivity: " + str(sensitivity) )
  print("Specificity: " + str(specificity) + "\n")

from keras import backend as K

def recall_m(y_true, y_pred):
    """Keras-backend recall: rounded true positives over rounded actual positives.

    K.epsilon() is added to the denominator so a batch with no positives
    does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Keras-backend precision: rounded true positives over rounded predicted positives.

    K.epsilon() is added to the denominator so a batch with no positive
    predictions does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """Keras-backend F1: harmonic mean of precision_m and recall_m (epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

# Creating Embedding Matrix for GLOVE

In [None]:
# load the whole Glove embedding into memory as {word: float32 vector}
embeddings_index = dict()
# `with` closes the file even if a line fails to parse; the encoding is explicit
# so the read does not depend on the platform default.
with open("/content/gdrive/My Drive/BT4222/Codes/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        # each line is: <word> <coef_1> ... <coef_n>, whitespace separated
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
## Build the Glove weight matrix for the CNN embedding layer:
## row i holds the vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in t.word_index.items():
    # words without a Glove vector keep their all-zero row
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

print(embedding_matrix.shape)

(4632, 300)


# Hyperparameter Tuning for GLOVE CNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model(hp):
    """Build and compile one candidate CNN on top of the frozen Glove embedding.

    `hp` is the keras-tuner hyperparameter container; it supplies the number of
    convolution filters, the kernel size, the dense layer width and the Adam
    learning rate. Returns the compiled Sequential model.
    """
    model = Sequential()

    # pre-computed embedding weights, kept fixed during training
    embedding = Embedding(vocab_size, 300, weights=[embedding_matrix],
                          input_length=max_length, trainable=False)
    model.add(embedding)

    n_filters = hp.Int('conv_1_filter', min_value = 32, max_value = 256, step = 16)
    kernel = hp.Choice('conv_1_kernel', values = [3,5])
    model.add(Conv1D(filters=n_filters, kernel_size=kernel, activation='relu'))

    model.add(GlobalMaxPooling1D())
    model.add(Flatten())

    dense_units = hp.Int('dense_1_units', min_value=3, max_value = 15, step = 3)
    model.add(Dense(units=dense_units, activation='relu'))

    # single sigmoid unit for the binary target
    model.add(Dense(1, activation='sigmoid'))

    learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3])
    model.compile(optimizer=Adam(learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy', f1_m, precision_m, recall_m])

    return model

# Training Best Model for GLOVE CNN

In [None]:
## Random search over the build_model hyperparameters: 50 trials, ranked by validation accuracy
tuner_search = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=50,
    directory='my_dir',
    project_name='CNN_glove',
    seed=1,
)

## Each trial holds out 25% of the training data for validation and stops
## once val_loss has not improved for 5 epochs (200 epochs at most).
early_stop = EarlyStopping(monitor='val_loss', patience=5)
tuner_search.search(
    padded_docs_train,
    y_train,
    epochs=200,
    callbacks=[early_stop],
    validation_split=0.25,
)

## Original note: tuning took 20min 05s (the recorded output below reports 42m 50s for its run)

Trial 50 Complete [00h 00m 50s]
val_accuracy: 0.9789473414421082

Best val_accuracy So Far: 0.9868420958518982
Total elapsed time: 00h 42m 50s
INFO:tensorflow:Oracle triggered exit


In [None]:
## Observe best hyperparameters
# Keep the single best model returned by the tuner for evaluation below.
model = tuner_search.get_best_models(num_models=1)[0]
# Print the hyperparameters and score of the top trials.
tuner_search.results_summary()

Results summary
Results in my_dir/CNN_glove
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
conv_1_filter: 176
conv_1_kernel: 3
dense_1_units: 3
learning_rate: 0.01
Score: 0.9868420958518982
Trial summary
Hyperparameters:
conv_1_filter: 176
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.01
Score: 0.9868420958518982
Trial summary
Hyperparameters:
conv_1_filter: 176
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0.01
Score: 0.9842105507850647
Trial summary
Hyperparameters:
conv_1_filter: 64
conv_1_kernel: 5
dense_1_units: 3
learning_rate: 0.01
Score: 0.9842105507850647
Trial summary
Hyperparameters:
conv_1_filter: 160
conv_1_kernel: 5
dense_1_units: 3
learning_rate: 0.01
Score: 0.9815789461135864
Trial summary
Hyperparameters:
conv_1_filter: 128
conv_1_kernel: 3
dense_1_units: 3
learning_rate: 0.001
Score: 0.9815789461135864
Trial summary
Hyperparameters:
conv_1_filter: 80
conv_1_kernel: 3
dense_1_units: 15
learning_rate: 0.01
S

In [None]:
## Test Results for CNN-Glove
## Sequential.predict_classes / predict_proba are deprecated and absent from newer TensorFlow releases.
## predict() returns the sigmoid probabilities, and thresholding them at 0.5 gives the same
## labels predict_classes produced for a single-unit output (with one forward pass instead of two).
y_test_prob = model.predict(padded_docs_test)
y_test_pred = (y_test_prob > 0.5).astype("int32")
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)




Test Results:
              precision    recall  f1-score   support

           0      0.973     0.970     0.972       265
           1      0.984     0.986     0.985       484

    accuracy                          0.980       749
   macro avg      0.978     0.978     0.978       749
weighted avg      0.980     0.980     0.980       749

[[257   8]
 [  7 477]]
Accuracy: 0.9799732977303071
AUC_ROC: 0.9948386090753157
f1 score: 0.9845201238390092
False Postive Rate: 0.03018867924528302
Sensitivity: 0.9855371900826446
Specificity: 0.969811320754717



In [None]:
# Persist the tuned Glove CNN to Drive; it is reloaded from this same path in the "Loaded Models Performance" section.
model.save('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_glove_scam')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_glove_scam/assets


# Creating Embedding Matrix for FastText

In [None]:
# loading the whole FastText embedding
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
# limit=400000 caps how many vectors are read from the .vec file.
# NOTE(review): this path spells the folder 'MyDrive' while every other cell uses 'My Drive' -- confirm both resolve on the mounted drive.
cc = KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/BT4222/Codes/crawl-300d-2M.vec', limit=400000)
# NOTE(review): `vocab` is not referenced again in the visible cells; the `.vocab` attribute presumably requires gensim < 4 -- confirm.
vocab = cc.vocab

In [None]:
##create Fasttext embedding matrix for the CNN layer
## Row i holds the FastText vector of the word whose tokenizer index is i.
cc_word_vector_matrix = np.zeros((len(vocab_map)+1 , 300))

print(cc_word_vector_matrix.shape)

for word, index in vocab_map.items():
    try:
        cc_word_vector_matrix[index] = cc.word_vec(word)
    except KeyError:
        # Word absent from the loaded FastText vectors: leave its row at zero and report it.
        # Only KeyError is caught so real failures (shape mismatch, interrupt, ...) still surface.
        print(word)

print(cc_word_vector_matrix.shape)

(4632, 300)
nonsensitive
francisca
diesis
inaccordance
scotia
dyker
woodside
selah
verst
covid
kra
robbin
quickset
quahog
doum
taha
clodhopper
cockspur
boga
alate
undersign
rockaway
noncorporate
fub
antihypnotic
braggardism
antiphonetic
anoestrum
antipass
backwort
antipolygamy
baldhead
asporogenic
apterygote
individua
indurate
covent
misset
mogador
outthrough
notan
garvey
thave
gallerylike
annal
outwell
beata
curite
somers
meros
cableman
auricula
albuginitis
actiniochrome
aortomalacia
antihalation
carburometer
agaricic
blastid
affricated
catacromyodian
lansdowne
crossway
semidetached
bestare
hamal
albe
deathin
matti
dilapidate
loma
verd
obispo
hadji
omphalos
akra
semiprivate
wamus
disconsider
crowberry
pensy
parklike
fono
eradicable
reregistration
buba
unrenovated
plica
stockholding
housekeep
semiduplex
(4632, 300)


# Hyperparameter Tuning for FastText CNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model_cc(hp):
    """Build and compile one candidate CNN on top of the frozen FastText embedding.

    Same search space as the Glove variant: `hp` (keras-tuner hyperparameters)
    supplies the convolution filter count, kernel size, dense layer width and
    Adam learning rate. Returns the compiled Sequential model.
    """
    model = Sequential()

    # FastText weight matrix, kept fixed during training
    embedding = Embedding(vocab_size, 300, weights=[cc_word_vector_matrix],
                          input_length=max_length, trainable=False)
    model.add(embedding)

    n_filters = hp.Int('conv_1_filter', min_value = 32, max_value = 256, step = 16)
    kernel = hp.Choice('conv_1_kernel', values = [3,5])
    model.add(Conv1D(filters=n_filters, kernel_size=kernel, activation='relu'))

    model.add(GlobalMaxPooling1D())
    model.add(Flatten())

    dense_units = hp.Int('dense_1_units', min_value=3, max_value = 15, step = 3)
    model.add(Dense(units=dense_units, activation='relu'))

    # single sigmoid unit for the binary target
    model.add(Dense(1, activation='sigmoid'))

    learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3])
    model.compile(optimizer=Adam(learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy', f1_m, precision_m, recall_m])

    return model

# Training Best Model for FastText CNN

In [None]:
## Random search over the build_model_cc hyperparameters: 50 trials, ranked by validation accuracy
tuner_search2 = RandomSearch(
    build_model_cc,
    objective='val_accuracy',
    max_trials=50,
    directory='my_dir',
    project_name='CNN_cc',
    seed=1,
)

## Each trial holds out 25% of the training data for validation and stops
## once val_loss has not improved for 5 epochs (200 epochs at most).
early_stop = EarlyStopping(monitor='val_loss', patience=5)
tuner_search2.search(
    padded_docs_train,
    y_train,
    epochs=200,
    callbacks=[early_stop],
    validation_split=0.25,
)



Trial 50 Complete [00h 01m 14s]
val_accuracy: 0.9868420958518982

Best val_accuracy So Far: 0.9947368502616882
Total elapsed time: 00h 58m 06s
INFO:tensorflow:Oracle triggered exit


In [None]:
## Observe best hyperparameters
# Keep the single best FastText model returned by the tuner for evaluation below.
model2 = tuner_search2.get_best_models(num_models=1)[0]
# Print the hyperparameters and score of the top trials.
tuner_search2.results_summary()

Results summary
Results in my_dir/CNN_cc
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
conv_1_filter: 160
conv_1_kernel: 5
dense_1_units: 3
learning_rate: 0.01
Score: 0.9947368502616882
Trial summary
Hyperparameters:
conv_1_filter: 176
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0.01
Score: 0.99210524559021
Trial summary
Hyperparameters:
conv_1_filter: 192
conv_1_kernel: 3
dense_1_units: 6
learning_rate: 0.01
Score: 0.99210524559021
Trial summary
Hyperparameters:
conv_1_filter: 176
conv_1_kernel: 3
dense_1_units: 3
learning_rate: 0.01
Score: 0.9894737005233765
Trial summary
Hyperparameters:
conv_1_filter: 80
conv_1_kernel: 3
dense_1_units: 15
learning_rate: 0.01
Score: 0.9894737005233765
Trial summary
Hyperparameters:
conv_1_filter: 144
conv_1_kernel: 3
dense_1_units: 9
learning_rate: 0.01
Score: 0.9894737005233765
Trial summary
Hyperparameters:
conv_1_filter: 256
conv_1_kernel: 3
dense_1_units: 15
learning_rate: 0.01
Score: 

In [None]:
## Test Results for CNN-FastText
## Sequential.predict_classes / predict_proba are deprecated and absent from newer TensorFlow releases.
## predict() returns the sigmoid probabilities, and thresholding them at 0.5 gives the same
## labels predict_classes produced for a single-unit output (with one forward pass instead of two).
y_test_prob2 = model2.predict(padded_docs_test)
y_test_pred2 = (y_test_prob2 > 0.5).astype("int32")
print("Test Results:")
show_results(y_test, y_test_pred2, y_test_prob2)




Test Results:
              precision    recall  f1-score   support

           0      0.970     0.977     0.974       265
           1      0.988     0.983     0.986       484

    accuracy                          0.981       749
   macro avg      0.979     0.980     0.980       749
weighted avg      0.981     0.981     0.981       749

[[259   6]
 [  8 476]]
Accuracy: 0.9813084112149533
AUC_ROC: 0.993840636207703
f1 score: 0.9855072463768116
False Postive Rate: 0.022641509433962263
Sensitivity: 0.9834710743801653
Specificity: 0.9773584905660377



In [None]:
# Persist the tuned FastText CNN to Drive; it is reloaded from this same path in the "Loaded Models Performance" section.
model2.save('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_fasttext_scam')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_fasttext_scam/assets


# Loaded Models Performance
*(Run the cells up to and including the "Tokenize Text" section, skip everything from the embedding sections onwards, then resume here to evaluate the saved, tuned models.)*

In [None]:
# Custom metric functions the saved models were compiled with, keyed by name,
# handed to load_model as custom_objects.
dependencies = dict(
    recall_m=recall_m,
    precision_m=precision_m,
    f1_m=f1_m,
)

In [None]:
from tensorflow.keras.models import load_model
# Reload both tuned CNNs from Drive; custom_objects supplies the custom metric functions they were compiled with.
loaded_cc_scam = load_model('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_fasttext_scam', custom_objects = dependencies)
loaded_glove_scam = load_model('/content/gdrive/My Drive/BT4222/Code (Final Submission)/Scam Models/Saved Models/CNN_glove_scam', custom_objects = dependencies)

In [None]:
%time loaded_y_test_pred_cc = loaded_cc_scam.predict_classes(padded_docs_test)
%time loaded_y_test_prob_cc = loaded_cc_scam.predict_proba(padded_docs_test)
print("Test Results for CC:")
show_results(y_test, loaded_y_test_pred_cc, loaded_y_test_prob_cc)



CPU times: user 1.89 s, sys: 9.65 ms, total: 1.9 s
Wall time: 1.02 s




CPU times: user 1.88 s, sys: 14.3 ms, total: 1.9 s
Wall time: 1.03 s
Test Results for CC:
              precision    recall  f1-score   support

           0      0.970     0.977     0.974       265
           1      0.988     0.983     0.986       484

    accuracy                          0.981       749
   macro avg      0.979     0.980     0.980       749
weighted avg      0.981     0.981     0.981       749

[[259   6]
 [  8 476]]
Accuracy: 0.9813084112149533
AUC_ROC: 0.993840636207703
f1 score: 0.9855072463768116
False Postive Rate: 0.022641509433962263
Sensitivity: 0.9834710743801653
Specificity: 0.9773584905660377



In [None]:
%time loaded_y_test_pred_glove = loaded_glove_scam.predict_classes(padded_docs_test)
%time loaded_y_test_prob_glove = loaded_glove_scam.predict_proba(padded_docs_test)
print("Test Results for CC:")
show_results(y_test, loaded_y_test_pred_glove, loaded_y_test_prob_glove)



CPU times: user 1.4 s, sys: 16.6 ms, total: 1.41 s
Wall time: 802 ms




CPU times: user 1.32 s, sys: 20.8 ms, total: 1.34 s
Wall time: 748 ms
Test Results for CC:
              precision    recall  f1-score   support

           0      0.973     0.970     0.972       265
           1      0.984     0.986     0.985       484

    accuracy                          0.980       749
   macro avg      0.978     0.978     0.978       749
weighted avg      0.980     0.980     0.980       749

[[257   8]
 [  7 477]]
Accuracy: 0.9799732977303071
AUC_ROC: 0.9948386090753157
f1 score: 0.9845201238390092
False Postive Rate: 0.03018867924528302
Sensitivity: 0.9855371900826446
Specificity: 0.969811320754717

