In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import keras
import tensorflow_hub as hub
import tensorflow as tf
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Read the SMS spam dataset (decoded as latin-1) and preview the first rows.
csv_path = "spam.csv"
data = pd.read_csv(csv_path, encoding="latin-1")
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Column "v1" carries the ham/spam label, column "v2" the raw message text.
y, X = data["v1"].values, data["v2"].values

In [4]:
# Learn the label -> integer mapping, then show the classes in encoded order
# (the position of a class in this array is its integer id).
le = LabelEncoder().fit(y)
le.classes_

array(['ham', 'spam'], dtype=object)

In [5]:
# Integer class ids (index into le.classes_) for every message ...
y_transformed = le.transform(y)
# ... and their one-hot expansion, matching the 2-unit softmax output of the
# network and the categorical cross-entropy loss used when compiling it.
y_cat = keras.utils.to_categorical(y_transformed)

In [6]:
# Inspect the one-hot labels; following the order of le.classes_, column 0 is
# "ham" and column 1 is "spam".
y_cat

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [7]:
# TF-Hub handle of the Universal Sentence Encoder (large variant, version 3).
url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# Instantiate the hub module; UniversalEncoder calls it to embed the text.
embed = hub.Module(url)

INFO:tensorflow:Using C:\Users\iko\AppData\Local\Temp\tfhub_modules to cache modules.
Instructions for updating:
Colocations handled automatically by placer.


In [8]:
def UniversalEncoder(x):
    """Embed a batch of raw strings with the hub module ``embed``.

    Parameters
    ----------
    x : tensor
        Batch of texts as delivered by the Keras ``Input(shape=(1,))`` layer,
        i.e. shaped ``(batch, 1)``.

    Returns
    -------
    tensor
        The ``"default"`` entry of the module's output dictionary (the
        sentence embeddings consumed by the dense layers).
    """
    # Squeeze only the trailing feature axis. A bare tf.squeeze() would also
    # drop the batch axis whenever a batch holds a single sample, handing the
    # module a scalar string instead of a 1-D batch of strings.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=[1])
    return embed(sentences, signature="default", as_dict=True)["default"]

In [9]:
# Positional split: the first 5000 messages train the network, the remainder
# is held out for testing. Labels here are the one-hot rows of y_cat.
X_train, X_test = np.asarray(X[:5000]), np.asarray(X[5000:])
y_train, y_test = np.asarray(y_cat[:5000]), np.asarray(y_cat[5000:])

In [29]:
# Building the model: sentence embeddings -> small dense classifier.

from keras.layers import Lambda, Input, Dense
from keras.models import Model

# One raw string per sample; the Lambda layer turns it into an embedding
# vector (declared as 512-dimensional) by calling UniversalEncoder.
text_input = Input(shape = (1, ), dtype = tf.string)
encoded_text = Lambda(UniversalEncoder, output_shape = (512, ))(text_input)
layer1 = Dense(32, activation = "relu")(encoded_text)
# Two softmax units, one per one-hot label column.
layer2 = Dense(2, activation = "softmax")(layer1)
model = Model(inputs = [text_input], outputs = layer2)

optimizer = keras.optimizers.Adam(lr = 0.01)
model.compile(optimizer = optimizer,
             loss = "categorical_crossentropy",
             metrics = ['accuracy'])

# Train inside an explicit TF session registered with the Keras backend.
# Variables and lookup tables are initialized before fitting, and the weights
# are written to disk before the session closes so that the prediction cell
# can restore them in a new session.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    history = model.fit(X_train, y_train, epochs= 3, batch_size= 32)
    model.save_weights("./model.h5")

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [30]:
# Score the held-out messages in a fresh session: initialize variables and
# lookup tables again, then restore the weights saved by the training cell
# before predicting.
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights("./model.h5")
    preds = model.predict(X_test, batch_size = 32)

In [31]:
# Pick the highest-scoring output column per row and map the resulting
# integer ids back to the original string labels.
predicted_ids = np.argmax(preds, axis=1)
predicted = le.inverse_transform(predicted_ids)

In [24]:
from sklearn import metrics

In [32]:
# Accuracy of the neural model on the held-out tail of the dataset.
metrics.accuracy_score(y_true=y[5000:], y_pred=predicted)

0.9807692307692307

In [33]:
# Per-class precision / recall / F1 for the neural model.
nn_report = metrics.classification_report(y_true=y[5000:], y_pred=predicted)
print(nn_report)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       498
        spam       0.91      0.95      0.93        74

   micro avg       0.98      0.98      0.98       572
   macro avg       0.95      0.97      0.96       572
weighted avg       0.98      0.98      0.98       572



In [34]:
# Confusion matrix for the neural model (rows: true label, columns: predicted).
nn_confusion = metrics.confusion_matrix(y_true=y[5000:], y_pred=predicted)
print(nn_confusion)

[[491   7]
 [  4  70]]


In [57]:
# Benchmark: TF-IDF features fed into a random forest classifier.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# NOTE(review): `tf` is also the alias tensorflow is imported under at the top
# of the notebook. Rebinding it here means earlier cells that call tf.* cannot
# be re-run afterwards without re-importing tensorflow. The name is kept
# because a later cell reads `tf` to build its own pipeline.
tf = TfidfVectorizer()
rf = RandomForestClassifier(n_estimators=150, random_state=42)

pipeline = Pipeline(steps=[("tfiddf", tf), ("rforest", rf)])

# Same positional 5000 / rest split as the neural model, but with integer
# class ids instead of one-hot rows as targets.
X_train, X_test = X[:5000], X[5000:]
y_train, y_test = y_transformed[:5000], y_transformed[5000:]
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfiddf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...ators=150, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [53]:
# Random-forest predictions for the held-out messages. This reuses the name
# `preds`, replacing the neural network's output stored under it earlier.
preds = pipeline.predict(X_test)

In [54]:
# Accuracy of the TF-IDF + random forest benchmark on the held-out messages.
metrics.accuracy_score(y_true=y_test, y_pred=preds)

0.9807692307692307

In [55]:
# Per-class precision / recall / F1 for the random forest benchmark
# (labels appear as the integer ids produced by the label encoder).
rf_report = metrics.classification_report(y_true=y_test, y_pred=preds)
print(rf_report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       498
           1       1.00      0.85      0.92        74

   micro avg       0.98      0.98      0.98       572
   macro avg       0.99      0.93      0.95       572
weighted avg       0.98      0.98      0.98       572



In [56]:
# Confusion matrix for the random forest benchmark.
rf_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=preds)
print(rf_confusion)

[[498   0]
 [ 11  63]]


In [58]:
# Second benchmark: TF-IDF features fed into an AdaBoost classifier.
ada = AdaBoostClassifier(n_estimators=150, random_state=42)

# Give this pipeline its own vectorizer. Pipeline does not clone its steps, so
# reusing the instance already held by `pipeline` would refit that pipeline's
# vectorizer as a side effect of fitting this one. It also keeps this cell
# independent of the name `tf`, which the notebook additionally uses as the
# tensorflow alias.
pipeline2 = Pipeline([('tfiddf', TfidfVectorizer()),
                      ("adaboost", ada)])
pipeline2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfiddf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...hm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=150, random_state=42))])

In [60]:
# Evaluate the AdaBoost benchmark on the held-out messages.
preds2 = pipeline2.predict(X_test)
# Only a cell's last expression is echoed, so the accuracy has to be printed
# explicitly; as a bare expression on a middle line it was silently discarded.
print(metrics.accuracy_score(y_test, preds2))
print(metrics.confusion_matrix(y_test, preds2))

[[496   2]
 [ 10  64]]
