In [1]:
import pandas as pd

# The file is tab-separated and has no header row, so the two column names
# (class label, raw SMS text) are supplied explicitly.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Separate the model input (message text) from the target ("ham" / "spam").
text, class_label = data['message'], data['label']

In [3]:
import numpy as np
# `keras.utils.np_utils` is a private module path that is not available in
# recent Keras releases; `keras.utils.to_categorical` is the public location.
from keras.utils import to_categorical

# Position in this list is the integer class id: "ham" -> 0, "spam" -> 1.
classes_list = ["ham", "spam"]

# Map every string label to its class id; `list.index` raises ValueError if
# a label is not one of the known classes.
label_index = class_label.apply(classes_list.index)
label1 = np.asarray(label_index)

# One-hot encode the ids. `num_classes` is passed explicitly so the width is
# always len(classes_list), even if some class is absent from the data.
label = to_categorical(label1, num_classes=len(classes_list))

In [4]:
from keras.preprocessing.text import Tokenizer

# Lower-case each message, drop the listed punctuation / tab / newline
# characters, then split on single spaces.
tk = Tokenizer(
    lower=True,
    split=" ",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tk.fit_on_texts(text)

# word -> integer id mapping learned from the corpus, and its size.
index = tk.word_index
vocab_size = len(index)

# Every message converted to a list of word ids.
x = tk.texts_to_sequences(text)

In [5]:
from keras.preprocessing.sequence import pad_sequences

# NOTE: despite its name, this value is used as the fixed number of tokens
# per message (`maxlen` below), not as the size of the embedding vectors.
embedding_vecor_length = 32

# Bring every id sequence to the same length; shorter ones are padded at
# the end ('post').
padded_docs = pad_sequences(
    x,
    padding='post',
    maxlen=embedding_vecor_length,
)
print(padded_docs)

[[  49  471 4436 ...    0    0    0]
 [  46  336 1499 ...    0    0    0]
 [  47  489    8 ...  392 2998    0]
 ...
 [9007   60    8 ...    0    0    0]
 [   5  534  114 ...    0    0    0]
 [2687   61  465 ...    0    0    0]]


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    label,
    random_state=42,
    test_size=0.30,
)

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, GRU
from keras.models import Sequential, Model

# Network input: one fixed-length row of integer word ids per message.
lstm_input = Input(shape=(embedding_vecor_length,), dtype='int32', name='lstm_input')

# Trainable 100-dimensional embedding with vocab_size + 1 rows (the extra row
# leaves room for the 0 id that appears as padding in `padded_docs`).
# The sequence length is already fixed by `lstm_input`, so the redundant
# `input_length` argument is not passed.
# The result is deliberately NOT stored in `x`: that name holds the tokenised
# messages, and overwriting it would break a re-run of the padding cell.
embedded = Embedding(vocab_size + 1, 100, trainable=True)(lstm_input)

# Two stacked LSTMs: the first returns the full sequence so the second can
# consume it; the second returns only its final state.
sequence_features = LSTM(256, return_sequences=True)(embedded)
lstm_out = LSTM(128, return_sequences=False)(sequence_features)

# Two-way softmax over the classes, matching the one-hot targets.
main_output = Dense(2, activation='softmax', name='main_output')(lstm_out)
model = Model(inputs=lstm_input, outputs=main_output)

# A softmax output with one-hot targets pairs with categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_input (InputLayer)     [(None, 32)]              0         
                                                                 
 embedding (Embedding)       (None, 32, 100)           901000    
                                                                 
 lstm (LSTM)                 (None, 32, 256)           365568    
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 main_output (Dense)         (None, 2)                 258       
                                                                 
Total params: 1,463,946
Trainable params: 1,463,946
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Keep the returned History object: it records the per-epoch metrics, and
# capturing it stops the bare object repr from being echoed as cell output.
# NOTE(review): the held-out test split is also used as validation data here,
# so the reported validation figures are not an independent estimate if they
# are ever used for model selection.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=10,
    verbose=2,
)

Epoch 1/2
390/390 - 60s - loss: 0.1147 - accuracy: 0.9651 - val_loss: 0.0552 - val_accuracy: 0.9862 - 60s/epoch - 153ms/step
Epoch 2/2
390/390 - 57s - loss: 0.0910 - accuracy: 0.9564 - val_loss: 0.2340 - val_accuracy: 0.8756 - 57s/epoch - 146ms/step


<keras.callbacks.History at 0x7f6166c7ef50>

In [9]:
# Per-class scores for every held-out message (one row per message).
predictions_test = model.predict(X_test)

# One-hot version of the predictions: each row is the identity-matrix row of
# its highest-scoring class, i.e. 1 in that column and 0 elsewhere.
predictions_test1 = np.eye(
    predictions_test.shape[1],
    dtype=predictions_test.dtype,
)[predictions_test.argmax(axis=1)]

In [10]:
from sklearn.metrics import classification_report
from sklearn import metrics

# Collapse the one-hot rows to integer class ids (0 = ham, 1 = spam, per
# `classes_list`).
# The ids are stored under NEW names. Rebinding `predictions_test1` and
# `y_test` to their own argmax made this cell fail on a second run (argmax
# over axis 1 of an already 1-D array) and left `y_test` in a shape that no
# longer matches what `model.fit` was given.
y_pred_ids = np.argmax(predictions_test1, axis=1)
y_true_ids = np.argmax(y_test, axis=1)

# First argument is the ground truth, second the model's prediction.
print(metrics.confusion_matrix(y_true_ids, y_pred_ids))

print(classification_report(y_true_ids, y_pred_ids))

[[1266  182]
 [  26  198]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1448
           1       0.52      0.88      0.66       224

    accuracy                           0.88      1672
   macro avg       0.75      0.88      0.79      1672
weighted avg       0.92      0.88      0.89      1672

