###### Step-1:Download the dataset using requests library

In [42]:
import pathlib

In [43]:
datasetfile = pathlib.Path("smsspamcollection")

In [44]:
if datasetfile.is_file():
    print('dataset already exists')
else:
    import requests
    import zipfile
    import io  
    r = requests.get("http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",stream=True)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall() #download's and extracts to current directory
    

dataset already exists


######  Useful URL's <br/>
https://github.com/dbsheta/spam-detection-using-deep-learning/blob/master/spam_detection.ipynb <br/>
http://mindmech.net/?p=61 <br/>
http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection <br/>
https://www.kaggle.com/jacklinggu/tfidf-to-keras-dense-neural-network

###### Step-2:  Create dataframe with tab seperated data of the file

In [45]:
import pandas

In [46]:
df = pandas.read_csv('smsspamcollection', sep='\t',header=None)

In [47]:
df.columns = ['label', 'msg'] # set column names

###### print the top 5 lines

In [48]:
df.head(5)

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
msgs = df['msg'].values

In [50]:
labels = [1 if x == "spam" else 0 for x in df['label'].values]

In [51]:
msgs[1:5]

array(['Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       'U dun say so early hor... U c already then say...',
       "Nah I don't think he goes to usf, he lives around here though"], dtype=object)

In [52]:
labels[1:5]

[0, 1, 0, 0]

###### Step-3:  Preprocess data

In [53]:
import keras

In [54]:
import keras.preprocessing.text as kpt

In [55]:
tokenizer = kpt.Tokenizer()
tokenizer.fit_on_texts(msgs)
sequences = tokenizer.texts_to_sequences(msgs)
word2index = tokenizer.word_index
num_words = len(word2index)
print("Found  unique tokens " + num_words.__str__());

Found  unique tokens 9009


In [56]:
MAX_WORDS_IN_SEQ=9009#1000

In [57]:
import keras.preprocessing.sequence as kps

In [58]:
import keras.utils

In [59]:
import sklearn
import sklearn.model_selection

In [60]:
data = kps.pad_sequences(sequences, maxlen=MAX_WORDS_IN_SEQ, padding='post', truncating='post')
print(labels[:10])
labels = keras.utils.to_categorical(labels)
print(labels[:10])

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(data, labels, test_size=0.2)


[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]
[[ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]]
Shape of data tensor: (5572, 9009)
Shape of label tensor: (5572, 2)


###### Step-4:  Build Model

In [64]:
from keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten

In [67]:
from keras.models import Model, load_model

In [68]:
EMBED_DIM = 100

In [69]:
input_seq = Input(shape=[MAX_WORDS_IN_SEQ, ], dtype='int32')
embed_seq = Embedding(num_words, EMBED_DIM, embeddings_initializer='glorot_uniform', input_length=MAX_WORDS_IN_SEQ)(
    input_seq)
conv_1 = Conv1D(128, 5, activation='relu')(embed_seq)
conv_1 = MaxPooling1D(pool_size=5)(conv_1)
conv_2 = Conv1D(128, 5, activation='relu')(conv_1)
conv_2 = MaxPooling1D(pool_size=5)(conv_2)
conv_3 = Conv1D(128, 5, activation='relu')(conv_2)
conv_3 = MaxPooling1D(pool_size=35)(conv_3)
flat = Flatten()(conv_3)
# flat = Dropout(0.25)(flat)
fc1 = Dense(128, activation='relu')(flat)
# dense_1 = Dropout(0.25)(flat)
fc2 = Dense(2, activation='softmax')(fc1)

model = Model(input_seq, fc2)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [70]:
# Testing ---------------------------------------
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 9009)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 9009, 100)         900900    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 9005, 128)         64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 1801, 128)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1797, 128)         82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 359, 128)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 355, 128)          82048     
__________

In [None]:
model.fit(x_train, y_train, epochs=100, batch_size=1, verbose=1);

Epoch 1/100
 864/4457 [====>.........................] - ETA: 389s - loss: 0.3500 - acc: 0.8877