### Load the dependencies

In [1]:
# Import the keras libraries
import numpy as np
from keras.models import Sequential
from keras.layers import Activation, Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding #To convert an integer to embedding
from keras.preprocessing import sequence #To convert a variable length sentence into a prespecified length
# fix random seed for reproducibility
np.random.seed(7)
from keras.utils.np_utils import to_categorical
import pandas as pd

Using TensorFlow backend.


In [2]:
# Import the sklearn libraries
import re
import sklearn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import svm,grid_search
from sklearn.metrics import confusion_matrix
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB



In [3]:
# Check the relevant path in your local machine
import os
os.getcwd()

'/nfsroot/data/home/s_jaysetty/sms'

In [4]:
# Read the data

sms_data = pd.read_csv('TRAIN_SMS.csv')
print(sms_data)

      Label                                            Message
0       ham                 oh how abt 2 days before Christmas
1      info  Welcome to OVATION HOLD R.No. 184, 114, 395, 3...
2      info  Thank you for using your ICICI bank CREDITcard...
3       ham  schedule a meeting with the entire team in the...
4       ham                                Tommy is my brother
5      spam  OTP is 817453 for the txn of INR 8262.00 at SP...
6       ham                   the meeting is scheduled by john
7      spam  Dear customer, We wish you a Merry Christmas. ...
8      spam  Delivered: Your package withPawzone Red 1.25 i...
9      info  The PNR for your Air India Flt 7I115 for PGH-B...
10     info  Bimal Auto Agency : Service of your car KA52C8...
11     info  Appointment with Dr Clayton in Pune on 2011-08...
12     info  Maha Veer Auto Agency : Service of your car KA...
13     spam  Dear AirAsia Customer, flight 5Q658 from RJA s...
14     info  Dear Guest, Thanks for choosing Forlini's 

In [8]:
# Simple definition to process the words further

def message_to_words(raw_message):
#     letters_only = re.sub("[^a-zA-Z]", " ", raw_converse) 
    # 1. Lower case & split  
    words = raw_message.lower().split()                             
    # 2. Convert stop words to a set
    stops = set(stopwords.words("english"))                  
    # 3. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    # 4. Join the words back into one string
    return(b" ".join(meaningful_words)) 

In [15]:
# Checking the length of the column
num_message = sms_data["Message"].size

In [18]:
print(num_message)
print(Message[:3], Label[:3])

30000
([u'oh abt 2 days christmas', u'welcome ovation hold r.no. 184, 114, 395, 378 ch.in 2014-10-21 3:53 ch.out 2014-11-01 12:00.', u'thank using icici bank creditcard ending 5253 rs. 2520.00 alike snapdeal 2013-05-31 21:35'], ['ham', 'info', 'info'])


In [13]:
# Initialize empty directories

Message = []
Label = [] # First target

for x in range(len(sms_data.Message)):
    Message.append(message_to_words(str(sms_data.Message[x]).decode('ascii','ignore')))
    Label.append(sms_data.Label[x])

### Load the data

In [19]:
#Create word frequency matrix for every Message, 
#Apply Tfidf vectorizer-extracting upto 1400 features


vectorizer = CountVectorizer(analyzer = "word",\
                             tokenizer = None,\
                             preprocessor = None,
                             ngram_range=(1,1), # We are only interested in uni grams
                             max_features=1400, # Limits to 1400 features
                             stop_words = None) 

data_features = vectorizer.fit_transform(Message)
type(data_features)
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'01',
 u'02',
 u'03',
 u'04',
 u'05',
 u'06',
 u'07',
 u'08',
 u'09',
 u'10',
 u'100',
 u'1000',
 u'10am',
 u'10hrs',
 u'10th',
 u'11',
 u'11am',
 u'11hrs',
 u'12',
 u'121',
 u'12hrs',
 u'12th',
 u'13',
 u'13hrs',
 u'14',
 u'14hrs',
 u'14th',
 u'15',
 u'15hrs',
 u'15th',
 u'16',
 u'16hrs',
 u'17',
 u'17hrs',
 u'17th',
 u'18',
 u'1800',
 u'18hrs',
 u'18th',
 u'19',
 u'19hrs',
 u'19th',
 u'1st',
 u'20',
 u'200',
 u'2009',
 u'2010',
 u'2011',
 u'2012',
 u'2013',
 u'2014',
 u'2015',
 u'2016',
 u'2017',
 u'20hrs',
 u'20th',
 u'21',
 u'21hrs',
 u'22',
 u'22hrs',
 u'23',
 u'23hrs',
 u'24',
 u'24hrs',
 u'25',
 u'250',
 u'25hrs',
 u'26',
 u'26hrs',
 u'27',
 u'27hrs',
 u'28',
 u'28hrs',
 u'29',
 u'29hrs',
 u'2mrw',
 u'2nd',
 u'2pm',
 u'30',
 u'300',
 u'30am',
 u'30hrs',
 u'30pm',
 u'31',
 u'31hrs',
 u'31st',
 u'32',
 u'32hrs',
 u'33',
 u'33hrs',
 u'34',
 u'34hrs',
 u'35',
 u'35hrs',
 u'36',
 u'36hrs',
 u'37',
 u'37hrs',
 u'38',
 u'38hrs',
 u'39',
 u'39hrs',
 u'3g',
 u'3pm',
 u

In [21]:
print(len(Message), len(Label)) #30000 obs
print(np.unique(Label)) # 3 different categories

(30000, 30000)
['ham' 'info' 'spam']


In [22]:
# Convert the data_features to array

# data features for categories
data_features = data_features.toarray()
data_features = pd.DataFrame(data_features)
data_features["Label"] = sms_data["Label"]
data_features = data_features.sample(frac =1)

# #data features for sub_categories

# data_features["sub_categories"] = health_data["sub_categories"]
# data_features = data_features.sample(frac =1)

In [23]:
# Verify the shape and head of data_features
print(data_features[:3])
print(data_features.shape)

       0  1  2  3  4  5  6  7  8  9  ...    1391  1392  1393  1394  1395  \
1252   0  0  0  0  0  0  0  0  0  0  ...       0     0     0     0     0   
10444  0  0  0  0  0  0  0  0  0  0  ...       0     0     0     0     0   
8994   1  0  0  0  0  0  0  0  0  0  ...       0     0     0     0     0   

       1396  1397  1398  1399  Label  
1252      0     0     0     0   spam  
10444     0     0     0     0    ham  
8994      0     0     0     0   spam  

[3 rows x 1401 columns]
(30000, 1401)


In [24]:
print(type(sms_data))
print(type(data_features))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [25]:
# Split into 60% train, 20% val and 20% test

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.ix[perm[:train_end]]
    validate = df.ix[perm[train_end:validate_end]]
    test = df.ix[perm[validate_end:]]
    return train, validate, test

#First split
train, validate, test = train_validate_test_split(data_features) 
cols = [col for col in data_features.columns if col not in ["Label"]]

train.x = train[cols]
train.y = train["Label"]
# train.z = train["sub_categories"]

validate.x = validate[cols]
validate.y = validate["Label"]
# validate.z = validate["sub_categories"]

test.x = test[cols]
test.y = test["Label"]
# test.z = test["sub_categories"]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


In [27]:
# Verify the shape of train.x, train.y
print(train.x.shape, train.y.shape)
print(validate.x.shape, validate.y.shape)
print(test.x.shape, test.y.shape)
# print(health_data.converse.map(len).max())

print(train.x[:3])
print(train.y[:3])

((18000, 1400), (18000,))
((6000, 1400), (6000,))
((6000, 1400), (6000,))
       0     1     2     3     4     5     6     7     8     9     ...   1390  \
15559     0     0     0     0     0     0     0     0     0     0  ...      0   
11214     0     0     0     0     0     0     0     0     0     0  ...      0   
14976     0     0     0     0     0     0     0     0     0     0  ...      0   

       1391  1392  1393  1394  1395  1396  1397  1398  1399  
15559     0     0     0     0     0     0     0     0     0  
11214     0     0     0     0     0     0     0     0     0  
14976     0     0     0     0     0     0     0     0     0  

[3 rows x 1400 columns]
15559    ham
11214    ham
14976    ham
Name: Label, dtype: object


In [15]:
# Truncate and Pad input sequences

max_review_length = 500 # This should be actually health_data.converse.map(len).max()
train.x = sequence.pad_sequences(train.x, maxlen=max_review_length)
test.x = sequence.pad_sequences(test.x, maxlen=max_review_length)

ValueError: `sequences` must be a list of iterables. Found non-iterable: 0

### Let's inspect the first two sentences of train and their classes

In [28]:
# Simple Naive Bayes Model

Mnb = MultinomialNB()
Mnb.fit(train.x, train.y)
preds_NB = Mnb.predict(test.x)
confusion_matrix(test.y,preds_NB)

array([[1894,   96,   38],
       [   0, 2638,    0],
       [  32,    8, 1294]])

In [29]:
# Naive Bayes Accuracy and recall

accuracy = metrics.accuracy_score(test.y,preds_NB)
recall = metrics.recall_score(test.y,preds_NB, average = 'macro')
print(accuracy, recall)

(0.97099999999999997, 0.9679800139378042)


### Inspect the same sentences after padding

In [30]:
# Basic checks so that train.x, train.y,..,test.y are all in the same shape
print(np.unique(train.y))
# print(np.unique(train.z))

['ham' 'info' 'spam']


In [31]:
# Convert the train.y into simple categorical variables 0,1,2
train.y = pd.Categorical(train.y)
train.y.codes
train.y = to_categorical(np.asarray(train.y.codes))

# Similarly convert the test.y into simple categorical variables 0,1,2
test.y = pd.Categorical(test.y)
test.y.codes
test.y = to_categorical(np.asarray(test.y.codes))

# Similarly convert the val.y into simple categorical variables 0,1,2
validate.y = pd.Categorical(validate.y)
validate.y.codes
validate.y = to_categorical(np.asarray(validate.y.codes))

print(train.y.shape)

(18000, 3)


In [32]:
print(train.y.shape)
# print(train.z[:3])

(18000, 3)


In [34]:
# Convert the train.y into simple categorical variables from 0,1,2,..,6
train.z = pd.Categorical(train.z)
train.z.codes
train.z = to_categorical(np.asarray(train.z.codes))

# Similarly convert the test.y into simple categorical variables from 0,1,2,..,6
test.z = pd.Categorical(test.z)
test.z.codes
test.z = to_categorical(np.asarray(test.z.codes))

# Similarly convert the val.y into simple categorical variables from 0,1,2,..,6
validate.z = pd.Categorical(validate.z)
validate.z.codes
validate.z = to_categorical(np.asarray(validate.z.codes))

In [34]:
# Verify the shape of train.x and train.y before feeding into the model
print(train.x.shape, train.y.shape)
print(type(train.x), type(train.y))

print(validate.x.shape, validate.y.shape)

((18000, 1400), (18000, 3))
(<class 'pandas.core.frame.DataFrame'>, <type 'numpy.ndarray'>)
((6000, 1400), (6000, 3))


In [35]:
# Creating a custome "Recall" error function in Keras backend

import keras.backend as K

def recall(y_true, y_pred):
    TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    PP = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = TP / (PP + K.epsilon())
    return recall

In [36]:
# Simple MLP, SGD and Dropout [6 categories]

from keras.optimizers import SGD

model = Sequential()
model.add(Dense(128, input_dim=1400, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])

# sgd = SGD(lr=0.04, decay=1e-6, momentum=0.6, nesterov=True)
# model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy']) 0.82% test accuracy
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               179328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total params: 179,715
Trainable params: 179,715
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
# Fit the model for y = categories
model.fit(np.array(train.x), train.y, nb_epoch=15, batch_size=32, verbose=2, validation_data=(np.array(validate.x), validate.y))

scores_mlp = model.evaluate(np.array(test.x), test.y, verbose=0)
print("Accurcay_mlp_cat: %.2f%%" % (scores_mlp[1]*100))
print("Recall_mlp_cat: %.2f%%" % (scores_mlp[2]*100))



Train on 18000 samples, validate on 6000 samples
Epoch 1/15
3s - loss: 0.1138 - acc: 0.9681 - recall: 0.9367 - val_loss: 0.0177 - val_acc: 0.9952 - val_recall: 0.9948
Epoch 2/15
2s - loss: 0.0131 - acc: 0.9968 - recall: 0.9968 - val_loss: 0.0134 - val_acc: 0.9960 - val_recall: 0.9960
Epoch 3/15
1s - loss: 0.0071 - acc: 0.9982 - recall: 0.9982 - val_loss: 0.0138 - val_acc: 0.9958 - val_recall: 0.9957
Epoch 4/15
1s - loss: 0.0047 - acc: 0.9988 - recall: 0.9988 - val_loss: 0.0138 - val_acc: 0.9962 - val_recall: 0.9962
Epoch 5/15
1s - loss: 0.0033 - acc: 0.9990 - recall: 0.9990 - val_loss: 0.0151 - val_acc: 0.9957 - val_recall: 0.9957
Epoch 6/15
1s - loss: 0.0025 - acc: 0.9994 - recall: 0.9994 - val_loss: 0.0159 - val_acc: 0.9957 - val_recall: 0.9957
Epoch 7/15
1s - loss: 0.0022 - acc: 0.9996 - recall: 0.9996 - val_loss: 0.0173 - val_acc: 0.9957 - val_recall: 0.9957
Epoch 8/15
1s - loss: 0.0016 - acc: 0.9996 - recall: 0.9996 - val_loss: 0.0171 - val_acc: 0.9960 - val_recall: 0.9960
Epoch 9

In [37]:
# Simple MLP, SGD and Dropout [21 sub_categories]

model = Sequential()
model.add(Dense(128, input_dim=5000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(21, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])

# sgd = SGD(lr=0.04, decay=1e-6, momentum=0.6, nesterov=True)
# model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy', recall])
print(model.summary())

# Fit the model for y = categories
model.fit(np.array(train.x), train.z, nb_epoch=20, batch_size=32, verbose=2, validation_data=(np.array(val.x), val.z))

scores_mlp = model.evaluate(np.array(test.x), test.z, verbose=0)
print("Accurcay_mlp_sub_cat: %.2f%%" % (scores_mlp[1]*100))
print("Recall_mlp_sub_cat: %.2f%%" % (scores_mlp[2]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 128)               640128    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 21)                2709      
Total params: 642,837
Trainable params: 642,837
Non-trainable params: 0
_________________________________________________________________
None
Train on 36658 samples, validate on 9165 samples
Epoch 1/20
5s - loss: 1.1832 - acc: 0.6602 - recall: 0.4836 - val_loss: 0.8797 - val_acc: 0.7200 - val_recall: 0.6123
Epoch 2/20
5s - loss: 0.8057 - acc: 0.7411 - recall: 0.6425 - val_loss: 0.8467 - val_acc: 0.7261 - val_recall: 0.6468
Epoch 3/20
5s - loss: 0.6784 - acc: 0.7773 - recall: 0.6972 - val_loss: 0.8522 - val_acc: 0.7276 - val_recall:

In [41]:
# LSTM Model [3 categories]

top_words = 500
embedding_vecor_length = 32
max_converse_length = 500
# max_converse_length = health_data.converse.map(len).max() # This gives the max length of the converse, used 500 here
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=1400))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])
print(model.summary())

model.fit(np.array(train.x), train.y, nb_epoch=10, batch_size=64, validation_data=(np.array(validate.x), validate.y))
# Final evaluation of the model
scores = model.evaluate(np.array(test.x), test.y, verbose=0)
print("Accuracy_lstm_cat: %.2f%%" % (scores[1]*100))
print("Recall_lstm_cat: %.2f%%" % (scores[2]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1400, 32)          16000     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 303       
Total params: 69,503
Trainable params: 69,503
Non-trainable params: 0
_________________________________________________________________
None
Train on 18000 samples, validate on 6000 samples
Epoch 1/10
 1152/18000 [>.............................] - ETA: 457s - loss: 1.0866 - acc: 0.4106 - recall: 0.6814

KeyboardInterrupt: 

In [50]:
# # LSTM Model [21 categories]

# top_words = 500
# embedding_vecor_length = 32
# max_converse_length = 500
# # max_converse_length = health_data.converse.map(len).max() # This gives the max length of the converse, used 500 here
# model = Sequential()
# model.add(Embedding(top_words, embedding_vecor_length, input_length=500))
# model.add(LSTM(100))
# model.add(Dense(21, activation='sigmoid'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])
# print(model.summary())

# model.fit(np.array(train.x), train.z, nb_epoch=1, batch_size=64, validation_data=(np.array(val.x), val.z))
# # Final evaluation of the model
# scores = model.evaluate(np.array(test.x), test.z, verbose=0)
# print("Accuracy_lstm_sub_cat: %.2f%%" % (scores[1]*100))
# print("Recall_lstm_sub_cat: %.2f%%" % (scores[2]*100))

In [42]:
# CNN 1D 

# set parameters:
max_features = 500
maxlen = 1400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10 #Models computational time is a lot

In [43]:
# CNN 1D [3 categories]

from keras.layers import Conv1D, GlobalMaxPooling1D

print('Build Convolution 1D model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.3))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 border_mode='valid',
                 activation='relu',
                 subsample_length=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(3))
model.add(Activation('sigmoid'))

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])

Build Convolution 1D model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1400, 50)          25000     
_________________________________________________________________
dropout_2 (Dropout)          (None, 1400, 50)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1398, 250)         37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_3 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)      



In [44]:
model.fit(np.array(train.x), train.y, batch_size=batch_size, nb_epoch=epochs, validation_data=(np.array(validate.x), validate.y) )

# Final evaluation of the model
scores = model.evaluate(np.array(test.x), test.y, verbose=0)
print("Accuracy_cnn1d_cat: %.2f%%" % (scores[1]*100))
print("Recall_cnn1d_cat: %.2f%%" % (scores[2]*100))

Train on 18000 samples, validate on 6000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy_cnn1d_cat: 34.48%
Recall_cnn1d_cat: 33.75%


In [45]:
# # CNN 1D [21 categories]

# from keras.layers import Conv1D, GlobalMaxPooling1D

# print('Build Convolution 1D model...')
# model = Sequential()

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims,
#                     input_length=maxlen))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  border_mode='valid',
#                  activation='relu',
#                  subsample_length=1))
# # we use max pooling:
# model.add(GlobalMaxPooling1D())

# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.2))
# model.add(Activation('relu'))

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(21))
# model.add(Activation('sigmoid'))

# model.summary()

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', recall])

In [46]:
# model.fit(np.array(train.x), train.z, batch_size=batch_size, nb_epoch=epochs, validation_data=(np.array(val.x), val.z) )

# # Final evaluation of the model
# scores = model.evaluate(np.array(test.x), test.z, verbose=0)
# print("Accuracy_cnn1d_sub_cat: %.2f%%" % (scores[1]*100))
# print("Recall_cnn1d_sub_cat: %.2f%%" % (scores[2]*100))

In [47]:
# import matplotlib.pyplot as plt


# model_mlp = model.fit(np.array(train.x), train.z, nb_epoch=20, batch_size=32, verbose=2, validation_data=(np.array(val.x), val.z))


In [48]:
# history = model_mlp
# # list all data in history
# print(history.history.keys())
# # summarize history for accuracy
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [49]:
# # summarize history for loss
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()