In [None]:
## Author : Karuna
# Different CNN models tried 

In [1]:
# import statements
import re

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [81]:
# Load the test split from a pipe-delimited file. Only the first two columns
# are read; 'label' is parsed as int and 'comment' as str. With
# keep_default_na=False, strings such as "NA" or "" stay as text instead of
# becoming NaN.
testdata = pd.read_csv(
    'clean_data_test_balanced.csv',
    sep='|',
    usecols=[0, 1],
    dtype={'label': int, 'comment': str},
    keep_default_na=False,
)

In [82]:
# Load the training split from a pipe-delimited file. Only the first two
# columns are read; 'label' is parsed as int and 'comment' as str. With
# keep_default_na=False, strings such as "NA" or "" stay as text instead of
# becoming NaN.
traindata = pd.read_csv(
    'clean_data_train_balanced.csv',
    sep='|',
    usecols=[0, 1],
    dtype={'label': int, 'comment': str},
    keep_default_na=False,
)

In [83]:
# Report the (rows, columns) shape of each loaded frame
for split_name, frame in (("Test", testdata), ("Train", traindata)):
    print("%s data is " % split_name, frame.shape)

Test data is  (243572, 2)
Train data is  (978039, 2)


In [84]:
# training_data, test_data = train_test_split(training_data, test_size=0.20)

In [85]:
# Separate the comment text (model input) from the label (target) in each split
X_train, y_train = traindata['comment'], traindata['label']
X_test, y_test = testdata['comment'], testdata['label']

In [86]:
# Sanity check: number of comments in the train split, then the test split
for split in (X_train, X_test):
    print(split.shape)


(978039,)
(243572,)


In [88]:
def getwordlist(text):
    """Normalise one comment before it is handed to the tokenizer.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space (leading/trailing whitespace is dropped), every character other than
    ``a-z``, digits and spaces is removed, and each run of digits is replaced
    by the placeholder ``n``.

    Despite the function name, the result is a single string, not a list of
    words. Removing a character that stood between two spaces leaves both
    spaces in place (``"a - b"`` becomes ``"a  b"``).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment.
    """
    # Uses the standard-library ``re`` module, which the notebook must import.
    normalised = " ".join(text.lower().split())
    # Remove special characters
    normalised = re.sub(r'[^a-z\d ]', '', normalised)
    # Replace every number with a single placeholder token
    return re.sub(r'\d+', 'n', normalised)

In [89]:
# Clean every training comment
comments = [getwordlist(raw_comment) for raw_comment in X_train]

In [90]:
# Clean every test comment
test_comments = [getwordlist(raw_comment) for raw_comment in X_test]

In [94]:
# Use the Keras tokenizer to turn each comment into a sequence of word indices.
# The vocabulary is fitted on the train and test comments together, so every
# word that occurs in either split receives an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comments + test_comments)

word_index = tokenizer.word_index
print('unique words %s' % len(word_index))

# Encode the training comments, then zero-pad them at the end to a fixed
# length of 2500 (longer sequences are cut by pad_sequences' default
# truncation setting).
sequences = tokenizer.texts_to_sequences(comments)
train_data = pad_sequences(sequences, maxlen=2500, padding='post')
print('Train data shape:', train_data.shape)

# Same encoding and padding for the test comments
test_sequences = tokenizer.texts_to_sequences(test_comments)
test_data = pad_sequences(test_sequences, maxlen=2500, padding='post')
print('Test data shape:', test_data.shape)

unique words 207219
Train data shape: (978039, 2500)
Test data shape: (243572, 2500)


In [61]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 numpy vector.
# Each line of the file is the word followed by its space-separated coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [95]:
# Prepare the weight matrix for the Embedding layer: row i holds the GloVe
# vector of the word whose tokenizer index is i. Rows for words missing from
# GloVe stay all-zero, as does row 0 (the Keras tokenizer never assigns index 0).
words_max = len(word_index)+1
embedding_matrix = np.zeros((words_max, 100))
# No bounds check is needed: tokenizer indices run from 1 to len(word_index),
# so every index is a valid row of the matrix.
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [18]:
## import statements
from keras.models import Sequential
from keras.layers import Dense ,Dropout, Input, Activation
from keras.layers import Flatten, GlobalMaxPooling1D, Conv1D, LSTM
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [77]:
# Create a frozen (non-trainable) embedding layer initialised from the GloVe
# matrix: 100-dimensional vectors, input sequences of length 2500.
nb_words = len(word_index)+1
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=2500,
    trainable=False,
)

In [20]:
# Model 1 (baseline): frozen embedding layer -> Flatten -> one Dense unit with
# ReLU activation. Loss is binary_crossentropy, optimizer is RMSprop.
# The model gave a test accuracy of 59.6%.
# NOTE(review): a ReLU output is not bounded to [0, 1], so it is not a
# probability as binary_crossentropy expects; kept as-is because this cell
# records the experiment that was actually run.
model_1 = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='relu'),
])
model_1.compile(
    optimizer='RMSprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model_1.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         18109000  
_________________________________________________________________
flatten_1 (Flatten)          (None, 200000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 200001    
Total params: 18,309,001
Trainable params: 200,001
Non-trainable params: 18,109,000
_________________________________________________________________
None


In [22]:
# Train model 1 for two passes over the training set
hist = model_1.fit(x=train_data, y=y_train, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


In [23]:
# Score model 1 on the test set and report the accuracy as a percentage
loss, accuracy = model_1.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 59.609014


In [24]:
# Model 2: same simple network (frozen embedding -> Flatten -> one Dense unit),
# but with a sigmoid output and the Adam optimizer.
# The model gave a test accuracy of 63.05%.
model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         18109000  
_________________________________________________________________
flatten_2 (Flatten)          (None, 200000)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 200001    
Total params: 18,309,001
Trainable params: 200,001
Non-trainable params: 18,109,000
_________________________________________________________________
None


In [25]:
# Train model 2 for two passes over the training set
hist_1 = model.fit(x=train_data, y=y_train, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


In [26]:
# Score model 2 on the test set and report the accuracy as a percentage
loss, accuracy = model.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 63.049569


In [20]:
# Third try: frozen embedding layer -> Flatten -> Dropout(0.5) ->
# Dense(16, ReLU) -> Dense(1, sigmoid).
# The dropout layer is there to reduce overfitting.
# The model gave a test accuracy of 65%.
# GlobalMaxPooling1D is deliberately not stacked here: it needs a 3D
# (batch, steps, features) input, and Flatten outputs 2D, so adding it after
# Flatten makes Keras raise a ValueError. This layer stack is the one shown in
# the recorded summary for this cell.
model_2 = Sequential()
model_2.add(embedding_layer)
model_2.add(Flatten())
model_2.add(Dropout(0.5))
model_2.add(Dense(16, activation='relu'))
model_2.add(Dense(1, activation='sigmoid'))

model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model_2.summary())

hist_2 = model_2.fit(train_data, y_train,epochs=5, batch_size=512)

# Score on the test set and report the accuracy as a percentage
loss, accuracy = model_2.evaluate(test_data, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         18109000  
_________________________________________________________________
flatten_1 (Flatten)          (None, 200000)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                3200016   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 21,309,033
Trainable params: 3,200,033
Non-trainable params: 18,109,000
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 65.221770


In [None]:
# Fourth try: frozen embedding -> Flatten -> Dropout(0.5) -> Dense(1, ReLU) ->
# Dense(1, sigmoid) -> Dense(1, softmax), trained with an MSE loss.
# The run was stopped because of low accuracy.
# NOTE(review): softmax over a single unit always outputs 1.0, so this model's
# prediction is constant whatever the input; kept as-is because the cell
# records an abandoned experiment.

model_3 = Sequential([
    embedding_layer,
    Flatten(),
    Dropout(0.5),
    Dense(1, activation='relu'),
    Dense(1, activation='sigmoid'),
    Dense(1, activation='softmax'),
])

model_3.compile(loss='mse', optimizer='adam', metrics=['acc'])

print(model_3.summary())

hist_3 = model_3.fit(x=train_data, y=y_train, batch_size=512, epochs=10)

# Score on the test set and report the accuracy as a percentage
loss, accuracy = model_3.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 100)         18109000  
_________________________________________________________________
flatten_1 (Flatten)          (None, 200000)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 200001    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 2         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 2         
Total params: 18,309,005
Trainable params: 200,005
Non-trainable params: 18,109,000
__________________________________________________________

In [64]:
# CNN model 1
# 1D convolutional layer with 250 filters and kernel size 3, followed by global
# max pooling and a 16-unit Dense layer with tanh activation.
# Two Dropout(0.2) layers are added to reduce overfitting; the final layer is a
# single sigmoid unit.
# Gave a test accuracy of 71.18%.
# NOTE(review): 'val_acc' is the log key older Keras releases use for
# metrics=['accuracy']; newer releases log 'val_accuracy' - confirm against the
# installed version. With patience=10 and epochs=12 the callback has very
# little room to stop training early.
cb = EarlyStopping(monitor='val_acc', min_delta=0, patience=10, mode='auto')

model_4 = Sequential([
    embedding_layer,
    Conv1D(filters=250, kernel_size=3, strides=1, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(16, activation='tanh'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model_4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
print(model_4.summary())

# 15% of the training rows are held out for validation (monitored by `cb`)
hist_4 = model_4.fit(
    x=train_data,
    y=y_train,
    batch_size=512,
    epochs=12,
    validation_split=0.15,
    callbacks=[cb],
)

# Score on the test set and report the accuracy as a percentage
loss, accuracy = model_4.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2500, 100)         20722000  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2498, 250)         75250     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 250)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                4016      
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 17        
Total para

In [76]:
# Re-score the trained CNN (model_4) on the test set
loss, accuracy = model_4.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

Accuracy: 71.183880


In [65]:
# Saving the model to disk: weights go to HDF5, the architecture to JSON
from keras.models import model_from_json

# serialize weights to HDF5
model_4.save_weights("model_4.h5")

# serialize the architecture to JSON
model_json = model_4.to_json()
with open("model_4.json", "w") as json_file:
    json_file.write(model_json)

print("Saved model to disk")

Saved model to disk


In [68]:
# Load the JSON architecture and rebuild the model. The context manager closes
# the file even if reading fails.
with open('model_4.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load the saved weights into the rebuilt model
loaded_model.load_weights("model_4.h5")
print("Loaded model from disk")

# The rebuilt model is compiled here before it is evaluated on the test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(test_data, y_test, verbose=1)
# score[1] is the first compiled metric (index 0 is the loss)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))


Loaded model from disk
acc: 71.18%


In [75]:
# Per-class evaluation of the reloaded CNN on the test set.
# y_predict must be computed here: no other cell in the notebook defines it.
# The sigmoid outputs are thresholded at 0.5 to obtain 0/1 class labels, and
# flattened from shape (n, 1) to (n,) so they line up with y_test.
# NOTE(review): presumably the original y_predict came from this model - the
# recorded accuracy_score equals its 71.18% test accuracy; confirm.
y_predict = (loaded_model.predict(test_data, batch_size=512) > 0.5).astype(int).ravel()

print(confusion_matrix(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(accuracy_score(y_test,y_predict))

[[90728 29836]
 [40352 82656]]
              precision    recall  f1-score   support

           0       0.69      0.75      0.72    120564
           1       0.73      0.67      0.70    123008

   micro avg       0.71      0.71      0.71    243572
   macro avg       0.71      0.71      0.71    243572
weighted avg       0.71      0.71      0.71    243572

0.7118387992051631


In [79]:
# Model with re-trainable embeddings
# Same architecture as the CNN that reached 71.18% accuracy, but the embedding
# layer is now trainable (still initialised from the GloVe matrix).
# The model overfitted: 80.62% accuracy while training, but only 70.8% on the
# test data.
# NOTE(review): with patience=10 and only epochs=5, the EarlyStopping callback
# can never stop this run early.
nb_words = len(word_index)+1
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=2500,
    trainable=True,
)

cb = EarlyStopping(monitor='val_acc', min_delta=0, patience=10, mode='auto')

model = Sequential([
    embedding_layer,
    Conv1D(filters=250, kernel_size=3, strides=1, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(16, activation='tanh'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
print(model.summary())

# 15% of the training rows are held out for validation (monitored by `cb`)
hist = model.fit(
    x=train_data,
    y=y_train,
    batch_size=512,
    epochs=5,
    validation_split=0.15,
    callbacks=[cb],
)

# Score on the test set and report the accuracy as a percentage
loss, accuracy = model.evaluate(x=test_data, y=y_test, verbose=1)
accuracy_pct = accuracy*100
print('Accuracy: %f' % accuracy_pct)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2500, 100)         20722000  
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 2498, 250)         75250     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 250)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                4016      
_________________________________________________________________
dropout_12 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 17        
Total para