In [None]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from tensorflow.python.keras.layers import Layer, InputSpec
from keras import initializers
%matplotlib inline

In [1]:
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


**TextAttnBiRNN**

In [2]:
!nvidia-smi

Sat Nov 27 16:25:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
cd /content/gdrive/MyDrive/College/Semester5/NLP/project

/content/gdrive/MyDrive/College/Semester5/NLP/project


In [4]:
# !pip install --upgrade tensorflow

In [11]:
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2
            hidden = LSTM(64, return_sequences=True)(words)
            sentence = Attention()(hidden)
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))  # e = K.dot(x, self.W)
        if self.bias:
            e += self.b
        e = K.tanh(e)

        a = K.exp(e)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)

        c = K.sum(a * x, axis=1)
        return c

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim


In [5]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, Bidirectional
from keras.layers import CuDNNLSTM
import tensorflow.keras.layers

class TextAttBiRNN(Model):
    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super(TextAttBiRNN, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.bi_rnn = Bidirectional(CuDNNLSTM(128, return_sequences=True))  # LSTM or GRU
        # self.conv_layer = layers.Convolution1D(100, 3, activation="relu")
        # self.pooling_layer = layers.GlobalMaxPool1D()

        self.attention = Attention(self.maxlen)
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of TextAttBiRNN must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of TextAttBiRNN must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        embedding = self.embedding(inputs)
        # x = self.conv_layer(embedding)
        # x = self.pooling_layer(x)
        x = self.bi_rnn(embedding)
        x = self.attention(x)
        output = self.classifier(x)
        return output


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
import pandas as pd
import numpy
from keras.preprocessing import text, sequence
from sklearn import metrics
max_features = 5000
maxlen = 70
batch_size = 32
embedding_dims = 50


print('Loading data...')
train = pd.read_csv("/content/gdrive/MyDrive/College/Semester5/NLP/project/han_train.csv")
test = pd.read_csv("/content/gdrive/MyDrive/College/Semester5/NLP/project/han_test.csv")
print(train.columns)
print(test.columns)
x_train, y_train, x_test, y_test = train['message'], train['class'], test['message'], test['class']
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
# x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

embeddings_index = {}
for i, line in enumerate(open('/content/gdrive/MyDrive/Amazon/productner/data/glove.6B.100d.txt')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

token = text.Tokenizer()
token.fit_on_texts(train['message'])
word_index = token.word_index

x_train = sequence.pad_sequences(token.texts_to_sequences(x_train), maxlen=maxlen)
x_test = sequence.pad_sequences(token.texts_to_sequences(x_test), maxlen=maxlen)

embedding_matrix = numpy.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Index(['id', 'title', 'author', 'message', 'class'], dtype='object')
Index(['id', 'title', 'author', 'message', 'class', 'predicted'], dtype='object')
16640 train sequences
4160 test sequences
Pad sequences (samples x time)...
x_train shape: (16640, 70)
x_test shape: (4160, 70)


In [None]:
epochs = 4
print('Build model...')
model = TextAttBiRNN(maxlen, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)

Build model...
Train...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test...


In [None]:
result = (result.reshape(1,-1)[0]>0.5).astype(int)

In [None]:
print("ACC", metrics.accuracy_score(result, y_test))
print("Prec", metrics.precision_score(result, y_test))
print("REC", metrics.recall_score(result, y_test))
print("F1", metrics.f1_score(result, y_test))

ACC 0.9247596153846154
Prec 0.9297501178689298
REC 0.9232209737827716
F1 0.9264740427531125


**HCAN**

In [6]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, TimeDistributed
# from keras.layers import CuDNNGRU
from keras.layers import GRU

class HCAN(Model):
    def __init__(self,
                 maxlen_sentence,
                 maxlen_word,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super(HCAN, self).__init__()
        self.maxlen_sentence = maxlen_sentence
        self.maxlen_word = maxlen_word
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        # Word part
        input_word = Input(shape=(self.maxlen_word,))
        x_word = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen_word)(input_word)
        x_word = layers.Convolution1D(100, 10, activation="relu", padding = 'same')(x_word)
        # x_word = layers.GlobalMaxPool1D()(x_word)

        x_word = Bidirectional(GRU(128, return_sequences=True))(x_word)  # LSTM or GRU
        x_word = Attention(self.maxlen_word)(x_word)
        model_word = Model(input_word, x_word)
        # Sentence part
        self.word_encoder_att = TimeDistributed(model_word)
        self.sentence_encoder = Bidirectional(GRU(128, return_sequences=True))  # LSTM or GRU
        self.sentence_att = Attention(self.maxlen_sentence)
        # Output part
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 3:
            raise ValueError('The rank of inputs of HAN must be 3, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen_sentence:
            raise ValueError('The maxlen_sentence of inputs of HAN must be %d, but now is %d' % (self.maxlen_sentence, inputs.get_shape()[1]))
        if inputs.get_shape()[2] != self.maxlen_word:
            raise ValueError('The maxlen_word of inputs of HAN must be %d, but now is %d' % (self.maxlen_word, inputs.get_shape()[2]))
        x_sentence = self.word_encoder_att(inputs)
        x_sentence = self.sentence_encoder(x_sentence)
        x_sentence = self.sentence_att(x_sentence)
        output = self.classifier(x_sentence)
        return output


In [8]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
import pandas as pd
import numpy
from keras.preprocessing import text, sequence
from sklearn import metrics

max_features = 5000
maxlen_sentence = 16
maxlen_word = 25
batch_size = 32
embedding_dims = 50


print('Loading data...')
train = pd.read_csv("/content/gdrive/MyDrive/College/Semester5/NLP/project/han_train.csv")
test = pd.read_csv("/content/gdrive/MyDrive/College/Semester5/NLP/project/han_test.csv")
print(train.columns)
print(test.columns)
x_train, y_train, x_test, y_test = train['message'], train['class'], test['message'], test['class']
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x #sentence x #word)...')
embeddings_index = {}
for i, line in enumerate(open('/content/gdrive/MyDrive/Amazon/productner/data/glove.6B.100d.txt')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

token = text.Tokenizer()
token.fit_on_texts(train['message'])
word_index = token.word_index

x_train = sequence.pad_sequences(token.texts_to_sequences(x_train),  maxlen=maxlen_sentence * maxlen_word)
x_test = sequence.pad_sequences(token.texts_to_sequences(x_test), maxlen=maxlen_sentence * maxlen_word)

embedding_matrix = numpy.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

x_train = x_train.reshape((len(x_train), maxlen_sentence, maxlen_word))
x_test = x_test.reshape((len(x_test), maxlen_sentence, maxlen_word))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Index(['id', 'title', 'author', 'message', 'class'], dtype='object')
Index(['id', 'title', 'author', 'message', 'class', 'predicted'], dtype='object')
16640 train sequences
4160 test sequences
Pad sequences (samples x #sentence x #word)...
x_train shape: (16640, 16, 25)
x_test shape: (4160, 16, 25)


In [9]:
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint

In [12]:
epochs = 5
print('Build model...')
model = HCAN(maxlen_sentence, maxlen_word, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)

Build model...
Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test...


In [13]:
result = (result.reshape(1,-1)[0]>0.5).astype(int)

In [14]:
print("ACC", metrics.accuracy_score(result, y_test))
print("Prec", metrics.precision_score(result, y_test))
print("REC", metrics.recall_score(result, y_test))
print("F1", metrics.f1_score(result, y_test))

ACC 0.9824519230769231
Prec 0.9863272041489863
REC 0.9794007490636704
F1 0.982851773549448


**Saving Trained Model**

In [18]:
model.save_weights("_hcan_")

**Load Saved Model weights**

In [19]:
model2 = HCAN(maxlen_sentence, maxlen_word, max_features, embedding_dims)
model2.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [20]:
model2.load_weights("_hcan_")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbebf93de50>

In [21]:
result = model2.predict(x_test)

In [23]:
result = (result.reshape(1,-1)[0]>0.5).astype(int)
print("ACC", metrics.accuracy_score(result, y_test))
print("Prec", metrics.precision_score(result, y_test))
print("REC", metrics.recall_score(result, y_test))
print("F1", metrics.f1_score(result, y_test))

ACC 0.9824519230769231
Prec 0.9863272041489863
REC 0.9794007490636704
F1 0.982851773549448
