<a href="https://colab.research.google.com/github/harenlin/Sentiment-Analysis-With-Tensorflow/blob/main/Sentiment_Analysis_with_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with CNN

Import packages

In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Reshape, Embedding, Activation
from keras.layers import Dense, Dropout, Conv2D, Flatten, MaxPool2D, Input, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import drive
# Mount Google Drive at /content/drive (requires a Google Colab runtime).
# NOTE(review): none of the cells visible in this notebook read from the
# mounted path -- confirm the mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


Load the data

In [2]:
# Hyper-parameters shared by every model defined in this notebook.
vocab_size = 3000     # passed as num_words when loading the dataset
max_seq_len = 300     # length every review is padded to
embedding_dim = 100   # width of each embedding vector

# Load the IMDB dataset that ships with Keras.
train_split, test_split = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
x_train, y_train = train_split
x_test, y_test = test_split

# Each sample is a list of integer word indices; print one to inspect it.
print(x_train[1])

# Before padding, all four arrays are one-dimensional: (25000,)
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

# Pad the sequences so every sample has max_seq_len entries.
x_train = pad_sequences(x_train, maxlen=max_seq_len)
x_test = pad_sequences(x_test, maxlen=max_seq_len)
for padded in (x_train, x_test):
    print(padded.shape)  # (25000, 300)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 2, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 2, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 2, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 2, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 2, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 2, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]
(25000,)
(25000,)
(25000,)
(25000,)
(25000, 300)
(25000, 300)


Model Definition

In [3]:
# Kernel heights (number of consecutive tokens each filter spans); one
# convolution branch is built per size.
filter_sizes = [3,4,5]

def convolutions(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build the multi-branch convolution sub-model.

    One Conv2D + MaxPool2D branch is created per entry of ``filter_sizes``;
    all branches read the same input and their outputs are concatenated.

    Args:
        vocabulary_size: Not used by this function; kept in the signature.
        embedding_dimension: Width of the embedded input; also the kernel
            width, so every filter spans whole embedding vectors.
        max_seq_len: Number of rows (tokens) in the input.
        filter_sizes: Iterable of kernel heights, one branch per value.

    Returns:
        A functional Keras ``Model`` mapping an input of shape
        ``(max_seq_len, embedding_dimension, 1)`` to the concatenated
        pooled features (64 filters per branch).
    """
    embedded_input = Input(shape=(max_seq_len, embedding_dimension, 1))

    def _branch(window):
        # Convolve over `window` consecutive rows, full embedding width.
        conv_out = Conv2D(
            filters=64,
            kernel_size=(window, embedding_dimension),
            strides=1,
            padding='valid',
            activation='relu',
        )(embedded_input)
        # The pool window covers max_seq_len - window + 1 rows.
        return MaxPool2D(
            pool_size=(max_seq_len - window + 1, 1),
            padding='valid',
        )(conv_out)

    pooled_branches = [_branch(window) for window in filter_sizes]
    return Model(inputs=embedded_input, outputs=concatenate(pooled_branches))

def cnn_nlp_model(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build and compile the CNN sentiment classifier as a ``Sequential`` stack.

    Pipeline: token ids -> Embedding -> Reshape (adds a trailing axis of
    size 1) -> parallel convolution branches (``convolutions``) -> Flatten
    -> Dense(10, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        vocabulary_size: ``input_dim`` of the embedding layer.
        embedding_dimension: Width of each embedding vector.
        max_seq_len: Length of the padded input sequences.
        filter_sizes: Kernel heights forwarded to ``convolutions``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The model summary is printed as a side effect.
    """
    model = Sequential([
        Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len),
        # (max_seq_len, embedding_dimension) -> (max_seq_len, embedding_dimension, 1)
        Reshape(target_shape=(max_seq_len, embedding_dimension, 1)),
        # Forward this function's own arguments. Passing the module-level
        # globals here (as before) made the convolution block ignore any
        # non-default vocabulary_size / embedding_dimension.
        convolutions(vocabulary_size, embedding_dimension, max_seq_len, filter_sizes),
        Flatten(),
        Dense(10, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

Model Training

In [4]:
# Build the compiled classifier, then train it for 5 epochs while holding
# out 20% of the training data for validation.
model = cnn_nlp_model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          300000    
_________________________________________________________________
reshape (Reshape)            (None, 300, 100, 1)       0         
_________________________________________________________________
model (Functional)           (None, 1, 1, 192)         76992     
_________________________________________________________________
flatten (Flatten)            (None, 192)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1930      
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1

Second way to define the model: build the `Sequential` model incrementally with `model.add`

In [5]:
# Kernel heights (number of consecutive tokens each filter spans); one
# convolution branch is built per size. Same values as in the first
# model-definition cell, repeated so this cell can be read on its own.
filter_sizes = [3,4,5]

def convolutions(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build the multi-branch convolution sub-model.

    For each kernel height in ``filter_sizes`` a Conv2D layer followed by a
    MaxPool2D layer is applied to the shared input; the pooled outputs of
    all branches are concatenated.

    Args:
        vocabulary_size: Not used by this function; kept in the signature.
        embedding_dimension: Width of the embedded input and of every kernel.
        max_seq_len: Number of rows (tokens) in the input.
        filter_sizes: Iterable of kernel heights, one branch per value.

    Returns:
        A functional Keras ``Model`` taking inputs of shape
        ``(max_seq_len, embedding_dimension, 1)``.
    """
    net_input = Input(shape=(max_seq_len, embedding_dimension, 1))
    branch_outputs = []
    for kernel_height in filter_sizes:
        conv_layer = Conv2D(
            filters=64,
            kernel_size=(kernel_height, embedding_dimension),
            strides=1,
            padding='valid',
            activation='relu',
        )
        # Pool window spans max_seq_len - kernel_height + 1 rows.
        pool_layer = MaxPool2D(
            pool_size=(max_seq_len - kernel_height + 1, 1),
            padding='valid',
        )
        branch_outputs.append(pool_layer(conv_layer(net_input)))
    merged = concatenate(branch_outputs)
    return Model(inputs=net_input, outputs=merged)

def cnn_nlp_model(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build and compile the CNN sentiment classifier using ``Sequential.add``.

    Layers, in order: Embedding -> Reshape (adds a trailing axis of size 1)
    -> parallel convolution branches (``convolutions``) -> Flatten ->
    Dense(10, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        vocabulary_size: ``input_dim`` of the embedding layer.
        embedding_dimension: Width of each embedding vector.
        max_seq_len: Length of the padded input sequences.
        filter_sizes: Kernel heights forwarded to ``convolutions``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The model summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len))
    # (max_seq_len, embedding_dimension) -> (max_seq_len, embedding_dimension, 1)
    model.add(Reshape(target_shape=(max_seq_len, embedding_dimension, 1)))
    # Forward this function's own arguments. Passing the module-level globals
    # here (as before) made the convolution block ignore any non-default
    # vocabulary_size / embedding_dimension.
    model.add(convolutions(vocabulary_size, embedding_dimension, max_seq_len, filter_sizes))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Build the compiled classifier, then train it for 5 epochs while holding
# out 20% of the training data for validation.
model = cnn_nlp_model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          300000    
_________________________________________________________________
reshape_1 (Reshape)          (None, 300, 100, 1)       0         
_________________________________________________________________
model_1 (Functional)         (None, 1, 1, 192)         76992     
_________________________________________________________________
flatten_1 (Flatten)          (None, 192)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1930      
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

Third way to define the model: subclass `tf.keras.Model`

In [6]:
class Model(tf.keras.Model):
    """CNN sentiment classifier written as a subclassed Keras model.

    Three parallel Conv2D + MaxPool2D branches (one per entry of
    ``filter_sizes``) read the embedded sequence; their pooled outputs are
    concatenated and passed through Dense(10, relu) -> Dropout(0.2) ->
    Dense(1, sigmoid).

    NOTE(review): this class rebinds the name ``Model`` that the import cell
    took from ``keras.models``. Re-running an earlier ``convolutions`` cell
    after this one would call this class instead -- consider renaming.

    Args:
        vocabulary_size: ``input_dim`` of the embedding layer.
        embedding_dimension: Width of each embedding vector and of every
            convolution kernel.
        max_seq_len: Length of the padded input sequences.
        filter_sizes: Sequence whose first three entries are the kernel
            heights of the three branches.
    """

    def __init__(self, vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
        super().__init__()
        self.embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len)
        # (max_seq_len, embedding_dimension) -> (max_seq_len, embedding_dimension, 1)
        self.reshape = Reshape(target_shape=(max_seq_len, embedding_dimension, 1))
        # One conv/pool pair per kernel height; each pool window spans
        # max_seq_len - size + 1 rows.
        self.cnn1 = Conv2D(filters=64, kernel_size=(filter_sizes[0], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool1 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[0]+1, 1), padding='valid')
        self.cnn2 = Conv2D(filters=64, kernel_size=(filter_sizes[1], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool2 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[1]+1, 1), padding='valid')
        self.cnn3 = Conv2D(filters=64, kernel_size=(filter_sizes[2], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool3 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[2]+1, 1), padding='valid')
        self.flatten = Flatten()
        self.fc = Dense(10, activation='relu')
        self.dropout = Dropout(0.2)
        self.out_linear = Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of padded token-id sequences.
            training: Forwarded to the dropout layer so it is only active
                during training.

        Returns:
            Output of the final sigmoid unit, one value per sample.
        """
        x = self.embedding(inputs)
        x = self.reshape(x)
        # Each branch must use its OWN conv/pool pair. Previously cnn1/pool1
        # were applied three times, so cnn2/cnn3/pool2/pool3 were never used
        # and the concatenation held three copies of the same features.
        x1 = self.pool1(self.cnn1(x))
        x2 = self.pool2(self.cnn2(x))
        x3 = self.pool3(self.cnn3(x))
        x = concatenate([x1, x2, x3], axis=-1)
        x = self.flatten(x)
        x = self.fc(x)
        # Dropout handles the training flag itself.
        x = self.dropout(x, training=training)
        x = self.out_linear(x)
        return x

# Instantiate the subclassed model, compile it with the same loss/optimizer
# as the earlier variants, and train it for 5 epochs with a 20% validation
# hold-out.
model = Model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
