In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import re
import sys
import os
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns
import string
import scipy.sparse as sparse
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import confusion_matrix , log_loss , accuracy_score , classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS 
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import naive_bayes
from sklearn.metrics import recall_score
from tensorflow import keras
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import keras
plt.switch_backend('agg')
%matplotlib inline

# 1. Data Loading

In [None]:
# Load the raw reviews from 'Reviews.csv' (path is relative to the working directory).
data = pd.read_csv('Reviews.csv')

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

# 2. Data Cleaning

In [None]:
# Drop every row that contains at least one missing value.
# The explicit .copy() makes data_nonulls an independent frame, so the column
# assignments in the following cells write to it directly and cannot trigger
# pandas' SettingWithCopyWarning about modifying a slice of `data`.
data_nonulls = data.dropna().copy()

In [None]:
# (rows, columns) remaining after rows with missing values were dropped.
data_nonulls.shape

In [None]:
# Interpret the 'Time' values as a number of seconds (unit='s') and store them back as datetimes.
data_nonulls['Time'] = pd.to_datetime(data_nonulls['Time'],unit='s')

# Converting the Score value into two categories: Positive (1) and Negative (0)

In [None]:
# Binarise the rating: scores above 3 become 1, everything else becomes 0.
# The comparison runs on the raw frame and is aligned back to data_nonulls by row index.
data_nonulls['Score'] = data["Score"].gt(3).astype('int64')

Stripping non-alphabetic characters from the input text

In [None]:
def cleaning_non_alphabet(line):
    """Lower-case ``line`` and keep only the letters a-z.

    Every run of characters outside a-z (digits, punctuation, whitespace, ...)
    is collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        line: the text to clean (must support ``.lower()``).

    Returns:
        The cleaned, lower-case string.
    """
    lowered = line.lower()
    return re.sub(r'[^a-z]+', ' ', lowered).strip()
# Run the cleaner over every review body and keep the result in a new column.
data_nonulls['cleaned_Text'] = data_nonulls['Text'].apply(cleaning_non_alphabet)

In [None]:
def _stop_word_tools():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    tools = getattr(_stop_word_tools, "_cached", None)
    if tools is None:
        # Loading the stop-word corpus and constructing the stemmer once avoids
        # repeating that work for every single row the function is applied to.
        tools = (frozenset(stopwords.words('english')), PorterStemmer())
        _stop_word_tools._cached = tools
    return tools


def remove_stop_words(line):
    """Tokenize ``line``, drop English stop words and Porter-stem what is left.

    Args:
        line: the text to process.

    Returns:
        The surviving, stemmed tokens joined by single spaces.
    """
    stopwords_list, stemmer = _stop_word_tools()
    # Stop words are filtered on the original tokens, before stemming.
    kept = (word for word in word_tokenize(line) if word not in stopwords_list)
    return ' '.join(stemmer.stem(word) for word in kept)

# Second cleaning pass: overwrite 'cleaned_Text' with its stop-word-free, stemmed form.
data_nonulls['cleaned_Text'] = data_nonulls['cleaned_Text'].apply(remove_stop_words)

In [None]:
# Model inputs: the cleaned review strings, and their 0/1 scores as a plain Python list.
texts = data_nonulls['cleaned_Text']
labels = list(data_nonulls['Score'])

# 3. Preprocessing - Tokenization and sequence padding

In [None]:
# Vocabulary cap handed to the Keras Tokenizer as num_words.
MAX_NB_WORDS = 40000

# Fit the tokenizer on the cleaned reviews, then encode every review as a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Fixed length (in tokens) of every model input. It must exist before padding:
# it used to be assigned only in the later embedding cell, so this cell raised
# NameError on a fresh "Restart & Run All".
MAX_SEQUENCE_LENGTH = 150
# Fixed seed so the shuffle, and therefore the train/validation split, is repeatable.
SHUFFLE_SEED = 42

#pad_sequences is used to ensure that all sequences in a list have the same length.
#NOTE: this rebinds `data` from the raw reviews DataFrame to the padded integer matrix.
data = pad_sequences(
                    sequences,
                    maxlen=MAX_SEQUENCE_LENGTH
                    )

#to_categorical to convert array of labeled data(from 0 to nb_classes-1) to one-hot vector.
labels = to_categorical(np.asarray(labels))

#One shared random permutation keeps samples and labels aligned.
indices = np.random.RandomState(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [None]:
#Main Data split into Training and Validation

# Fraction of the samples held out for validation.
split_percentage = 0.05
validation_data = int(split_percentage * data.shape[0])

# Slice with an explicit boundary index. Negative slicing (data[:-n] / data[-n:])
# breaks when n == 0: data[:-0] is empty and data[-0:] is the whole array, which
# would swap the roles of the training and validation sets on a very small dataset.
train_size = data.shape[0] - validation_data

x_train = data[:train_size]
y_train = labels[:train_size]
x_val = data[train_size:]
y_val = labels[train_size:]

# 4. Embedding

In [None]:
#Employing Embedding to map each integer word id to a dense vector that is learned with the model.
EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 150

# The tokenizer was created with num_words=MAX_NB_WORDS, so the encoded sequences
# only contain ids below MAX_NB_WORDS, while word_index still lists every word seen.
# Sizing the table by len(word_index) + 1 therefore allocates rows that can never be
# looked up; cap it at MAX_NB_WORDS (the +1 accounts for the reserved id 0).
VOCABULARY_SIZE = min(MAX_NB_WORDS, len(word_index) + 1)

embedding_layer = Embedding(VOCABULARY_SIZE,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

# 5. CNN model (When One Word Is Considered)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#Input Layer: integer word ids -> embedding vectors
embedded_sequences = embedding_layer(sequence_input)

#Layer 1
l_cov1= Conv1D(512, 1,activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(1)(l_cov1)

#Layer 2
l_cov2 = Conv1D(256, 1, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(1)(l_cov2)

#Layer 3
l_cov3 = Conv1D(256, 1, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(1)(l_cov3)

#Layer 4
l_cov4 = Conv1D(128, 1, activation='relu')(l_pool3)
l_pool4 = MaxPooling1D(1)(l_cov4)

#Layer 5
l_cov5 = Conv1D(128, 1, activation='relu')(l_pool4)
l_pool5 = MaxPooling1D(1)(l_cov5)

#Global flattening and Dense layer.
#Flatten the output of the last block (l_pool5). Flattening l_pool4 left Layer 5
#disconnected from the output, so it was never part of the trained model.
l_flat = Flatten()(l_pool5)
l_dense = Dense(128, activation='relu')(l_flat)

#Output layer: two-way softmax matching the one-hot labels
preds = Dense(2,
              activation='softmax',
              kernel_initializer='he_normal')(l_dense)

model1 = Model(sequence_input, preds)

#Compiling the model.
#The metric is requested under the name 'acc' so that it is logged as 'acc'/'val_acc',
#the names the checkpoint below and the plotting cell rely on. Requesting 'accuracy'
#makes newer Keras releases log 'accuracy'/'val_accuracy' instead, in which case the
#checkpoint never finds its monitored value.
model1.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(0.001),
              metrics=['acc'])

#Displaying the structure of the model
model1.summary()

#Checkpoint to save the model whenever the monitored validation accuracy improves
checkpoint=ModelCheckpoint('cnn_model_1word.hdf5',
                   monitor='val_acc',
                   verbose=1,
                   save_best_only=True
                  )


In [None]:
#Fitting the model
# Trains model1 for 10 epochs with batches of 64 samples, passing (x_val, y_val) as
# validation data and the checkpoint callback defined with the model. The object
# returned by fit() is kept in `history` for the plotting cell.

history=model1.fit(
                 x_train, 
                 y_train, 
                 validation_data=(x_val, y_val),
                 epochs=10, 
                 batch_size=64,
                 callbacks=[checkpoint],
                 shuffle=True)

# 6. Plotting the Accuracy and Loss results

In [None]:
# The accuracy metric is logged as 'acc' by some Keras versions/configurations and as
# 'accuracy' by others; use whichever key this run produced instead of raising KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
accuracy = history.history[acc_key]
validation_accuracy = history.history['val_' + acc_key]
Train_loss = history.history['loss']
validation_loss = history.history['val_loss']
# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(accuracy) + 1)
plt.figure(figsize=(8,6))


##Plotting the Accuracy for the Model##

plt.title('Accuracy Plot')
plt.plot(epochs,accuracy, 'green', label='Training Accuracy')
plt.plot(epochs,validation_accuracy, 'brown', label='Test Accuracy')
legend = plt.legend(loc='best', shadow=True, fontsize='small')
legend.get_frame().set_facecolor('C0')
# No further plt.legend() call here: it would replace the legend styled above with a default one.
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.tight_layout()

def annot_max(x,y, ax=None):
    """Annotate the highest point of the curve ``y`` plotted against ``x``.

    The label shows the maximum of ``y`` multiplied by 100. It is drawn in a boxed
    callout at axes-fraction position (0.60, 0.60) with an arrow pointing at the peak.

    Args:
        x: x values, indexable by position.
        y: y values, same length as ``x``.
        ax: axes to annotate; the current pyplot axes are used when omitted.
    """
    peak_x = x[np.argmax(y)]
    peak_y = max(y)
    target = ax if ax else plt.gca()
    target.annotate(
        "Maximum Accuracy={:.5f}".format(peak_y*100),
        xy=(peak_x, peak_y),
        xytext=(0.60,0.60),
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->", "connectionstyle": "angle,angleA=0,angleB=90"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.50},
        ha="right",
        va="top",
    )

annot_max(epochs,validation_accuracy)
plt.grid(True)
# This figure belongs to the one-word model. Saving it as '2wordsAccuracy.png' let the
# two-word section further down overwrite it with its own plot of the same name.
plt.savefig('1wordAccuracy.png')
plt.show()

##Plotting the Loss for the Model##

plt.figure(figsize=(8,6))
plt.title('Loss Plot')
plt.plot(epochs,Train_loss, 'green', label='Training loss')
plt.plot(epochs,validation_loss, 'brown', label='Test loss')
legend = plt.legend(loc='best', shadow=True, fontsize='small')
legend.get_frame().set_facecolor('C0')
# No further plt.legend() call here: it would replace the legend styled above with a default one.
plt.xlabel('Epoch')

def annot_min(x,y, ax=None):
    """Annotate the lowest point of the curve ``y`` plotted against ``x``.

    The label shows the minimum of ``y``. It is drawn in a boxed callout at
    axes-fraction position (0.90, 0.90) with an arrow pointing at the minimum.

    Args:
        x: x values, indexable by position.
        y: y values, same length as ``x``.
        ax: axes to annotate; the current pyplot axes are used when omitted.
    """
    lowest_x = x[np.argmin(y)]
    lowest_y = min(y)
    target = ax if ax else plt.gca()
    target.annotate(
        "Minimum loss={:.5f}".format(lowest_y),
        xy=(lowest_x, lowest_y),
        xytext=(0.90,0.90),
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->", "connectionstyle": "angle,angleA=0,angleB=90"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.50},
        ha="right",
        va="top",
    )

# Mark the epoch with the lowest validation loss, then finish, save and show the loss figure.
annot_min(epochs,validation_loss)
plt.ylabel('Loss')
plt.tight_layout()
plt.grid(True)
# This figure belongs to the one-word model. Saving it as '2wordsLoss.png' let the
# two-word section further down overwrite it with its own plot of the same name.
plt.savefig('1wordLoss.png')
plt.show()

# 7. CNN model (When Two Words Are Considered)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#Input Layer.
#Build a fresh embedding with the same dimensions as embedding_layer. Reusing that
#layer object would start this model from whatever weights the one-word model has
#already trained into it, so the two experiments would not be independent.
embedding_layer_2 = Embedding(embedding_layer.input_dim,
                              embedding_layer.output_dim,
                              input_length=MAX_SEQUENCE_LENGTH,
                              trainable=True)
embedded_sequences = embedding_layer_2(sequence_input)

#Layer 1
l_cov1= Conv1D(512, 1,activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(pool_size=2, strides=1)(l_cov1)

#Layer 2
l_cov2 = Conv1D(256, 1, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(pool_size=2, strides=1)(l_cov2)

#Layer 3
l_cov3 = Conv1D(256, 1, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(pool_size=2, strides=1)(l_cov3)

#Layer 4
l_cov4 = Conv1D(128, 1, activation='relu')(l_pool3)
l_pool4 = MaxPooling1D(pool_size=2, strides=1)(l_cov4)

#Model flattening and dense layers
l_flat = Flatten()(l_pool4)
l_dense = Dense(128, activation='relu')(l_flat)

#Output layer: two-way softmax matching the one-hot labels
preds = Dense(2, activation='softmax',kernel_initializer='he_normal')(l_dense)

model = Model(sequence_input, preds)

#Compiling the model.
#The metric is requested under the name 'acc' so that it is logged as 'acc'/'val_acc',
#the names the checkpoint below and the plotting cell rely on. Requesting 'accuracy'
#makes newer Keras releases log 'accuracy'/'val_accuracy' instead, in which case the
#checkpoint never finds its monitored value.
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(0.001),
              metrics=['acc'])

#Displaying the structure of the model
model.summary()

#Checkpoint to save the model whenever the monitored validation accuracy improves.
#NOTE(review): the file name carries a doubled '.hdf5.hdf5' extension - confirm whether
#anything loads it under that name before renaming.
checkpoint2=ModelCheckpoint(
                    'model_cnn.hdf5.hdf5',
                    monitor='val_acc',
                    verbose=1,
                    save_best_only=True
                  )



In [None]:
#Fitting the model
# Trains `model` for 10 epochs with batches of 64 samples, passing (x_val, y_val) as
# validation data and the checkpoint2 callback defined with the model. The object
# returned by fit() is kept in `history1` for the plotting cell.

history1=model.fit(
                 x_train, 
                 y_train, 
                 validation_data=(x_val, y_val),
                 epochs=10, 
                 batch_size=64,
                 callbacks=[checkpoint2],
                 shuffle=True)

# 8. Plotting the Accuracy and Loss plot

In [None]:
# The accuracy metric is logged as 'acc' by some Keras versions/configurations and as
# 'accuracy' by others; use whichever key this run produced instead of raising KeyError.
acc_key = 'acc' if 'acc' in history1.history else 'accuracy'
accuracy = history1.history[acc_key]
validation_accuracy = history1.history['val_' + acc_key]
Train_loss = history1.history['loss']
validation_loss = history1.history['val_loss']
# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(accuracy) + 1)
plt.figure(figsize=(8,6))


##Plotting the Accuracy for the Model##

plt.title('Accuracy Plot')
plt.plot(epochs,accuracy, 'green', label='Training Accuracy')
plt.plot(epochs,validation_accuracy, 'brown', label='Test Accuracy')
legend = plt.legend(loc='best', shadow=True, fontsize='small')
legend.get_frame().set_facecolor('C0')
# No further plt.legend() call here: it would replace the legend styled above with a default one.
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.tight_layout()

# NOTE(review): annot_max is already defined with the same behavior in the plotting
# cell of the one-word model; this copy re-binds the name when the cell runs.
def annot_max(x,y, ax=None):
    """Annotate the highest point of the curve ``y`` plotted against ``x``.

    The label shows the maximum of ``y`` multiplied by 100. It is drawn in a boxed
    callout at axes-fraction position (0.60, 0.60) with an arrow pointing at the peak.

    Args:
        x: x values, indexable by position.
        y: y values, same length as ``x``.
        ax: axes to annotate; the current pyplot axes are used when omitted.
    """
    peak_x = x[np.argmax(y)]
    peak_y = max(y)
    target = ax if ax else plt.gca()
    target.annotate(
        "Maximum Accuracy={:.5f}".format(peak_y*100),
        xy=(peak_x, peak_y),
        xytext=(0.60,0.60),
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->", "connectionstyle": "angle,angleA=0,angleB=90"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.50},
        ha="right",
        va="top",
    )

# Mark the epoch with the highest validation accuracy, then save and show the figure.
annot_max(epochs,validation_accuracy)
plt.grid(True)
plt.savefig('2wordsAccuracy.png')
plt.show()

##Plotting the Loss for the Model##

plt.figure(figsize=(8,6))
plt.title('Loss Plot')
plt.plot(epochs,Train_loss, 'green', label='Training loss')
plt.plot(epochs,validation_loss, 'brown', label='Test loss')
legend = plt.legend(loc='best', shadow=True, fontsize='small')
legend.get_frame().set_facecolor('C0')
# No further plt.legend() call here: it would replace the legend styled above with a default one.
plt.xlabel('Epoch')

# NOTE(review): annot_min is already defined with the same behavior in the plotting
# cell of the one-word model; this copy re-binds the name when the cell runs.
def annot_min(x,y, ax=None):
    """Annotate the lowest point of the curve ``y`` plotted against ``x``.

    The label shows the minimum of ``y``. It is drawn in a boxed callout at
    axes-fraction position (0.90, 0.90) with an arrow pointing at the minimum.

    Args:
        x: x values, indexable by position.
        y: y values, same length as ``x``.
        ax: axes to annotate; the current pyplot axes are used when omitted.
    """
    lowest_x = x[np.argmin(y)]
    lowest_y = min(y)
    target = ax if ax else plt.gca()
    target.annotate(
        "Minimum loss={:.5f}".format(lowest_y),
        xy=(lowest_x, lowest_y),
        xytext=(0.90,0.90),
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->", "connectionstyle": "angle,angleA=0,angleB=90"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.50},
        ha="right",
        va="top",
    )

# Mark the epoch with the lowest validation loss, then finish, save and show the loss figure.
annot_min(epochs,validation_loss)
plt.ylabel('Loss')
plt.tight_layout()
plt.grid(True)
# The image is written relative to the working directory.
plt.savefig('2wordsLoss.png')
plt.show()