In [1]:
# Check keras and tensorflow versions
import keras
import tensorflow as tf

print("Keras version: ", keras.__version__)
print("Tensorflow version: ", tf.__version__)

Keras version:  2.4.3
Tensorflow version:  2.4.0


# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt                                                 # for visualisation
import seaborn as sns
import re                                                                       # for regular expression
import string                                                                   # for handling string
import math                                                                     # for math

                                         # for EDA

# Packages for data preparation
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

import keras
from keras import models
from keras import layers
from keras import regularizers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import tensorflow as tf

from sklearn.metrics import confusion_matrix                                    # for the modelling and plotting of curves
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
data = '500k_hotels.csv'
raw_df = pd.read_csv(data)
raw_df.head()

Unnamed: 0,sentiments,c_review
0,0,park outside beautiful angry make post availab...
1,1,real complaint location surroundings room amen...
2,1,location staff cute breakfast range nice back ...
3,0,location nice surroundings bar restaurant nice...
4,1,amaze location building romantic setting book ...


In [4]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   sentiments  515738 non-null  int64 
 1   c_review    510171 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.9+ MB


In [5]:
raw_df = raw_df[raw_df['c_review'].notna()]

In [6]:
raw_df['sentiments'].value_counts()

1    470327
0     39844
Name: sentiments, dtype: int64

In [7]:
df_0 = raw_df[raw_df['sentiments'] == 0].sample(n=5000)
df_1 = raw_df[raw_df['sentiments'] == 1].sample(n=5000)

In [8]:
df = pd.concat([df_0, df_1])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 99598 to 352834
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentiments  10000 non-null  int64 
 1   c_review    10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


# Hyper parameter tuning

In [10]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [11]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers

# Main settings
embedding_dim = 50
maxlen = 100


sentences = df['c_review'].values
y = df['sentiments'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size = 0.3, random_state = 42,stratify = y)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)




In [12]:

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen],
                  epochs=[10,20,30])
model = KerasClassifier(build_fn=create_model,
                        batch_size=10,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                          cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [13]:
# Evaluate results

score = grid_result.best_score_
params = grid_result.best_params_
TestAccuracy =test_accuracy
print("Best Score: {:.4f}".format(score))
print("Best Parameters: ",params)
print("Test Accuracy: {:.4f}".format(TestAccuracy))



Best Score: 0.7611
Best Parameters:  {'vocab_size': 8388, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 5, 'epochs': 20, 'embedding_dim': 50}
Test Accuracy: 0.7673


# Clear session

In [14]:
# from keras.backend import clear_session
# clear_session()