In [None]:
import keras
import tensorflow as tf  
# Report the Keras and TensorFlow versions in use, one per line, so results
# can be tied to a specific library release.
for lib in (keras, tf):
    print(lib.__version__)

# Loading the Yelp review dataset

The goal is to predict the review rating from the comments left by Yelp users.

In [None]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# Read the cleaned Yelp reviews into a DataFrame. Later cells use its 'text'
# and 'review_rating' columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists.
df_yelp = pd.read_csv(
    filepath_or_buffer="/home/dagrawal1/Text_Analytics/CleanedYelpData.csv"
)

Convert the target variable (the review rating) into a one-hot encoded matrix.

In [None]:
# One-hot encode the integer ratings: each review becomes a row with a single 1
# in the column whose index equals its rating.
# (Presumably ratings run 1-5, which yields 6 columns with column 0 unused --
# TODO confirm against the data.)
y_binary = to_categorical(df_yelp.loc[:, 'review_rating'])

## Simple Network

Training data pre-processing (experimenting with the first 25,000 reviews):
1. Set the vocabulary size to 50,000 words.
2. Convert each comment into a sequence of integer word indices.
3. Pad or truncate each comment to a length of 200 words.

In [None]:
# Tokenise the first 25,000 reviews for the simple network.
# Only the `vocabulary_size` most frequent words are kept when texts are
# converted to integer sequences.
vocabulary_size = 50000
tokenizer = Tokenizer(num_words=vocabulary_size)

# Build the word index from the training subset, then encode that same subset.
tokenizer.fit_on_texts(df_yelp['text'].head(25000))
sequences = tokenizer.texts_to_sequences(df_yelp['text'].head(25000))

# Pad/truncate every review to exactly 200 tokens so they stack into one array.
data = pad_sequences(sequences=sequences, maxlen=200)

In [None]:
# Imports live in this cell so it runs on a fresh kernel: the original relied
# on Sequential/Embedding/LSTM/Dense being imported by a cell further down.
import time

from keras import Sequential, callbacks
from keras.layers import Embedding, LSTM, Dense

# Simple baseline: embedding -> single LSTM layer -> dense classifier.
# The embedding's vocabulary size (50000) and input length (200) match the
# tokenizer and padding settings used to build `data`.
simple_model = Sequential()
simple_model.add(Embedding(50000, 100, input_length=200))
simple_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Each review has exactly one rating, so this is single-label multi-class
# classification: use softmax with categorical cross-entropy. The previous
# sigmoid + binary cross-entropy pairing treats every output unit as an
# independent yes/no problem and reports an inflated per-unit accuracy.
# 6 units matches the width of the one-hot targets (assumes ratings 0..5 map
# to 6 columns -- TODO confirm y_binary.shape[1] == 6).
simple_model.add(Dense(6, activation='softmax'))
simple_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train and time the run; the checkpoint callback writes weights only, and
# only when the monitored metric improves.
stime = time.time()
Simple_lstm_history = simple_model.fit(
    data,
    y_binary[:25000],
    epochs=3,               # number of passes through the data
    batch_size=128,
    validation_split=0.4,   # fraction of the data held out for validation
    shuffle=True,           # reshuffle the training data each epoch
    callbacks=[
        callbacks.ModelCheckpoint(
            filepath='multi_weights_simple.h5',
            save_best_only=True,
            save_weights_only=True,
            verbose=1,
        )
    ],
)
etime = time.time()
print('Total time: '+str(etime-stime))

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(figsize=[15,8]);
plt.plot(Simple_lstm_history.history['acc'],'-ro',linewidth=2,label='LSTM Train');

plt.plot(Simple_lstm_history.history['val_acc'],':ro',linewidth=2,label='LSTM Test');

plt.xlabel('Epoch');
plt.ylabel('Accuracy');
plt.legend();

In [None]:
import numpy as np

# Encode the held-out reviews (rows 1,000,000-1,199,999) with the tokenizer
# fitted for the simple network, padded to the same length (200) the model
# was built with.
test_sequences = tokenizer.texts_to_sequences(df_yelp['text'][1000000:1200000])
test_data = pad_sequences(test_sequences, maxlen=200)

# Fixed: evaluate `simple_model` (this section's model). The original called
# `lstm_model`, which is not defined until the "Complex Network" section and
# expects a different vocabulary and sequence length.
# argmax over the class scores is what predict_classes did for multi-unit
# outputs, and also works on releases where predict_classes was removed.
predicted_classes = np.argmax(simple_model.predict(test_data), axis=-1)

In [None]:
import numpy as np
# Ground-truth ratings for the same held-out rows used to build test_data,
# copied into a plain NumPy array.
test_Y = np.array(df_yelp['review_rating'].iloc[1000000:1200000])

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out reviews whose predicted rating equals the true rating.
accuracy_score(y_true=test_Y, y_pred=predicted_classes)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt

# Confusion matrix of actual vs. predicted rating on the held-out reviews.
# Pin the label set so the matrix is always 5x5 and lines up with the axis
# labels below. Without `labels`, a predicted class 0 (the model has 6 output
# units) or a rating absent from this slice changes the matrix shape and the
# DataFrame construction fails.
# Assumes valid ratings are 1-5, as the original axis labels imply.
rating_labels = [1, 2, 3, 4, 5]
array = confusion_matrix(test_Y, predicted_classes, labels=rating_labels)

tick_labels = [str(r) for r in rating_labels]
df_cm = pd.DataFrame(array, index=tick_labels, columns=tick_labels)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, fmt="d")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label");

## Complex Network

Training data pre-processing (experimenting with the first 1 million reviews):
1. Set the vocabulary size to 80,000 words.
2. Convert each comment into a sequence of integer word indices.
3. Pad or truncate each comment to a length of 300 words.

In [None]:
### Build padded integer sequences for the complex network
# NOTE: this rebinds `vocabulary_size`, `tokenizer`, `sequences` and `data`
# from the simple-network section, so cells above must be re-run before they
# can be used with the simple model again.
vocabulary_size = 80000
tokenizer = Tokenizer(num_words=vocabulary_size)

# Fit the word index on the first 1,000,000 reviews, then encode those reviews.
tokenizer.fit_on_texts(df_yelp['text'].head(1000000))
sequences = tokenizer.texts_to_sequences(df_yelp['text'].head(1000000))

# Pad/truncate every review to exactly 300 tokens.
data = pad_sequences(sequences=sequences, maxlen=300)

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Larger-input network: same architecture as the simple model, but sized for
# the 80,000-word vocabulary and 300-token sequences prepared for this section.
lstm_model = Sequential()
lstm_model.add(Embedding(80000, 100, input_length=300))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Each review has exactly one rating, so use softmax with categorical
# cross-entropy. The previous sigmoid + binary cross-entropy pairing scores
# each output unit independently and reports an inflated per-unit accuracy.
# 6 units matches the width of the one-hot targets (assumes
# y_binary.shape[1] == 6 -- TODO confirm).
lstm_model.add(Dense(6, activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

In [None]:
import time
from keras import callbacks

# Train the complex network on the first 1,000,000 reviews and report the
# wall-clock duration. The checkpoint callback writes weights only, and only
# when its monitored metric improves.
stime = time.time()
lstm_history = lstm_model.fit(
    data,
    y_binary[:1000000],
    epochs=10,              # number of passes through the data
    batch_size=128,
    validation_split=0.4,   # fraction of the data held out for validation
    shuffle=True,           # reshuffle the training data each epoch
    callbacks=[
        callbacks.ModelCheckpoint(
            filepath='multi_weights_V80K_WL300.h5',
            save_best_only=True,
            save_weights_only=True,
            verbose=1,
        )
    ],
)
etime = time.time()
print('Total time: ' + str(etime - stime))

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(figsize=[15,8]);
plt.plot(lstm_history.history['acc'],'-ro',linewidth=2,label='LSTM Train');

plt.plot(lstm_history.history['val_acc'],':ro',linewidth=2,label='LSTM Test');

plt.xlabel('Epoch');
plt.ylabel('Accuracy');
plt.legend();

In [None]:
# Encode the held-out reviews (rows 1,000,000-1,199,999) with the tokenizer
# fitted for the complex network.
test_sequences = tokenizer.texts_to_sequences(df_yelp['text'][1000000:1200000])

# Fixed: pad to 300 tokens. lstm_model was built with input_length=300 and
# trained on sequences padded to 300; the previous maxlen=200 (copied from the
# simple network) does not match the model's input.
test_data = pad_sequences(test_sequences, maxlen=300)

In [None]:
import numpy as np
# Ground-truth ratings for the same held-out rows used to build test_data,
# copied into a plain NumPy array.
test_Y = np.array(df_yelp['review_rating'].iloc[1000000:1200000])

In [None]:
import numpy as np

# Predicted rating = index of the highest-scoring output unit per review.
# Sequential.predict_classes did exactly this for multi-unit outputs but has
# been removed from newer Keras/TensorFlow releases; argmax over predict()
# gives the same result on every version.
predicted_classes = np.argmax(lstm_model.predict(test_data), axis=-1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of held-out reviews whose predicted rating equals the true rating.
accuracy_score(y_true=test_Y, y_pred=predicted_classes)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt

# Confusion matrix of actual vs. predicted rating on the held-out reviews.
# Pin the label set so the matrix is always 5x5 and lines up with the axis
# labels below. Without `labels`, a predicted class 0 (the model has 6 output
# units) or a rating absent from this slice changes the matrix shape and the
# DataFrame construction fails.
# Assumes valid ratings are 1-5, as the original axis labels imply.
rating_labels = [1, 2, 3, 4, 5]
array = confusion_matrix(test_Y, predicted_classes, labels=rating_labels)

tick_labels = [str(r) for r in rating_labels]
df_cm = pd.DataFrame(array, index=tick_labels, columns=tick_labels)

plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True, fmt="d")
plt.ylabel("Actual Label")
plt.xlabel("Predicted Label");