In [None]:
import re
import nltk
import uuid

import os
import numpy as np
import pandas as pd

import tensorflow as tf
import keras
from keras import models, layers, optimizers, regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical

# Record the framework versions this run used, for reproducibility.
for banner, lib in (('Keras version: ', keras), ('Tensorflow version: ', tf)):
    print(banner, lib.__version__)

In [None]:
import urllib.request
from urllib.parse import urljoin

# Local folder that receives the dataset and the helper modules.
data_location = './data'
# Raw-content root of the repository hosting the files (trailing '/' is required by urljoin).
base_data_url = 'https://raw.githubusercontent.com/idiWork/Experiment_102/master/'
filesToDownload = ['reviews_labels.txt', 'reviews_texts.txt', 'contractions.py', 'textanalytics.py']

os.makedirs(data_location, exist_ok=True)

# NOTE(review): two of these files are Python modules that a later cell imports,
# i.e. remotely hosted code gets executed. Pin a commit hash or vendor the
# modules if the source is not fully trusted.
for file in filesToDownload:
    # Build the URL with urljoin, not os.path.join: URLs always use '/', while
    # os.path.join would insert '\\' on Windows and produce a broken address.
    data_url = urljoin(base_data_url, file)
    local_file_path = os.path.join(data_location, file)
    urllib.request.urlretrieve(data_url, local_file_path)
    print('Downloaded file: ', file)

In [None]:
# Fetch the NLTK 'stopwords' resource (presumably needed by the textanalytics
# helper's normalization — TODO confirm against that module).
nltk.download('stopwords')

In [None]:
import sys
# The helper modules were downloaded into data_location, so that folder has to
# be on the import path before textanalytics can be imported.
sys.path.append(data_location)
import textanalytics as ta

In [None]:
# Read the raw reviews, one per line (trailing newlines are kept, as before).
# The context manager closes the file handle, which the bare open() inside a
# comprehension never did.
with open(os.path.join(data_location, 'reviews_texts.txt')) as reviews_file:
    reviews_corpus = list(reviews_file)
reviews_corpus

In [None]:
# Read one integer class label per line. int() already ignores the trailing
# newline, so no regex clean-up is needed, and the context manager closes the
# file handle that the bare open() used to leak.
with open(os.path.join(data_location, 'reviews_labels.txt')) as labels_file:
    labels = [int(line) for line in labels_file]
print(len(labels))
print(labels[0:5]) # first 5 labels
print(labels[20:25]) # middle 5 labels
print(labels[-5:]) # last 5 labels

In [None]:
# One-hot encode the integer labels into 3 columns (one per class).
# `labels` is overwritten in place, so guard on dimensionality: re-running this
# cell must not one-hot encode the already-encoded array a second time.
num_classes = 3
if np.ndim(labels) == 1:
    labels = to_categorical(labels, num_classes)
print(labels.shape)
print(labels[0:3]) # first 3 categorical labels
print(labels[-3:]) # last 3 categorical labels

In [None]:
# Fetch the NLTK 'punkt' resource (presumably used for tokenization inside the
# textanalytics helper — TODO confirm against that module).
nltk.download('punkt')

In [None]:
# Normalize the raw reviews with the downloaded helper; the exact cleaning
# steps are defined in textanalytics.normalize_corpus, not in this notebook.
norm_corpus = ta.normalize_corpus(reviews_corpus)
norm_corpus

In [None]:
# Vectorize the normalized corpus. The helper returns the fitted vectorizer
# (reused later for new text) together with the document-feature matrix
# (presumably TF-IDF weights, going by the variable name — TODO confirm).
vectorizer, tfidf_matrix = ta.build_feature_matrix(norm_corpus)

# Convert the matrix to a dense array; this is what the model is trained on.
data = tfidf_matrix.toarray()
print(data.shape)
data

In [None]:
# NOTE(review): this seeds NumPy only; TensorFlow's own RNG is not seeded in
# this notebook, so weight initialization may still vary between runs — confirm.
np.random.seed(125)

# Single hidden layer classifier: features -> 60 ReLU units (L2-regularized) -> 3 classes.
model = Sequential()
model.add(Dense(60, input_dim=data.shape[1], kernel_regularizer=regularizers.l2(0.02)))
model.add(Activation('relu'))
model.add(Dense(3))
# The labels are one-hot over three mutually exclusive classes and predictions
# are taken with argmax, so the output must be a probability distribution over
# the classes: softmax, not independent per-class sigmoids.
model.add(Activation('softmax'))

model.summary()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# NOTE(review): newer Keras releases spell this argument `learning_rate`;
# `lr` is kept because the rest of the notebook targets an older stack.
opt = keras.optimizers.Adam(lr=0.001)

# Each review has exactly one class (one-hot labels), so train with categorical
# cross-entropy. With 'binary_crossentropy' Keras resolves 'accuracy' to
# element-wise binary accuracy over the three outputs, which overstates how
# often the predicted class is actually correct.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
# Training hyperparameters.
epochs = 100
batch_size = 16

# NOTE(review): the held-out split doubles as validation data here, so the
# reported val_* metrics are not an independent test estimate.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
)

In [None]:
# Hand-written sample reviews used to check the trained model.
sample_texts = [
    'The room was very nice and the beds were especially comfortable.',
    'The kids loved going to the Kids Club at the swimming pool.',
    'The food was great and the buffet was priced very reasonably.',
]

# Apply the same normalization and the already-fitted vectorizer as for the
# training corpus, so the columns line up with the model's input features.
sample_normalized = ta.normalize_corpus(sample_texts)
sample_features = vectorizer.transform(sample_normalized)

test_reviews = sample_features.toarray()
print(test_reviews.shape)

In [None]:
# Score the sample reviews with the in-memory model and pick the most likely class.
pred = model.predict(test_reviews)
pred_label = pred.argmax(axis=1)

# One column per class score plus the winning class index.
prediction_columns = ['class_0_room', 'class_1_diner', 'class_2_pool', 'label']
prediction_table = np.hstack((pred, pred_label.reshape(-1, 1)))
pred_df = pd.DataFrame(prediction_table, columns=prediction_columns)
# Stacking with the float scores turned the label into a float; restore int.
pred_df['label'] = pred_df['label'].astype(int)
print('Predictions')
pred_df

In [None]:
# sklearn.externals.joblib was deprecated and then removed from scikit-learn;
# joblib is a standalone package (and a scikit-learn dependency). Fall back to
# the bundled copy only on very old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained network in HDF5 format.
output_folder = './output'
model_filename = 'final_model.hdf5'
os.makedirs(output_folder, exist_ok=True)
model.save(os.path.join(output_folder, model_filename))

# Persist the fitted vectorizer alongside the model: new text must be
# transformed with the same vectorizer before it can be scored.
vectorizer_name = 'vectorizer'
joblib.dump(vectorizer, os.path.join(output_folder, vectorizer_name))

In [None]:
from keras.models import load_model

# Read the saved network back from disk to check that the artifact is usable.
saved_model_path = os.path.join(output_folder, model_filename)
loaded_model = load_model(saved_model_path)
loaded_model.summary()

In [None]:
# Score the same sample reviews with the reloaded model; the table should match
# the one produced by the in-memory model.
pred = loaded_model.predict(test_reviews)
pred_label = pred.argmax(axis=1)

# One column per class score plus the winning class index.
prediction_columns = ['class_0_room', 'class_1_diner', 'class_2_pool', 'label']
prediction_table = np.hstack((pred, pred_label.reshape(-1, 1)))
pred_df = pd.DataFrame(prediction_table, columns=prediction_columns)
# Stacking with the float scores turned the label into a float; restore int.
pred_df['label'] = pred_df['label'].astype(int)
print('Predictions')
pred_df