In [None]:
# pip install nltk

In [1]:
import pandas as pd
import numpy
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
file = pd.read_csv("DisneylandReviews.csv", encoding='ISO-8859-1')
file

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl...",Disneyland_Paris


In [3]:
text = file["Review_Text"]
text

0        If you've ever been to Disneyland anywhere you...
1        Its been a while since d last time we visit HK...
2        Thanks God it wasn   t too hot or too humid wh...
3        HK Disneyland is a great compact park. Unfortu...
4        the location is not in the city, took around 1...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited Disneyland ...
42653    My eleven year old daughter and myself went to...
42654    This hotel, part of the Disneyland Paris compl...
42655    I went to the Disneyparis resort, in 1996, wit...
Name: Review_Text, Length: 42656, dtype: object

In [4]:
corpus = text.to_string()
corpus



In [5]:
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [6]:
input = tokenize_words(corpus)
input



In [10]:
chars = sorted(list(set(input)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [11]:
input_len = len(input)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 1496250
Total vocab: 37


In [12]:
seq_length = 50
x_data = []
y_data = []

In [13]:
for i in range(0, input_len - seq_length, 1):
    in_seq =input[i:i + seq_length]
    out_seq = input[i + seq_length]
    
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [14]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 1496200


In [15]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)
X

array([[[0.02702703],
        [0.        ],
        [0.40540541],
        ...,
        [0.78378378],
        [0.51351351],
        [0.81081081]],

       [[0.        ],
        [0.40540541],
        [0.86486486],
        ...,
        [0.51351351],
        [0.81081081],
        [0.        ]],

       [[0.40540541],
        [0.86486486],
        [0.40540541],
        ...,
        [0.81081081],
        [0.        ],
        [0.48648649]],

       ...,

       [[0.64864865],
        [0.37837838],
        [0.        ],
        ...,
        [0.27027027],
        [0.18918919],
        [0.        ]],

       [[0.37837838],
        [0.        ],
        [0.7027027 ],
        ...,
        [0.18918919],
        [0.        ],
        [0.89189189]],

       [[0.        ],
        [0.7027027 ],
        [0.2972973 ],
        ...,
        [0.        ],
        [0.89189189],
        [0.51351351]]])

In [16]:
y = np_utils.to_categorical(y_data)
y

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [19]:
filepath = "Resources/full_model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [20]:
model.fit(X, y, epochs=6, batch_size=400, callbacks=desired_callbacks)

Epoch 1/6

Epoch 00001: loss improved from inf to 2.06809, saving model to Resources/full_model_weights_saved.hdf5
Epoch 2/6

Epoch 00002: loss improved from 2.06809 to 1.58838, saving model to Resources/full_model_weights_saved.hdf5
Epoch 3/6

Epoch 00003: loss improved from 1.58838 to 1.44983, saving model to Resources/full_model_weights_saved.hdf5
Epoch 4/6

Epoch 00004: loss improved from 1.44983 to 1.36552, saving model to Resources/full_model_weights_saved.hdf5
Epoch 5/6

Epoch 00005: loss improved from 1.36552 to 1.30548, saving model to Resources/full_model_weights_saved.hdf5
Epoch 6/6

Epoch 00006: loss improved from 1.30548 to 1.26116, saving model to Resources/full_model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fcef813ef50>

In [21]:
filename = "Resources/full_model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [23]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" riety younge 30604 disneyland paris years ago 3060 "
