In [1]:
import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.transforms import Bbox
from random import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

In [19]:
# Load the TripAdvisor review export and preview its first two rows.
reviews = pd.read_csv(filepath_or_buffer="../tripadvisor_dataset/reviews.csv")
reviews.head(n=2)

Unnamed: 0,id,reviewer name,title,date,rating,review
0,13969825,bertd818,supper snelle en lekkere lunch,"September 26, 2022",5.0,"supper lekker gegeten tijdens de middag, als w..."
1,13969825,593laetitiad,Un délicieux repas aux saveurs de la Thaïlande,"September 24, 2022",5.0,Un menu lunch très bien équilibré aux niveaux ...


In [20]:
# Keep only the two extremes of the rating scale: 5-star reviews are the
# positive class, 1-star reviews the negative class.
reviews_positive = reviews.loc[reviews["rating"].eq(5)]
reviews_negative = reviews.loc[reviews["rating"].eq(1)]
# Show both class sizes: negative via stdout, positive as the cell output.
print(reviews_negative.shape)
reviews_positive.shape

(8060, 6)


(58117, 6)

In [21]:
# Balance the classes by keeping the first 2500 rows of each, then stack them
# (negatives first) into the frame used to build the text corpus.
reviews_negative, reviews_positive = (
    subset.iloc[:2500] for subset in (reviews_negative, reviews_positive)
)
reviews = pd.concat([reviews_negative, reviews_positive])
reviews

Unnamed: 0,id,reviewer name,title,date,rating,review
57,13969825,398maartjeg,"Wel ok, maar zeker niet de allerbeste","June 11, 2022",1.0,Bediening is aardig maar eten is niet heel spe...
118,13969825,michellD9555DF,Waardeloze service,"March 5, 2022",1.0,Niet eens aan eten toegekomen. Je reserveert e...
141,13969825,143andreic,Ungenügend,"January 6, 2022",1.0,Wir haben nicht reserviert kamen rein und habe...
208,13969825,559zul,teleurstelling,"September 9, 2021",1.0,Be ready to wait LONG for your food.\nWe had t...
494,13969825,sv20172017,Rendement per m2 optimaal,"January 3, 2020",1.0,Na het lezen van de recentie van Gault & Milla...
...,...,...,...,...,...,...
2946,10032417,593sannew,Excellent !,"August 6, 2018",5.0,Wat een prachtige plek en een enorm toffe eige...
2947,10032417,513veerlev,Love the experience,"August 3, 2018",5.0,It is not a place where you just have somethin...
2948,10032417,76sandrar,Lovely Lunch and Owner!,"July 30, 2018",5.0,"Not only is the place super cute and original,..."
2949,10032417,PuchPuch,Tres bon goûter sur une agréable,"July 7, 2018",5.0,Le lieu mérite une visite ne serait-ce que pou...


In [22]:
# Dump every review to reviews.txt (one write per review, each followed by a
# newline), then read the file back as one string: the training corpus.
# Using `with` for the write guarantees the handle is closed (and the buffer
# flushed) even if a write raises part-way through.
with open("reviews.txt", "w", encoding="utf-8") as reviews_txt:
    for rev in reviews["review"]:
        # str() keeps non-string cells writable (e.g. a missing value
        # becomes the literal text "nan").
        reviews_txt.write(str(rev) + "\n")

with open("reviews.txt", "r", encoding="utf-8") as f:
    reviews_cleaned = f.read()


In [23]:
# Sanity check: size of the training corpus, in characters.
corpus_length = len(reviews_cleaned)
print(corpus_length)

2159953


## Preprocessing data

In [5]:
# Character-level tokenizer for the review corpus.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# NOTE: the raw string (not a one-element list) is passed here. A later cell
# reads `tokenizer.document_count` as the total number of characters, which
# relies on the tokenizer treating every character of the string as its own
# "document". Wrapping the argument in a list would break that cell.
tokenizer.fit_on_texts(reviews_cleaned)
# Vocabulary size; used below as the one-hot depth and the output layer width.
max_id = len(tokenizer.word_index)



In [6]:
# The tokenizer was fitted on the raw string, so its document count is used
# here as the total number of characters in the corpus.
dataset_size = tokenizer.document_count
# Encode the whole corpus as one sequence of IDs and subtract 1 so the IDs
# can be used directly as one-hot indices (presumably Keras IDs start at 1).
(encoded,) = np.array(tokenizer.texts_to_sequences([reviews_cleaned])) - 1
# The first 90% of the characters form the training set.
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

2022-12-19 13:23:12.859769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-19 13:23:15.399710: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30974 MB memory:  -> device: 0, name: Tesla V100-SXM3-32GB, pci bus id: 0000:96:00.0, compute capability: 7.0


In [7]:
# Each window holds n_steps input characters plus one extra, because the
# target sequence is the input shifted one character ahead.
n_steps = 100
window_length = 1 + n_steps
# Slide the window one character at a time; incomplete tail windows are dropped.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [8]:
def _window_to_tensor(window):
    """Collapse one nested window dataset into a single tensor of IDs."""
    return window.batch(window_length)


def _split_input_target(windows):
    """Return (all but the last char, all but the first char) of each window."""
    return windows[:, :-1], windows[:, 1:]


batch_size = 32
# Flatten the windows, shuffle them, batch them, and finally separate every
# batch into inputs and next-character targets.
dataset = (
    dataset.flat_map(_window_to_tensor)
    .shuffle(10000)
    .batch(batch_size)
    .map(_split_input_target)
)

In [9]:
def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input IDs; the target IDs are passed through as-is."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


dataset = dataset.map(_one_hot_inputs)

In [10]:
# Prefetch one batch ahead of the consumer.
dataset = dataset.prefetch(buffer_size=1)
# Pull a single batch to confirm the shapes of the inputs and the targets.
for X_batch, Y_batch in dataset.take(count=1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 2864) (32, 100)


In [11]:
# Two stacked GRU layers followed by a per-time-step softmax over the
# character vocabulary.
# NOTE(review): only input dropout is set; recurrent_dropout is left at its
# default, presumably so the GRU layers stay eligible for the cuDNN kernel.
# Confirm before changing it.
model = keras.models.Sequential()
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        input_shape=[None, max_id],
    )
)
model.add(keras.layers.GRU(128, return_sequences=True, dropout=0.2))
model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    )
)
# The targets are integer character IDs (only the inputs are one-hot encoded),
# hence the sparse variant of the cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
history = model.fit(dataset, epochs=10)

Epoch 1/10


2022-12-19 13:23:31.785806: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8500


 129719/Unknown - 9558s 74ms/step - loss: 1.6625

KeyboardInterrupt: 

In [12]:
def preprocess(texts):
    """Encode texts the same way as the training data.

    Args:
        texts: A list of strings. The encoded sequences are stacked with
            ``np.array``, so this assumes they all have the same length
            (every caller in this notebook passes a single text).

    Returns:
        A one-hot tensor of depth ``max_id`` built from the character IDs
        shifted down by 1.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    zero_based_ids = char_ids - 1
    return tf.one_hot(zero_based_ids, depth=max_id)

In [13]:
# Ask the model for the most likely next character at every position of a
# short prompt.
X_new = preprocess(["How are yo"])
Y_proba = model(X_new)
Y_pred = np.argmax(Y_proba, axis=-1)
# Add 1 to map the 0-based predictions back to tokenizer IDs before decoding.
predicted_texts = tokenizer.sequences_to_texts(Y_pred + 1)
predicted_texts[0][-1]  # 1st sentence, last char

'u'

In [14]:
def next_char(text, temperature=1):
    """Sample the character that follows ``text`` from the model.

    Args:
        text: Seed string; it is encoded with ``preprocess`` and fed to the
            model as a single-element batch.
        temperature: Strictly positive divisor applied to the
            log-probabilities before sampling. Values below 1 make the
            sampling more conservative, values above 1 more random.

    Returns:
        The sampled character, decoded with ``tokenizer.sequences_to_texts``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Zero would
            divide the log-probabilities by zero (inf/NaN logits) and a
            negative value would invert the distribution.
    """
    # Written as `not > 0` so that NaN is rejected as well.
    if not temperature > 0:
        raise ValueError(
            f"temperature must be strictly positive, got {temperature!r}"
        )
    X_new = preprocess([text])
    # Keep only the distribution predicted at the last time step, with a
    # leading axis of size 1 as tf.random.categorical expects 2-D logits.
    y_proba = model(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 converts the 0-based sampled class back to a tokenizer ID.
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]



In [15]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters, one sampled character at a time.

    Every step passes the text generated so far (seed included) to
    ``next_char`` together with ``temperature`` and appends the result.

    Returns:
        The seed text followed by the generated characters.
    """
    completed = text
    for _step in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed



In [45]:
# Generate 100 more characters from a short seed at a low sampling temperature.
print(complete_text("good food", n_chars=100, temperature=0.2))

good food. the staff were very friendly and the staff were very friendly and service was very friendly and se


## Save the model

In [24]:
# Persist the model to a local directory so it can be reloaded later.
model.save(filepath='./review_generator_v1/')



INFO:tensorflow:Assets written to: ./review_generator_v1/assets


INFO:tensorflow:Assets written to: ./review_generator_v1/assets
