In [None]:
# Import TensorFlow, pathlib, pandas and the supporting libraries
import tensorflow as tf 
import pathlib 
import pandas as pd 
import os
import io
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [None]:
# Load the raw reviews CSV with pandas.
# Malformed rows are skipped instead of raising. `on_bad_lines="skip"` replaces
# the `error_bad_lines=False` flag, which was deprecated in pandas 1.3 and
# removed in pandas 2.0.
dataset = pd.read_csv("https://go.aws/314bBDq", on_bad_lines="skip", encoding="utf-8")
dataset.head()

In [None]:
# Keep only the rows whose `review_lang` column equals "french".
is_french = dataset["review_lang"] == "french"
french_reviews = dataset[is_french]
french_reviews.head()

In [None]:
# Keep only the two columns used in the rest of the notebook: the review text
# and its star rating.
# `.copy()` makes this an independent frame, so the columns added to it later
# are not written onto a slice of `dataset` (avoids SettingWithCopyWarning).
french_reviews = french_reviews[["review", "stars"]].copy()
french_reviews.head()

In [None]:
# Shell command: download the spaCy `fr_core_news_md` model package.
# The -q flag is passed along with the download command (presumably to keep
# the installer output quiet; confirm against the spaCy CLI docs).
!python -m spacy download fr_core_news_md -q


In [None]:
# Import the French spaCy model package (fr_core_news_md) and load it.
# The loaded object is bound to `nlp`, which the cleaning cell calls on each review.
import fr_core_news_md
nlp = fr_core_news_md.load()

In [None]:
# Import Stop words 
from spacy.lang.fr.stop_words import STOP_WORDS

In [None]:
# Clean the review texts to prepare them for training, in three passes.

# 1) Keep only alphanumeric characters, spaces and apostrophes.
french_reviews["review_clean"] = french_reviews["review"].apply(
    lambda x: ''.join(ch for ch in x if ch.isalnum() or ch == " " or ch == "'")
)

# 2) Collapse runs of spaces into a single space, lower-case and trim.
#    The pattern must be applied as a regex: a plain `str.replace(" +", " ")`
#    only matches the literal two characters space-plus and therefore never
#    collapses repeated spaces.
french_reviews["review_clean"] = (
    french_reviews["review_clean"]
    .str.replace(" +", " ", regex=True)
    .str.lower()
    .str.strip()
)

# 3) Lemmatize with spaCy and drop every token whose lemma or surface form
#    is a French stop word.
french_reviews["review_clean"] = french_reviews["review_clean"].apply(
    lambda x: " ".join(
        token.lemma_
        for token in nlp(x)
        if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)
    )
)

# Preview only the first rows rather than displaying the whole frame.
french_reviews.head()

In [None]:
# Load a reviews CSV hosted on S3 (already cleaned, judging by its file name
# `french_reviews_clean.csv`). This rebinds `french_reviews`, replacing the
# frame built by the cells above.
french_reviews = pd.read_csv("https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/sentiment-analysis/french_reviews_clean.csv")


In [None]:
# Boolean mask: True where `review_clean` holds an actual `str` value
# (non-string entries are presumably NaN from empty cells; verify).
# The value counts show how many rows fall on each side.
mask = french_reviews["review_clean"].apply(lambda text: type(text) is str)
mask.value_counts()

In [None]:
# Keep only the rows selected by `mask`.
# `.copy()` detaches the result from the original frame, because a new column
# is assigned to it further down (avoids SettingWithCopyWarning).
french_reviews = french_reviews.loc[mask, :].copy()


In [None]:
# Instantiate the Keras tokenizer with a vocabulary cap of num_words=1000 and
# an explicit out-of-vocabulary token, fit it on the cleaned reviews, then
# store each review as a list of integer token ids.
import numpy as np

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token="out_of_vocab",
    num_words=1000,
)
cleaned_texts = french_reviews.review_clean
tokenizer.fit_on_texts(cleaned_texts)
french_reviews["review_encoded"] = tokenizer.texts_to_sequences(cleaned_texts)

In [None]:
# Show the first rows of `french_reviews`.
french_reviews.head()

In [None]:
# Build a dataset directly from the un-padded encoded reviews.
# The encoded reviews are Python lists of different lengths, so they are packed
# into a RaggedTensor first: a pandas Series of variable-length lists cannot be
# converted to a regular (rectangular) tensor and makes from_tensor_slices raise.
# The target is `stars - 1`.
encoded_reviews = tf.ragged.constant(french_reviews.review_encoded.tolist())
full_ds = tf.data.Dataset.from_tensor_slices((encoded_reviews, french_reviews.stars.values-1))

In [None]:
# Pad the encoded reviews into one rectangular array; padding="post" appends
# the padding values after the tokens of each review.
reviews_pad = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=french_reviews.review_encoded,
    padding="post",
)

In [None]:
# Hold out 30% of the padded reviews (and their star ratings) for validation.
# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible across kernel restarts.
xtrain, xval, ytrain, yval = train_test_split(
    reviews_pad,
    french_reviews.stars,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Wrap each (features, targets) split in a tf.data.Dataset.
train, val = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((xtrain, ytrain), (xval, yval))
)

In [None]:
# Shuffle the whole training set (buffer = dataset size) and batch by 64.
train_batch = train.shuffle(len(train)).batch(64)
# The validation set is only batched: its metrics are averaged over all
# examples, so shuffling it changes nothing and only costs a full-size
# shuffle buffer.
val_batch = val.batch(64)

In [None]:
# Print one training batch (inputs and targets) as a sanity check.
# `review` and `star` stay bound to that batch after the loop.
single_batch = train_batch.take(1)
for review, star in single_batch:
    print(review, star)

In [None]:
# Build a small model whose first layer learns a word embedding.
vocab_size = tokenizer.num_words
model = tf.keras.Sequential([
    # Word-embedding input layer: vocab_size+1 rows of 8-dimensional vectors.
    # The input length is read from the padded array itself rather than from
    # `review`, a loop variable left over from the batch-preview cell, so this
    # cell no longer depends on that cell having been run.
    tf.keras.layers.Embedding(vocab_size+1, 8, input_shape=[reviews_pad.shape[1],], name="embedding"),

    # Global average pooling over the sequence dimension.
    tf.keras.layers.GlobalAveragePooling1D(),

    # Regular fully connected hidden layer.
    tf.keras.layers.Dense(16, activation='relu'),

    # Output layer: a single unit with a linear activation, i.e. one
    # real-valued prediction per review (not a softmax over classes).
    tf.keras.layers.Dense(1, activation="linear")
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()


In [None]:
# Compile for regression: Adam with default settings, mean squared error as
# the loss, and mean absolute error as the reported metric.
optimizer = tf.keras.optimizers.Adam()

model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
    optimizer=optimizer,
)

In [None]:
# Train for 20 epochs on the training batches, evaluating on the validation
# batches after each epoch; the returned History object is kept for plotting.
history = model.fit(train_batch, validation_data=val_batch, epochs=20)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
# Each curve carries a label and the figure a legend and title, so the plot
# can be read on its own (blue = training, red = validation).
plt.plot(history.history["loss"], color="b", label="train")
plt.plot(history.history["val_loss"], color="r", label="validation")
plt.title("Loss per epoch")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
# Training vs. validation mean absolute error per epoch.
# (The model reports MAE, not accuracy.) Each curve carries a label and the
# figure a legend and title, so the plot can be read on its own.
plt.plot(history.history["mean_absolute_error"], color="b", label="train")
plt.plot(history.history["val_mean_absolute_error"], color="r", label="validation")
plt.title("Mean absolute error per epoch")
plt.ylabel("mean_absolute_error")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
# Export the learned embedding as two tab-separated files: one row of vector
# components per word in vectors.tsv, and the matching word on the same row of
# metadata.tsv (the vectors + metadata layout used by embedding viewers such as
# the TensorFlow Embedding Projector).
weights = model.get_layer('embedding').get_weights()[0]

# Pair every exported word with ITS OWN token id. `tokenizer.index_word` is
# keyed from 1 (0 is padding), so indexing a plain list of its values against
# `weights` shifts every word by one row. Ids at or above `num_words` are never
# produced by `texts_to_sequences`, so they are left out.
max_index = min(tokenizer.num_words or weights.shape[0], weights.shape[0])
indexed_vocab = [
    (index, tokenizer.index_word[index])
    for index in range(1, max_index)  # start at 1: row 0 is the padding row
    if index in tokenizer.index_word
]
vocab = [word for _, word in indexed_vocab]

log_dir = "/content/logs/embed"
os.makedirs(log_dir, exist_ok=True)

# `with` guarantees both files are closed even if a write fails.
with io.open(log_dir+"/vectors.tsv", 'w', encoding='utf-8') as out_v:
    with io.open(log_dir+"/metadata.tsv", 'w', encoding='utf-8') as out_m:
        for index, word in indexed_vocab:
            vec = weights[index]
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")
            out_m.write(word + "\n")