In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout

In [None]:
import spacy
import spacy.cli
from scipy import spatial
# Load the medium English pipeline, downloading it only when it is not installed yet,
# so that re-running the notebook does not trigger a fresh download every time.
SPACY_MODEL_NAME = "en_core_web_md"
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # spacy.load signals a missing model package with an OSError
    spacy.cli.download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


Definition of a plot function for training result visualization

In [None]:
def plot_results(history):
    """Plot the accuracy and loss curves recorded during a training run.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` attribute that maps each metric name
        to its list of per-epoch values (e.g. what ``model.fit`` returns).
        The keys "loss"/"accuracy" and their "val_"-prefixed counterparts
        are plotted when present; the short aliases "acc"/"val_acc" are
        accepted as well.

    The curves are looked up by name, not by column position, so the
    function also works when no validation data was used or when extra
    metrics were recorded.
    """
    hist_df = pd.DataFrame(history.history)
    # Normalise the alias names by key instead of overwriting all column names by position
    hist_df = hist_df.rename(columns={"acc": "accuracy", "val_acc": "val_accuracy"})
    # Number the epochs from 1 for the x axis
    hist_df.index = np.arange(1, len(hist_df) + 1)

    # (axis title, training column); the validation column is the same name prefixed with "val_"
    panels = (("Accuracy", "accuracy"), ("Loss", "loss"))

    fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
    for ax, (title, train_column) in zip(axs, panels):
        val_column = "val_" + train_column
        # Validation curve first, then training curve, so the line colours stay as before
        if val_column in hist_df.columns:
            ax.plot(hist_df[val_column], lw=3, label='Validation ' + title)
        if train_column in hist_df.columns:
            ax.plot(hist_df[train_column], lw=3, label='Training ' + title)
        ax.set_ylabel(title)
        ax.set_xlabel('Epoch')
        ax.grid()
        ax.legend(loc=0)

    plt.show()

IMDB database loading

In [None]:
# Mount Google Drive inside the Colab runtime so the data files can be read straight from it
from google.colab import drive

drive.mount('/content/drive')

# Read the IMDB reviews CSV stored on the mounted drive
imdb_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/EPITA_NLP/Course2/IMDB Dataset.csv"
)

Mounted at /content/drive


In [None]:
# These IMDB data are movie reviews, each labelled with a sentiment (positive/negative).
# head() must be called: the bare attribute would only display the bound method.
imdb_df.head()

In [None]:
# Number of rows in the loaded reviews DataFrame
len(imdb_df)

## Preprocessing of the data

We get the IMDB dataset

Here is the original code that transforms the texts into 300-dimensional embedding vectors with the spaCy pretrained model.
However, to save most of the computation time, we only run this operation **on a small subsample** of the complete data. In the **next cells**, we directly load the **embeddings already computed on all the data** and use them for training.

Let's have a look at an example review

In [None]:
# Pick the text of the review stored under index label 0 and display it
review_example = imdb_df.loc[0, "review"]
review_example

Let's see how to use spaCy to turn it into a 300-dimension **text** embedding vector

In [None]:
# Run the spaCy pipeline on the review, then take the vector of the resulting document
review_doc = nlp(review_example)
text_embedding_example = review_doc.vector
text_embedding_example

In [None]:
# Dimensions of the example embedding vector
text_embedding_example.shape

As explained just before, we only compute the embeddings of the first 100 elements in order to keep the computation time short

In [None]:
# Number of reviews embedded in this demonstration cell (the full dataset takes much longer)
size_data = 100

# Select only the first `size_data` rows instead of materialising every row of the DataFrame
reviews_subset = imdb_df.head(size_data)

# One float32 row vector per review; nlp.pipe runs the pipeline over the texts as a stream
# and -1 lets NumPy infer the embedding width instead of hard-coding it
list_embed = [
    np.asarray(doc.vector, dtype="float32").reshape(1, -1)
    for doc in nlp.pipe(reviews_subset["review"])
]

# Numeric labels: 1.0 for a "positive" review, 0.0 for anything else
list_label = [
    1.0 if observed_sentiment == "positive" else 0.0
    for observed_sentiment in reviews_subset["sentiment"]
]

In [None]:
# Show the numeric labels built for the subsample (1.0 = positive, 0.0 = otherwise)
print(list_label)

In [None]:
# Number of embeddings currently held in list_embed
len(list_embed)

To save processing time, we load the **embeddings already computed on the complete dataset**. Loading may still take some time, but much less than computing all the embeddings again.

In [None]:
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source
df_imdb_embed_label = pd.read_pickle(
    "/content/drive/MyDrive/EPITA_NLP/Course2/df_imdb_embed_label.pkl"
)

# Extract the two columns as plain Python lists, replacing the subsample lists
list_embed, list_label = (
    df_imdb_embed_label[column].tolist() for column in ("embedding", "label")
)

In [None]:
# Number of embeddings available after loading the precomputed file
len(list_embed)

## Use of a neural network to perform sentiment analysis from the spaCy embeddings

## Neural network model definition

Build a neural network using keras sequential layers

(you may have a look at https://keras.io/api/layers/)

In [None]:
# Question 1: Build a neural network using relevant layers, dimensions and activation function (the input layer is already defined to help you)
# Reference solution: one hidden layer that narrows the representation, then a single
# sigmoid unit whose output in (0, 1) can be compared with the 0.0 / 1.0 sentiment labels.
model = tf.keras.models.Sequential([
    Dense(300, activation="relu"),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])

We build the model and check that everything is fine

In [None]:
# Build the model by calling it once on the first embedding, then print its layer summary
# (assumes list_embed[0] is a (1, 300) array as produced by the embedding cell - TODO confirm for the pickled data)
model(list_embed[0])
model.summary()

We compile the model, choosing the relevant loss function, optimizer and metrics

(You may have a look at
https://keras.io/api/losses/
and
https://keras.io/api/optimizers/)

In [None]:
# Question 2: Choose a relevant loss function and optimizer for the training
# Binary cross-entropy is the usual loss for two-class targets encoded as 0.0 / 1.0;
# Adam is a robust default optimizer that needs no manual learning-rate tuning to start with.
loss_function = "binary_crossentropy"
optimizer = "adam"

model.compile(loss=loss_function, optimizer=optimizer,
              metrics=["accuracy"])

We train the model on the dataset

In [None]:
# Width of one embedding vector and maximum number of samples used for training
n_dim_embedding = 300
size_data = 10000

# Stack the first `size_data` embeddings into a 2-D array. The row count is inferred (-1)
# so the cell also works when fewer than `size_data` embeddings are available.
array_embed = np.asarray(list_embed[:size_data]).reshape(-1, n_dim_embedding)
array_label = np.asarray(list_label[:size_data])

In [None]:
# Question 3: Choose relevant values for epochs, batch_size and validation_split
# (Start with small values for epochs in order to save some computation time)
epochs = 10
batch_size = 32
validation_split = 0.2

# Pass the values chosen above to fit() instead of hard-coded ones
# (a batch size of 1 performs one weight update per sample and is very slow)
history = model.fit(x=array_embed, y=array_label, epochs=epochs,
                    batch_size=batch_size, validation_split=validation_split)
#history = model.fit(x = np.asarray(list_embed), y = np.asarray(list_label), epochs = epochs, batch_size= batch_size, validation_split=validation_split)

## Result visualization

In [None]:
# Draw the accuracy and loss curves of the training run stored in `history`
plot_results(history)

In [None]:
# Question 4: What can you say about the results? Do they seem satisfactory to you? Do you see any sign of over-fitting? If so, what kind of layers can you add to the Keras model in order to prevent this phenomenon?