In [1]:
# ! pip install nltk

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Objectives
1. Text cleaning
2. Text preprocessing for custom embedding Neural Network
3. Train RNN model for sentiment analysis

- This notebook will be your final deliverable. 
- Make sure it can run "restart and run all"
- Do not "clear output"

# 0. Load data

Our dataset contains 30,000 french reviews of movies, along with the binary class 1 (positive) or 0 (negative) score

In [3]:
# We load the dataset for you
data = pd.read_csv('https://wagon-public-datasets.s3.amazonaws.com/certification_paris_2021Q1/movies.csv')
data

Unnamed: 0,review,polarity
0,Ça commence à devenir énervant d'avoir l'impre...,0
1,"J'ai aimé ce film, si il ressemble a un docume...",1
2,Une grosse merde ce haneke ce faire produire p...,0
3,"Beau mélodrame magnifiquement photographié, ""V...",1
4,A la poursuite du diamant vers est un film pro...,1
...,...,...
29946,Le meilleur film de super-héros derrière le ba...,1
29947,Un drame qui est d'une efficacité remarquable....,1
29948,"Une daube hollywoodienne de plus, aucun intérê...",0
29949,Et voilà un nouveau biopic sur la star du X Li...,0


In [5]:
# Create features
y = data.polarity
X = data.review

# Analyse class balance
print(pd.value_counts(y))

1    15051
0    14900
Name: polarity, dtype: int64


In [8]:
# Check various reviews
print(f'polarity: {y[0]} \n')
print(X[0])

polarity: 0 

Ça commence à devenir énervant d'avoir l'impression de voir et revoir le même genre de film à savoir : la comédie romantique, surement le genre le plus prolifique de le production française actuelle. Le problème c'est que l'on a souvent affaire à des niaiseries de faible niveau comme celui ci. Avec un scenario ultra balisé et conventionnel, c'est à se demander comment ça peut passer les portes d'un producteur. Bref cette sempiternel histoire d'un homme mentant au nom de l'amour pour reconquérir une femme et qui à la fin se prend son mensonge en pleine figure est d'une originalité affligeante, et ce n'est pas la présence au casting de l'ex miss météo Charlotte Le Bon qui rêve surement d'avoir la même carrière que Louise Bourgoin qui change la donne.


# 1. Clean Text

❓ We need to give a quick & dirty cleaning to all the sentences in the dataset. Create a variable `X_clean` of similar shape, but with the following cleaning:
- Replace french accents by their non-accentuated equivalent using the [unidecode.unidecode()](https://pypi.org/project/Unidecode/) method
- Reduce all uppercases to lowercases
- Remove any characters outside of a-z, for instance using `string.isalpha()`

⚠️ You will be given the solution `X_clean` in the next question to start from clean data

In [16]:
# Setup words to removes
from nltk.tokenize import word_tokenize
from unidecode import unidecode

In [40]:
def clean_sentence(sentence):
    # replace accents etc...
    sentence_wo_accents = unidecode(sentence)
    sentence_lower = sentence_wo_accents.lower()
    # split sentence into individual words using nltk as better alternative than .split()
    words = word_tokenize(sentence_lower, language='french')
    # remove words that are not wanted
    words_cleaned = [w for w in words if w.isalpha()]
    # re-create sentence
    sentencence_cleaned = " ".join(words_cleaned)
    return sentencence_cleaned

In [42]:
X_clean = X.map(clean_sentence)

In [44]:
#X_clean.to_csv("X_clean_NLP.csv", index=False)

In [None]:
from nbresult import ChallengeResult

result = ChallengeResult('C14',
    shape = X_clean.shape
    first_sentence = X_clean[0]
)
result.write()

# Preprocess data

Now that we have clean sentences, we need to convert each one into a list of integer of fixed size
- For example, the sentence: `"this was good"` will become `[1, 3, 18, 0, 0, 0, ...0]`

❓ Create a numpy ndarray `X_input` of shape (29951, 100) that will be the direct input to your Neutral Network. 

- 29951 represents the number of reviews in the dataset `X_clean`
- 100 represents the maximum number of words to keep for each movie review.
- It must contain only numerical values of `dtype='float32'`, without any `NaN`

❓ In the process, compute and save the number of _unique_ words in your cleaned corpus under `vocab_size` variable

👉 First, you must start back from the solution below (14Mo)

In [48]:
X_clean = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/certification_paris_2021Q1/movies_X_clean.csv")['review']
X_clean

0        ca commence a devenir enervant de voir et revo...
1        aime ce film si il ressemble a un documentaire...
2        une grosse merde ce haneke ce faire produire p...
3        beau melodrame magnifiquement photographie ver...
4        a la poursuite du diamant vers est un film pro...
                               ...                        
29946    le meilleur film de derriere le batman de nola...
29947    un drame qui est efficacite remarquable un fil...
29948    une daube hollywoodienne de plus aucun interet...
29949    et voila un nouveau biopic sur la star du x li...
29950    un film qui fait vieux avec des acteurs pas to...
Name: review, Length: 29951, dtype: object

In [78]:
### YOUR CODE

In [49]:
# Spit clean sentences into list of words
X_clean_split = [sentence.split() for sentence in X_clean]

In [52]:
# Create an integer per unique word
word_to_id = {}
iter_ = 1
for sentence in X_clean_split :
    for word in sentence:
        if word in word_to_id:
            continue
        word_to_id[word] = iter_
        iter_ += 1

In [53]:
vocab_size = len(word_to_id)
vocab_size

62379

In [55]:
# Create the list of list of integers
X_token = [[word_to_id[w] for w in sentence if w in word_to_id] for sentence in X_clean_split]

In [57]:
# Then, pad them all into a same lenght of 100
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_input = pad_sequences(X_token, dtype='int32', padding='post', maxlen=100)

In [None]:
# X_intput = np.savetxt("X_input.csv", X_input, delimiter=",", dtype='int32')

In [90]:
from nbresult import ChallengeResult

result = ChallengeResult('C14-15',
    type_X = type(X_input), # numpy.ndarray
    shape = X_input.shape, # (29951, 100)
    one_input = X_input[1],
)

# Neural Network

❓Create and fit a Neural Netork that takes `X_input` and `y` as input, to binary classify each sentence's sentiment

- You cannot use transfer learning or other pre-existing Word2Vec models
- You must use a "recurrent" architecture to _capture_ a notion of order in the sentences' words
- The performance metrics for this task is "accuracy"
- You don't need to cross-validate your model: simply store the `history` of your model fit in a variable. It must comprises a validation_accuracy!

⚠️ We are no judging on your computer power. Feel free to stop after few epochs as soon as you see a score significantly better than 0.5

👉 First, you must start back from the solution below (70Mo)

In [69]:
X_input = np.genfromtxt('X_input.csv', delimiter=',', dtype='int32')

In [70]:
X_input_reloaded[1]

array([ 86,  72,  13,  87,  88,  89,   3,  36,  90,  91,   3,  15,  92,
        93,  10,  13,  94,  95,  96,  97,  98,  99, 100,  15, 101,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [91]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers 

def init_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size+1, output_dim=10, mask_zero=True))
    model.add(layers.LSTM(10))
    model.add(layers.Dense(5))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    return model

In [92]:
model = init_model(vocab_size=vocab_size)
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 10)          623800    
_________________________________________________________________
lstm_5 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_10 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 6         
Total params: 624,701
Trainable params: 624,701
Non-trainable params: 0
_________________________________________________________________


In [98]:
from tensorflow.keras.callbacks import EarlyStopping

model = init_model(len(word_to_id))

es = EarlyStopping(verbose=2, patience=2, restore_best_weights=True)

history_training = model.fit(X_input, y,
          epochs=20, 
          batch_size=64,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


In [100]:
history_training.history

{'loss': [0.46774721145629883,
  0.22969667613506317,
  0.1653718203306198,
  0.1353389322757721],
 'accuracy': [0.7735750079154968,
  0.9103267192840576,
  0.9365609288215637,
  0.9482947587966919],
 'val_loss': [0.29574984312057495,
  0.23980697989463806,
  0.24028624594211578,
  0.24784930050373077],
 'val_accuracy': [0.8830403089523315,
  0.9035165905952454,
  0.9015134572982788,
  0.9046294093132019]}