**Exercice 1**

In [None]:
import urllib.request
import os


def download_text8(url, target_folder):
    """Download the text8 archive into ``target_folder`` unless it is already there.

    The archive is first written to ``text8.zip.part`` and only renamed to
    ``text8.zip`` once the transfer has finished. An interrupted download
    therefore never leaves a truncated ``text8.zip`` behind that a later run
    would mistake for a complete file.

    Args:
        url: Location of the archive to fetch.
        target_folder: Directory that receives ``text8.zip``; created if missing.

    Returns:
        The path of the local ``text8.zip`` file.
    """
    os.makedirs(target_folder, exist_ok=True)
    target_file = os.path.join(target_folder, 'text8.zip')

    # Nothing to do when a complete archive is already present.
    if os.path.isfile(target_file):
        return target_file

    partial_file = target_file + '.part'
    print("Downloading text8 dataset...")
    try:
        urllib.request.urlretrieve(url, partial_file)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete file, then re-raise.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    # Publish the archive under its final name only once it is complete.
    os.replace(partial_file, target_file)
    print("Download complete.")
    return target_file


if __name__ == "__main__":
    # Fetch the corpus archive into the folder that the training cell reads
    # from ('text8_dataset/text8.zip').
    target_folder = "text8_dataset"
    text8_url = "http://mattmahoney.net/dc/text8.zip"

    download_text8(url=text8_url, target_folder=target_folder)

Downloading text8 dataset...
Download complete.


In [None]:
# This example is using a small chunk of Wikipedia articles to train from.

from __future__ import division, print_function, absolute_import

import collections
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

# # Set Parameters

# Training Parameters.
# Step size handed to the SGD optimizer.
learning_rate = 0.1
# Number of (word, context) pairs processed at each iteration.
batch_size = 100
# Total number of training iterations.
num_steps = 300000
# How often (in steps) the loss is printed.
display_step = 10000
# How often (in steps) the model is evaluated.
# NOTE(review): with num_steps = 300000 the evaluation only runs at steps 1 and
# 200000, never on the final step - confirm this is intended.
eval_step = 200000

# Evaluation Parameters: words used to judge the quality of the embeddings
# (bytes, like the corpus tokens).
eval_words = [b'five', b'of', b'going', b'hardware', b'american', b'britain']

# Word2Vec Parameters.
embedding_size = 200   # Each word gets a 200-dimensional vector.
# Total number of different words in the vocabulary.
max_vocabulary_size = 50000
min_occurrence = 10  # Remove all words that do not appear at least n times.
skip_window = 3  # Number of context words on the left/right of the center word.
num_skips = 2  # How many (center, context) pairs are generated per center word.
num_sampled = 64  # Number of negative examples drawn for the NCE loss.

# Load the corpus: text_words holds every whitespace-separated token of the
# archive's first member, lower-cased (the tokens are bytes objects).
data_path = 'text8_dataset/text8.zip'
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()

# Frequency table: the 'UNK' placeholder first, followed by the
# (max_vocabulary_size - 1) most frequent words in decreasing order of count.
count = [('UNK', -1)]
count += collections.Counter(text_words).most_common(max_vocabulary_size - 1)

# The table is sorted by frequency, so every word rarer than 'min_occurrence'
# sits at the tail: trim from the end until a frequent-enough word is reached.
while count and count[-1][1] < min_occurrence:
    count.pop()

# Compute the vocabulary size.
vocabulary_size = len(count)

# word2id gives each kept word a unique id: its rank in the frequency table
# ('UNK' is id 0).
word2id = {word: rank for rank, (word, _) in enumerate(count)}

# Encode the corpus as ids; words outside the vocabulary map to 0 ('UNK').
data = [word2id.get(word, 0) for word in text_words]
unk_count = data.count(0)
count[0] = ('UNK', unk_count)

# Reverse dictionary: recover a word from its id.
id2word = {word_id: word for word, word_id in word2id.items()}

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])

# Read position in the encoded corpus `data`, shared between successive calls
# to next_batch().
data_index = 0


def next_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch of (center word, context word) pairs.

    A window of ``2 * skip_window + 1`` ids slides over the module-level
    ``data`` list, starting at the module-level ``data_index``. For every
    window, ``num_skips`` distinct context positions are drawn at random
    (the center position is excluded) and paired with the center id.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Number of pairs generated per center word; at most
            ``2 * skip_window``.
        skip_window: Number of context words on each side of the center word.

    Returns:
        A tuple ``(batch, labels)`` of int32 arrays: ``batch`` has shape
        ``(batch_size,)`` and holds the center ids, ``labels`` has shape
        ``(batch_size, 1)`` and holds the matching context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    corpus_len = len(data)
    # span is the full window size: the center word plus both context sides.
    span = 2 * skip_window + 1
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Restart from the beginning when a full window no longer fits.
    if data_index + span > corpus_len:
        data_index = 0
    # Bounded deque: appending one id on the right drops the oldest on the left.
    window = collections.deque(data[data_index:data_index + span], maxlen=span)
    data_index += span

    # Every window position except the center one is a candidate context.
    context_offsets = [offset for offset in range(span) if offset != skip_window]

    row = 0
    for _ in range(batch_size // num_skips):
        center = window[skip_window]
        for offset in random.sample(context_offsets, num_skips):
            batch[row] = center
            labels[row, 0] = window[offset]
            row += 1
        if data_index == corpus_len:
            # End of the corpus reached: refill the window from the start.
            window.extend(data[0:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch.
    data_index = (data_index + corpus_len - span) % corpus_len
    return batch, labels


# Pin the trainable variables to the CPU
# (some ops are not compatible on GPU).
with tf.device('/cpu:0'):
    # Embedding table: one row per vocabulary word, randomly initialised.
    embedding = tf.Variable(
        initial_value=tf.random.normal(shape=[vocabulary_size, embedding_size]))
    # Weights and biases used by the NCE loss (one row / one entry per word).
    nce_weights = tf.Variable(
        initial_value=tf.random.normal(shape=[vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(initial_value=tf.zeros(shape=[vocabulary_size]))

def get_embedding(x):
    """Return the embedding vectors of the words whose ids are given in ``x``.

    The lookup is done in the module-level ``embedding`` table, on the CPU.
    """
    with tf.device('/cpu:0'):
        return tf.nn.embedding_lookup(embedding, x)

def nce_loss(x_embed, y):
    """Return the mean NCE loss of a batch.

    Scores the input embeddings against their true context words and
    ``num_sampled`` sampled negative classes, which avoids a softmax over
    the whole vocabulary.

    Args:
        x_embed: Embedding vectors of the center words.
        y: Ids of the true context words; cast to int64 before use.

    Returns:
        A scalar tensor: the loss averaged over the batch.
    """
    with tf.device('/cpu:0'):
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=tf.cast(y, tf.int64),
            inputs=x_embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size,
        )
        return tf.reduce_mean(per_example_loss)

def evaluate(x_embed):
    """Return the cosine similarity between each query vector and every word.

    Both the query vectors and the rows of the module-level ``embedding``
    table are divided by their own L2 norm before the matrix product, so
    entry ``[i, j]`` is the cosine of the angle between query ``i`` and the
    embedding of word ``j``.

    Args:
        x_embed: Query embedding vectors, one per row
            (assumed to be of shape ``[n, embedding_size]``).

    Returns:
        A float32 tensor with one row per query and one column per
        vocabulary word.
    """
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed, tf.float32)
        # Normalise each query by its OWN norm (row-wise reduction); reducing
        # over the whole batch would scale all rows by one shared factor and
        # the result would not be a cosine similarity.
        x_embed_norm = x_embed / tf.sqrt(
            tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))
        # tf.sqrt only takes the tensor (plus an optional op name), so no
        # dtype argument is passed here.
        embedding_norm = embedding / tf.sqrt(
            tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
        cosine_sim_op = tf.matmul(
            x_embed_norm, embedding_norm, transpose_b=True)
        return cosine_sim_op


# Optimizer: plain Stochastic Gradient Descent with the configured step size.
optimizer = tf.optimizers.SGD(learning_rate)


def run_optimization(x, y):
    """Run one SGD step on a batch of (center word id, context word id) pairs.

    Args:
        x: Ids of the center words.
        y: Ids of the matching context words.
    """
    # The three variables updated by the step.
    trainable = [embedding, nce_weights, nce_biases]
    with tf.device('/cpu:0'):
        # Record the forward pass so it can be differentiated automatically.
        with tf.GradientTape() as tape:
            batch_loss = nce_loss(get_embedding(x), y)

        # Gradients of the loss with respect to each variable, in order.
        grads = tape.gradient(batch_loss, trainable)

        # Apply the update to the embeddings and the NCE weights / biases.
        optimizer.apply_gradients(zip(grads, trainable))


# Ids of the evaluation words (each of them must be in the vocabulary).
x_test = np.array([word2id[w] for w in eval_words])

top_k = 8  # number of nearest neighbors.

# Run training for the given number of steps.
for step in range(1, num_steps + 1):
    batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
    run_optimization(batch_x, batch_y)

    # Report the loss on the batch that was just used for the update.
    if step % display_step == 0 or step == 1:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))

    # Evaluation: print the words whose vectors are closest to each test word.
    if step % eval_step == 0 or step == 1:
        print("Evaluation...")
        sim = evaluate(get_embedding(x_test)).numpy()
        for eval_word, similarities in zip(eval_words, sim):
            # Sort by decreasing similarity; rank 0 is skipped because it is
            # expected to be the test word itself.
            nearest = (-similarities).argsort()[1:top_k + 1]
            neighbors = ''.join(
                ' %s,' % id2word[nearest[k]] for k in range(top_k))
            print('"%s" nearest neighbors:%s' % (eval_word, neighbors))

Words count: 17005207
Unique words: 253854
Vocabulary size: 47135
Most common words: [('UNK', 444176), (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764), (b'in', 372201), (b'a', 325873), (b'to', 316376), (b'zero', 264975), (b'nine', 250430)]
step: 1, loss: 507.480927
Evaluation...
"b'five'" nearest neighbors: b'parapsychology', b'trumpets', b'oncology', b'narrators', b'seminar', b'jacobus', b'weak', b'ebola',
"b'of'" nearest neighbors: b'mallet', b'imply', b'wozniak', b'alabaster', b'cod', b'resemblance', b'usefully', b'redemption',
"b'going'" nearest neighbors: b'messages', b'schwarzschild', b'stamps', b'shrub', b'minsk', b'holst', b'angola', b'femininity',
"b'hardware'" nearest neighbors: b'closeness', b'ahura', b'replaced', b'pinpoint', b'gunships', b'reshaping', b'rectal', b'ability',
"b'american'" nearest neighbors: b'edo', b'hence', b'neal', b'nonviolent', b'romantics', b'exegetical', b'vandals', b'stylings',
"b'britain'" nearest neighbors: b'pall', b'marseil

In [None]:
def afficher_contextes(corpus, target_word, window_size):
    """Print every context word seen around ``target_word`` in ``corpus``.

    The word is looked up in the module-level ``word2id`` dictionary; when it
    is absent a message is printed and nothing else happens. Otherwise the
    function prints the word's id, how many times it occurs in ``corpus`` and
    the list of words found at most ``window_size`` positions before or after
    each occurrence (decoded with the module-level ``id2word``).

    Args:
        corpus: Sequence of word ids.
        target_word: Word to look for, in the same form as the ``word2id`` keys.
        window_size: Number of neighbours inspected on each side.
    """
    if target_word not in word2id:
        print(f' Le mot "{target_word}" est introuvable dans le vocabulaire.')
        return

    target_id = word2id[target_word]
    print(f' Le mot "{target_word}" est présent avec l\'ID : {target_id}')
    print(' Recherche dans le corpus...')

    corpus_len = len(corpus)
    # Positions at which the target word occurs.
    positions = [pos for pos, word_id in enumerate(corpus) if word_id == target_id]

    # Neighbours of every occurrence, clipped to the corpus boundaries and
    # excluding the occurrence itself.
    contextes = [
        id2word[corpus[neighbor]]
        for pos in positions
        for neighbor in range(max(0, pos - window_size),
                              min(corpus_len, pos + window_size + 1))
        if neighbor != pos
    ]

    print(f' Le mot "{target_word}" apparaît {len(positions)} fois dans le corpus.')
    print(f' Contextes vus : {contextes}')


In [None]:
# Print the context words found within `skip_window` positions of every
# occurrence of b"private" in the encoded corpus `data`.
afficher_contextes(data, b"private", skip_window)


 Le mot "b'private'" est présent avec l'ID : 818
 Recherche dans le corpus...
 Le mot "b'private'" apparaît 2098 fois dans le corpus.


In [None]:

# Reinstall gensim (imported by the Exercise 2 cell below) even if a copy is
# already present; --no-deps leaves its installed dependencies untouched.
!pip install --force-reinstall --no-deps --quiet gensim


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[?25h

**Exercice 2**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Pretrained word2vec vectors; only the first 1,000,000 entries of the file
# are loaded.
embedding_file = './GoogleNews-vectors-negative300.bin.gz'
word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True, limit=1000000)

# Training reviews: the CSV has no header row, so the column names are given.
df = pd.read_csv('yelp_review_polarity_csv/train.csv', names=['sentiment', 'review'])
# Map the labels {1, 2} to {0, 1}. The result is assigned back to the column:
# calling replace(..., inplace=True) on `df['sentiment']` is chained
# assignment, which does not modify `df` under pandas Copy-on-Write.
df['sentiment'] = df['sentiment'].replace({1: 0, 2: 1})
# Shuffle reproducibly and keep the first 100,000 reviews.
df = df.sample(frac=1, random_state=1)[:100000]

# 85 % of the reviews for training, the rest for validation.
train_df, val_df = train_test_split(df, train_size=0.85, random_state=1)
train_texts = train_df['review']
y_train = np.array(train_df['sentiment'])
val_texts = val_df['review']
y_val = np.array(val_df['sentiment'])

# Word-level tokenizer limited to the 20,000 most frequent words; digits and
# punctuation are filtered out and the text is lower-cased.
# The backslash in the filter list is written as '\\' so that the literal does
# not rely on the invalid escape sequence '\]' (same runtime string).
tokenizer = keras.preprocessing.text.Tokenizer(num_words=20000,
                                               filters='0123456789!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                                               lower=True)
# The vocabulary is learned from the training reviews only.
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_val = tokenizer.texts_to_sequences(val_texts)

# Bring every review to exactly MAX_REVIEW_LEN token ids.
MAX_REVIEW_LEN = 200
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_REVIEW_LEN)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAX_REVIEW_LEN)

embedding_dim = 300
# One row per tokenizer id plus one extra row
# (presumably id 0 is reserved for padding - TODO confirm).
num_tokens = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros(shape=(num_tokens, embedding_dim))
for token, token_id in tokenizer.word_index.items():
    if not word_vectors.has_index_for(token):
        # No pretrained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[token_id] = word_vectors[token]

# Frozen embedding layer initialised with the pretrained vectors.
embedding_layer = layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=MAX_REVIEW_LEN,
    trainable=False,
)

# Classifier: averaged word embeddings followed by two dense layers and a
# sigmoid output for the binary sentiment label.
model = keras.Sequential([
    embedding_layer,
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once the validation loss has not improved for 3 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

# Held-out test reviews, prepared exactly like the training data.
test_df = pd.read_csv('yelp_review_polarity_csv/test.csv', names=['sentiment', 'review'])
# Map the labels {1, 2} to {0, 1}. The result is assigned back to the column:
# calling replace(..., inplace=True) on `test_df['sentiment']` is chained
# assignment, which does not modify `test_df` under pandas Copy-on-Write.
test_df['sentiment'] = test_df['sentiment'].replace({1: 0, 2: 1})
X_test = tokenizer.texts_to_sequences(test_df['review'])
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_REVIEW_LEN)
y_test = np.array(test_df['sentiment'])

# Last expression of the cell: the notebook displays the returned metrics.
model.evaluate(X_test, y_test)