# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [1]:
import io
import os
import numpy as np
import scipy

In [2]:
PATH_TO_DATA = "data/"

# 1) Monolingual (English) word embeddings 

In [3]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
#         self.word2id = dict.fromkeys(self.word2vec.keys())
#         self.id2word = {v: k for k, v in self.word2id.items()}
#         self.embeddings = np.array(self.word2vec.values())
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        # K most similar words: self.score  -  np.argsort
        # Brute force
        scores = []
        index_dict = {}
        for index, key in enumerate(self.word2vec.keys()):
            scores += [self.score(w, key)]
            index_dict[index] = key
        indexes = np.flip(np.argsort(scores))
        return [index_dict[indexes[i]] for i in range(K)]

    def score(self, w1, w2):
        # cosine similarity: np.dot  -  np.linalg.norm
        e1, e2 = self.word2vec[w1], self.word2vec[w2]
        return np.dot(e1, e2)/(np.linalg.norm(e1)*np.linalg.norm(e2))


In [4]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'))

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 100000 pretrained word vectors
cat dog 0.671683666279249
dog pet 0.6842064029669219
dogs cats 0.7074389328052404
paris france 0.7775108541288563
germany berlin 0.7420295235998394
['cat', 'cats', 'kitty', 'kitten', 'feline']
['dog', 'dogs', 'puppy', 'Dog', 'doggie']
['dogs', 'dog', 'pooches', 'Dogs', 'doggies']
['paris', 'france', 'Paris', 'london', 'berlin']
['germany', 'austria', 'europe', 'german', 'berlin']


In [5]:
import ipdb

class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
    
    def encode(self, sentences, idf={}, keepdims=False):
        # takes a list of sentences, outputs a numpy array of sentence embeddings
        # see TP1 for help
        sentemb = []
        N = len(sentences)
        for sent in sentences:
            list_embeds = []
            for word in sent:
                try:
                    list_embeds += [self.w2v.word2vec[word]]
                except KeyError:
                    sent.remove(word)

            embeds = np.array(list_embeds)
            
            if idf != {}:
                # idf-weighted mean of word vectors
                coeffs = np.array([max(1, np.log10(N / (idf[word]))) for word in sent])

                try:
                    average = np.average(embeds, axis=0, weights=coeffs)
                except ValueError:
                    break
                sentemb += [average]
            else:
                emb = np.mean(embeds, axis=0)
                # mean of word vectors
                if emb.shape == ():
                    emb = np.zeros(300)
                sentemb += [emb]

        return np.vstack(sentemb)

    def most_similar(self, s, sentences, idf={}, K=5):
        # get most similar sentences and **print** them
        print("Nearest neighbors of \"%s\":" % " ".join(s))
        keys = self.encode(sentences, idf)
        query = self.encode([s], idf)
        # Brute force
        scores = []
        index_dict = {}
        for index, key in enumerate(keys):
            scores += [self.score(query, key, idf=idf, encoded=True)]
            index_dict[index] = sentences[index]
        indexes = np.flip(np.argsort(scores))
        neighbors = [index_dict[indexes[i]] for i in range(K)]
        for elem_list in neighbors:
            print(" ".join(elem_list) + '\n')
        return neighbors

    def score(self, s1, s2, idf={}, encoded=False):
        if encoded == False:
            s1_, s2_ = s1, s2
            s1 = self.encode([s1], idf)
            s2 = self.encode([s2], idf)
            s2 = np.squeeze(s2, axis=0)
            print("Compute score of \"%s\" and" % " ".join(s1_), "\%s\" " % " ".join(s2_))
        # cosine similarity: use   np.dot  and  np.linalg.norm
        score = float(np.dot(s1, s2)/(np.linalg.norm(s1, 2)*np.linalg.norm(s2, 2)))
        if encoded == False: print(score)
        return score
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        idf = {}
        for sent in sentences:
            for w in set(sent):
                idf[w] = idf.get(w, 0) + 1
        return idf

In [6]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'))

# Load sentences in "PATH_TO_DATA/sentences.txt"
sentences = []
nmax = 10000
with io.open(os.path.join(PATH_TO_DATA, 'sentences.txt'), encoding='utf-8') as f:
    next(f)
    for i, line in enumerate(f):
        if line.endswith('. \n'):
            try:
                line, _ = line.split(' .', 1)
            except ValueError:
                print(line)
            line = str(line)
        words = line.split(' ')
        sentences += [words]
        if i == (nmax - 1):
            break

s2v = BoV(w2v)
# You will be evaluated on the output of the following:
s2v.most_similar('' if not sentences else sentences[10], sentences)  # BoV-mean
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13])

# Build idf scores for each word
idf = s2v.build_idf(sentences)
s2v.most_similar('' if not sentences else sentences[10], sentences, idf)  # BoV-idf
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf)

Loaded 100000 pretrained word vectors
Nearest neighbors of "1 woman in a black jacket is drinking out of a bottle while others are smiling":
1 woman in a black jacket is drinking out of a bottle while others are smiling

a black man wearing a black shirt with yellow leaves on it , is sitting and drinking a cup of coffee

a black man in a striped shirt stands eating out of a cup , holding a yellow water bottle

a blue-eyed woman with and glasses looks up at the camera while holding a hamburger with a bite out of it , as she sits on a boat with four other young people who are drinking and eating

a blond woman in a leather jacket is sitting outside with a man in a blue and red jacket , and there is a dog between them

Compute score of "1 man standing and several people sitting down waiting on a subway train" and \10 women dressed in long black dresses holding a booklet up sheet music in front of them singing in a you can see the back of 3 older gentlemen heads who appear to be the audien

0.7797889678205021

# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [7]:
# 1 - Download and load 50k first vectors of
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec

# TYPE CODE HERE
def load_vectors(fname, nmax=50000):
    word2vec = {}
    with io.open(fname, encoding='utf-8') as f:
        next(f)
        for i, line in enumerate(f):
            word, vec = line.split(' ', 1)
            word2vec[word] = np.fromstring(vec, sep=' ')
            if i == (nmax - 1):
                break
    print('Loaded %s pretrained word vectors' % (len(word2vec)))
    return word2vec

w2v_en = load_vectors(os.path.join(PATH_TO_DATA, 'wiki.en.vec'))
w2v_fr = load_vectors(os.path.join(PATH_TO_DATA, 'wiki.fr.vec'))

Loaded 50000 pretrained word vectors
Loaded 50000 pretrained word vectors


In [8]:
# 2 - Get words that appear in both vocabs (= identical character strings)
#     Use it to create the matrix X and Y (of aligned embeddings for these words)

# TYPE CODE HERE
list_en, list_fr = list(w2v_en.keys()), list(w2v_fr.keys())
common_elements = [element for element in list_en if element in list_fr]
X = np.array([w2v_fr[key] for key in common_elements])
Y = np.array([w2v_en[key] for key in common_elements])
X = X.T
Y = Y.T

In [9]:
# 3 - Solve the Procrustes using the scipy package and: scipy.linalg.svd() and get the optimal W
#     Now W*French_vector is in the same space as English_vector

# TYPE CODE HERE
import scipy.linalg 
U, s, Vh = scipy.linalg.svd(Y @ X.T)
W = U @ Vh

In [10]:
# 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
#     You will be evaluated on that part and the code above

# TYPE CODE HERE
def score(emb_1, emb_2):
    return (emb_1 / np.linalg.norm(emb_1)).dot(emb_2 / np.linalg.norm(emb_2))

def get_nn(word, src_w2v, tgt_w2v, rot, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    word_emb = src_w2v[word]
    word_emb_to_tgt = rot @ word_emb
    scores = []
    index_dict = {}
    for index, key in enumerate(tgt_w2v.keys()):
        scores += [score(word_emb_to_tgt, tgt_w2v[key])]
        index_dict[index] = key
    scores = np.array(scores)
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], index_dict[idx]))

for word in  ['ordinateur', 'table', 'cheval']:
    get_nn(word, w2v_fr, w2v_en, rot = W)
for word in  ['tomato', 'car', 'philosophy']:
    get_nn(word, w2v_en, w2v_fr, rot = W.T)

Nearest neighbors of "ordinateur":
0.6838 - computers
0.6785 - computer
0.6444 - mainframe
0.6360 - workstation
0.6221 - programmable
Nearest neighbors of "table":
0.6442 - table
0.5380 - tables
0.4071 - billiard
0.3884 - menus
0.3865 - hash
Nearest neighbors of "cheval":
0.6120 - horse
0.5969 - horses
0.5676 - cheval
0.5195 - horseman
0.5071 - dressage
Nearest neighbors of "tomato":
0.6906 - tomates
0.6880 - tomate
0.6434 - haricots
0.6255 - oignon
0.6046 - patate
Nearest neighbors of "car":
0.7470 - voiture
0.7055 - voitures
0.6378 - automobile
0.6320 - porsche
0.6193 - automobiles
Nearest neighbors of "philosophy":
0.7947 - philosophie
0.7556 - philosophy
0.7215 - philosophies
0.6678 - philosophique
0.6665 - métaphysique


If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [11]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)
import ipdb 
# TYPE CODE HERE
def load_sentences(fname, nmax=10000, label=True):
    sentences = []
    labels = []
    with io.open(fname, encoding='utf-8') as f:
        next(f)
        for i, line in enumerate(f):
            words = line.split(' ')
            if label:
                labels += [int(words.pop(0))]
            sentences += [words]
            if i == (nmax - 1):
                break

    return np.array(sentences), np.array(labels)
    
x_train, y_train = load_sentences(os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.train'))
x_dev, y_dev = load_sentences(os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.dev'))
x_test, _ = load_sentences(os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.test'), label=False)

In [12]:
# 2 - Encode sentences with the BoV model above
s2v = BoV(w2v)
# TYPE CODE HERE
x_train_emb = s2v.encode(x_train)
x_dev_emb = s2v.encode(x_dev)
x_test_emb = s2v.encode(x_test)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [13]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)

# TYPE CODE HERE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
C = 0.002
x_train_emb = scaler.fit_transform(x_train_emb)
x_dev_emb = scaler.transform(x_dev_emb)
clf_l2 = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='saga')
clf_l2.fit(x_train_emb, y_train)

print("Training score with L2 penalty: %.4f" % clf_l2.score(x_train_emb, y_train))
print("Dev score with L2 penalty: %.4f" % clf_l2.score(x_dev_emb, y_dev))

Training score with L2 penalty: 0.4726
Dev score with L2 penalty: 0.4209


In [26]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.

# TYPE CODE HERE
predictions = clf_l2.predict(x_test_emb)
np.savetxt('logreg_bov_y_test_sst.txt', predictions)

In [27]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

# TYPE CODE HERE

[3 3 3 ... 3 3 3]


# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [16]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [17]:
# 1 - Load train/dev/test sets of SST
# PATH_TO_DATA = "../../data/"

# TYPE CODE HERE
train_name = os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.train')
dev_name = os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.dev')
test_name = os.path.join(PATH_TO_DATA, 'SST', 'stsa.fine.test')
train_data = io.open(train_name, encoding='utf-8')
dev_data = io.open(dev_name, encoding='utf-8')
test_data = io.open(test_name, encoding='utf-8')

In [18]:
# 2 - Transform text to integers using keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

# TYPE CODE HERE
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical

VOCAB_SIZE = len(train_data.read())

def one_hot_encode(fname, label=True):
    sentences = []
    labels = []
    with io.open(fname, encoding='utf-8') as f:
        next(f)
        for i, line in enumerate(f):
            words = line.split(' ')
            if label:
                labels += [int(words.pop(0))]
            line = " ".join(words)
            encoded = one_hot(line, VOCAB_SIZE)
            sentences += [encoded]

    return np.array(sentences), to_categorical(np.array(labels), num_classes=5)

x_train, y_train = one_hot_encode(train_name)
x_dev, y_dev = one_hot_encode(dev_name)
x_test, _ = one_hot_encode(test_name, label=False)

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [19]:
# 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/

# TYPE CODE HERE
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LEN = max([len(x) for x in x_train])

def pad_sequence(input_sequences):
    return np.array(pad_sequences(input_sequences, maxlen=MAX_SEQUENCE_LEN, padding='pre'))

x_train = pad_sequence(x_train)
x_dev = pad_sequence(x_dev)
x_test = pad_sequence(x_test)

## 4.2 - Design and train your model

In [32]:
# 4 - Design your encoder + classifier using keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.


# ADAPT CODE BELOW


from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = VOCAB_SIZE  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(n_classes, activation='sigmoid'))



In [33]:
# 5 - Define your loss/optimizer/metrics

# MODIFY CODE BELOW

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'adam' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 32)          28887040  
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 28,912,197
Trainable params: 28,912,197
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# 6 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set

# ADAPT CODE BELOW
bs = 64
n_epochs = 6

history = model.fit(x_train, y_train, batch_size=bs, nb_epoch=n_epochs, validation_data=(x_dev, y_dev))



Train on 8543 samples, validate on 1100 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

In [None]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
score_train = model.evaluate(x_train, y_train)
score_dev = model.evaluate(x_dev, y_dev)
print("\n Training score: %.4f" % score_train[1])
print("\n Dev score: %.4f" % score_dev[1])
predictions = model.predict(x_test)
predictions = np.argmax(predictions, axis=1)
np.savetxt('logreg_lstm_y_test_sst.txt', predictions)

## 4.3 -- innovate !

In [24]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
from keras.layers import Bidirectional, Conv1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers

# Set callback functions to early stop training and save the best model so far
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

model = Sequential()
model.add(Embedding(vocab_size, 10))
model.add(Bidirectional(LSTM(nhid, dropout_W=0.3, dropout_U=0.3, kernel_regularizer=regularizers.l2(0.001))))
model.add(Dense(n_classes, activation='sigmoid'))

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'adam' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

bs = 32
n_epochs = 20

history = model.fit(x_train, y_train, batch_size=bs, nb_epoch=n_epochs, validation_data=(x_dev, y_dev), callbacks=callbacks)

model.load_weights('best_model.h5') # revert to the best model
score_train = model.evaluate(x_train, y_train)
score_dev = model.evaluate(x_dev, y_dev)
print("\n Training score: %.4f" % score_train[1])
print("\n Dev score: %.4f" % score_dev[1])

predictions = model.predict(x_test)
predictions = np.argmax(predictions, axis=1)
np.savetxt('bilstm_y_test_sst.txt', predictions)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 10)          9027200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               38400     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total params: 9,066,245
Trainable params: 9,066,245
Non-trainable params: 0
_________________________________________________________________
None




Train on 8543 samples, validate on 1100 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 Training score: 0.5205

 Dev score: 0.4064
