# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* the pdf with your answers
* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [34]:
# Python 3.6 or above is required
from collections import defaultdict
import gzip
import numpy as np
from pathlib import Path
from urllib.request import urlretrieve
from itertools import chain
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
PATH_TO_DATA = Path('data/')
# Download word vectors, might take a few minutes and about ~3GB of storage space
en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
if not en_embeddings_path.exists():
    urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz', en_embeddings_path)
fr_embeddings_path = PATH_TO_DATA / 'cc.fr.300.vec.gz'
if not fr_embeddings_path.exists():
    urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz', fr_embeddings_path)

# 1) Monolingual (English) word embeddings 

In [12]:
class Word2Vec():

    def __init__(self, filepath, vocab_size=50000):
        self.words, self.embeddings = self.load_wordvec(filepath, vocab_size)
        self.words_set = set(self.words)
        # Mappings for O(1) retrieval:
        self.word2id = {word: idx for idx, word in enumerate(self.words)}
        self.id2word = {idx: word for idx, word in enumerate(self.words)}
    
    def load_wordvec(self, filepath, vocab_size):
        assert str(filepath).endswith('.gz')
        words = []
        embeddings = []
        with gzip.open(filepath, 'rt', encoding="utf8") as f:  # Read compressed file directly
            next(f)  # Skip header
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                words.append(word)
                embeddings.append(np.fromstring(vec, sep=' '))
                if i == (vocab_size - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(words)))
        return words, np.vstack(embeddings)
    
    def encode(self, word):
        # Returns the 1D embedding of a given word
        wid = self.word2id[word]
        return self.embeddings[wid]
    
    def score(self, word1, word2):
        # Return the cosine similarity: use np.dot & np.linalg.norm
        vec1 = self.encode(word1)
        vec2 = self.encode(word2)
        norms = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        score = np.dot(vec1, vec2) / norms
        return score
    
    def most_similar(self, word, k=5):
        # Returns the k most similar words: self.score & np.argsort
        scores = np.array([self.score(word, word2) for word2 in self.words])
        top = np.argsort(scores)[::-1][:k]
        return [self.id2word[i] for i in top]

In [13]:
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)

# You will be evaluated on the output of the following:
for word1, word2 in zip(('cat', 'cat', 'cat', 'Paris', 'Paris', 'Paris', 'Paris'), ('tree', 'dog', 'pet', 'France', 'Germany', 'baguette', 'donut')):
    print(word1, word2, word2vec.score(word1, word2))
for word in ['cat', 'dog', 'dogs', 'Paris', 'Germany']:
    print(word, word2vec.most_similar(word))

Loaded 50000 pretrained word vectors
cat tree 0.26449754661654756
cat dog 0.7078641298542564
cat pet 0.6753313359976382
Paris France 0.6892958925806543
Paris Germany 0.4051242286737549
Paris baguette 0.29399958277802224
Paris donut -0.006588507552348003
cat ['cat', 'cats', 'kitty', 'kitten', 'feline']
dog ['dog', 'dogs', 'puppy', 'pup', 'canine']
dogs ['dogs', 'dog', 'cats', 'puppies', 'Dogs']
Paris ['Paris', 'France', 'Parisian', 'Marseille', 'Brussels']
Germany ['Germany', 'Austria', 'Europe', 'Berlin', 'Hamburg']


In [17]:
class BagOfWords:
    def __init__(self, word2vec):
        self.word2vec = word2vec
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        # -> idf = {word: idf_value, ...}
        def df(word, sentences_set):
            return sum([word in s for s in sentences_set])
        
        idfs = dict()
        nb_sentences = len(sentences)
        sentences_set = [set(s.split()) for s in sentences]
        words_in_sentences = set(chain(*sentences_set))
        for word in tqdm(words_in_sentences):
            idfs[word] = np.log(nb_sentences / (1 + df(word, sentences_set)))

        for word in self.word2vec.words_set.difference(words_in_sentences):
            idfs[word] = 0
        return idfs
    
    def encode(self, sentence, idf=None):
        # Takes a sentence as input, returns the sentence embedding
        if idf is None:
            # mean of word vectors
            encoded_words = [self.word2vec.encode(w) 
                             for w in sentence.split() 
                             if w in self.word2vec.words_set]
        else:
            # idf-weighted mean of word vectors
            encoded_words = [idf[w]*self.word2vec.encode(w)
                             for w in sentence.split()
                             if w in self.word2vec.words_set]
        vec = np.mean(encoded_words, axis=0)
        if len(vec) == 0:
            return np.zeros(self.word2vec.embeddings.shape[1])
        else:
            return vec

    def score(self, sentence1, sentence2, idf=None):
        # cosine similarity: use np.dot & np.linalg.norm
        vec1 = self.encode(sentence1, idf=idf)
        vec2 = self.encode(sentence2, idf=idf)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1)* np.linalg.norm(vec2))
   
    def most_similar(self, sentence, sentences, idf=None, k=5):
        # Return most similar sentences
        query = self.encode(sentence, idf)
        keys = np.vstack([self.encode(sentence, idf=idf) for sentence in sentences])
        scores = np.dot(keys, query) / (np.linalg.norm(query) * np.linalg.norm(keys, axis=1))
        top_ix = np.argsort(scores)[::-1][:k]
        return np.array(sentences)[top_ix]

In [18]:
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
sentence2vec = BagOfWords(word2vec)

# Load sentences in "PATH_TO_DATA/sentences.txt"
filepath = PATH_TO_DATA / 'sentences.txt'
with open(filepath, 'r') as f:
    sentences = [line.strip('\n') for line in f]

Loaded 50000 pretrained word vectors


In [20]:
# You will be evaluated on the output of the following:
print('\n\tAverage of word embeddings')
sentence1 = sentences[7]
sentence2 = sentences[13]
print(sentence1)
print(sentence2)
print(sentence2vec.score(sentence1, sentence2))
sentence = sentences[10]

similar_sentences = sentence2vec.most_similar(sentence, sentences)  # BagOfWords-mean
print(sentence)
for i, sentence in enumerate(similar_sentences):
    print(str(i+1) + ')', sentence)

# Build idf scores for each word
# takes approx 8 min
idf = sentence2vec.build_idf(sentences)
print('\n\tidf weighted average of word embeddings')
print(sentence1)
print(sentence2)
print(sentence2vec.score(sentence1, sentence2, idf))
similar_sentences = sentence2vec.most_similar(sentence, sentences, idf)  # BagOfWords-idf
print(sentence)
for i, sentence in enumerate(similar_sentences):
    print(str(i+1) + ')', sentence)


	Average of word embeddings
1 man singing and 1 man playing a saxophone in a concert . 
10 people venture out to go crosscountry skiing . 
0.7065220648251475
1 smiling african american boy . 
1) 1 smiling african american boy . 
2) 2 woman dancing while pointing . 
3) 5 women and 1 man are smiling for the camera . 
4) a small boy following 4 geese . 
5) 2 female babies eating chips . 


HBox(children=(FloatProgress(value=0.0, max=19976.0), HTML(value='')))



	idf weighted average of word embeddings
1 man singing and 1 man playing a saxophone in a concert . 
10 people venture out to go crosscountry skiing . 
0.6400773156424754
2 female babies eating chips . 
1) 2 female babies eating chips . 
2) 2 kids holding hands and smiling . 
3) 2 chinese people wearing traditional clothes 
4) five children , 3 boys and 2 girls playing soccer in a grass field . 
5) 3 couples in white clothes are playing piggyback . 


# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [21]:
class MultilingualWordAligner:
    
    def __init__(self, fr_word2vec, en_word2vec):
        self.fr_word2vec = fr_word2vec
        self.en_word2vec = en_word2vec
        self.aligned_fr_embeddings = self.get_aligned_fr_embeddings()
        
    def get_aligned_fr_embeddings(self):
        # 1 - Get words that appear in both vocabs (= identical character strings)
        #     Use it to create the matrix X (emb_dim, vocab_size) and Y (emb_dim, vocab_size) (of embeddings for these words)
        both_vocabs = set(self.fr_word2vec.words).intersection(self.en_word2vec.words)
        X = np.vstack([fr_word2vec.encode(w) for w in both_vocabs]).T
        Y = np.vstack([en_word2vec.encode(w) for w in both_vocabs]).T
        assert X.shape[0] == 300 and Y.shape[0] == 300
        
        # 2 - Solve the Procrustes using the numpy package and: np.linalg.svd() and get the optimal W
        #     Now self.fr_word2vec.embeddings * W.transpose() is in the same space as en_word2vec.embeddings
        u, s, vh = np.linalg.svd(np.matmul(Y, X.T))
        W = np.matmul(u, vh)
        assert W.shape == (300, 300)
        return np.matmul(self.fr_word2vec.embeddings, W.transpose())
        
    def get_closest_english_words(self, fr_word, k=3):
        # 3 - Return the top k English nearest neighbors to the input French word
        wid = self.fr_word2vec.word2id[fr_word]
        vec = self.aligned_fr_embeddings[wid]
        norms = np.linalg.norm(vec) * np.linalg.norm(self.en_word2vec.embeddings, axis=1)
        scores = np.dot(self.en_word2vec.embeddings, vec) / norms
        top = np.argsort(scores)[::-1][:k]
        return [self.en_word2vec.id2word[i] for i in top]


In [22]:
fr_word2vec = Word2Vec(fr_embeddings_path, vocab_size=50000)
en_word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
multilingual_word_aligner = MultilingualWordAligner(fr_word2vec, en_word2vec)

# You will be evaluated on the output of the following:
fr_words = ['chat', 'chien', 'voiture', 'zut']
k = 3
for fr_word in fr_words:
    print('-' * 10)
    print(f'fr: "{fr_word}"')
    en_words = multilingual_word_aligner.get_closest_english_words(fr_word, k=3)
    for en_word in en_words:
        print(f'en: "{en_word}"')

Loaded 50000 pretrained word vectors
Loaded 50000 pretrained word vectors
----------
fr: "chat"
en: "cat"
en: "kitten"
en: "kitty"
----------
fr: "chien"
en: "dog"
en: "cat"
en: "pet"
----------
fr: "voiture"
en: "car"
en: "vehicle"
en: "automobile"
----------
fr: "zut"
en: "oops"
en: "Ah"
en: "ah"


If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [23]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)
train_filepath = PATH_TO_DATA / 'SST/stsa.fine.train'
dev_filepath = PATH_TO_DATA / 'SST/stsa.fine.dev'
test_filepath = PATH_TO_DATA / 'SST/stsa.fine.test.X'

def load_traindata(filepath):
    with open(filepath, 'r') as f:
        train_lines = f.readlines()
    y = np.empty(len(train_lines))
    sentences = []
    for i, line in enumerate(train_lines):
        num, sent = line.split(" ", 1)
        y[i] = num
        sentences.append(sent.strip())
    return sentences, y

def load_testdata(filepath):
    with open(filepath, 'r') as f:
        test_lines = f.readlines()
    test_sent = []
    for i, line in enumerate(test_lines):
        test_sent.append(line.strip())
    return test_sent

train_sent, y_train = load_traindata(train_filepath)
dev_sent, y_dev = load_traindata(dev_filepath)
test_sent = load_testdata(test_filepath)

In [24]:
print(len(train_sent))

8544


In [25]:
len(dev_sent)

1101

In [26]:
len(test_sent)

2210

In [28]:
# 2 - Encode sentences with the BoV model above

word2vec = Word2Vec(en_embeddings_path)
sent2vec = BagOfWords(word2vec)

Loaded 50000 pretrained word vectors


In [29]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)
#     In the paper, the accuracy for average of word vectors is 32.7%
#     (VecAvg, table 1, https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.scorer import accuracy_scorer

lr = LogisticRegression(solver='lbfgs', multi_class='auto')
X_train = [sent2vec.encode(s) for s in train_sent]
X_dev = [sent2vec.encode(s) for s in dev_sent]
lr.fit(X_train, y_train)
accuracy_dev = accuracy_scorer(lr, X_dev, y_dev)
print("Accuracy on the dev set", accuracy_dev)

Accuracy on the dev set 0.3887375113533152




In [31]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.
train_dev_sent = train_sent + dev_sent
sent2vec = BagOfWords(word2vec)
X_tr_dev = [sent2vec.encode(s) for s in train_dev_sent]

y_tr_dev = np.hstack((y_train, y_dev))
lr.fit(X_tr_dev, y_tr_dev)
X_test = [sent2vec.encode(s) for s in test_sent]
pred = lr.predict(X_test)

pred_path = 'logreg_bov_y_test_sst.txt'
with open(pred_path, 'w') as f:
    f.writelines('\n'.join([str(int(p)) for p in pred]))



In [32]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(min_samples_split=2, 
                            min_samples_leaf=1,
                            n_estimators=100, 
                            n_jobs=-1)
grid = {
    'min_samples_split': [2, 3, 5, 7],
    'min_samples_leaf': [1, 2, 3, 4, 5], 
    'max_depth': [2, 4, 6, 8]
}

search = GridSearchCV(rf, grid, cv=3)
search.fit(X_tr_dev, y_tr_dev)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_split': [2, 3, 5, 7], 'min_samples_leaf': [1, 2, 3, 4, 5], 'max_depth': [2, 4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
pd.DataFrame(search.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,2.551896,1.918895,0.124998,6.329952e-06,2,1,2,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.316755,0.321928,0.336757,0.325143,0.008476,64,0.340386,0.342613,0.341573,0.341524,0.000910
1,1.312391,0.077594,0.135413,7.358447e-03,2,1,3,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.323593,0.322862,0.335823,0.327424,0.005944,61,0.343030,0.341835,0.339708,0.341524,0.001374
2,1.301976,0.014726,0.124997,7.377719e-06,2,1,5,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.319242,0.325350,0.328354,0.324313,0.003792,68,0.341475,0.340280,0.339241,0.340332,0.000913
3,1.296784,0.022096,0.124993,1.259237e-05,2,1,7,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.312714,0.322551,0.329287,0.321514,0.006806,78,0.343808,0.337170,0.339241,0.340073,0.002773
4,1.260322,0.014723,0.124991,1.367303e-06,2,2,2,"{'max_depth': 2, 'min_samples_leaf': 2, 'min_s...",0.315511,0.325661,0.332088,0.324417,0.006824,67,0.340853,0.342613,0.337376,0.340280,0.002176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,4.530926,1.160870,0.161447,5.155655e-02,8,4,7,"{'max_depth': 8, 'min_samples_leaf': 4, 'min_s...",0.354057,0.349300,0.362589,0.355314,0.005497,8,0.615744,0.626439,0.640547,0.627576,0.010158
76,4.520513,0.757324,0.130200,7.365864e-03,8,5,2,"{'max_depth': 8, 'min_samples_leaf': 5, 'min_s...",0.350948,0.345257,0.354497,0.350233,0.003806,22,0.592719,0.614774,0.635106,0.614200,0.017309
77,3.926802,0.109984,0.140614,6.836514e-07,8,5,3,"{'max_depth': 8, 'min_samples_leaf': 5, 'min_s...",0.352813,0.357387,0.356054,0.355417,0.001921,7,0.598320,0.614152,0.630752,0.614408,0.013242
78,3.822645,0.115048,0.124992,6.836514e-07,8,5,5,"{'max_depth': 8, 'min_samples_leaf': 5, 'min_s...",0.349083,0.350233,0.356054,0.351788,0.003051,18,0.591475,0.617574,0.631374,0.613474,0.016545


# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [36]:
import tensorflow as tf

In [37]:
# 1 - Using the same dataset, transform text to integers using tf.keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

one_hot = tf.keras.preprocessing.text.one_hot
train_sent_int = [one_hot(sent, 50_000) for sent in train_sent]
dev_sent_int = [one_hot(sent, 50_000) for sent in dev_sent]
test_sent_int = [one_hot(sent, 50_000) for sent in test_sent]

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [38]:
# 2 - Pad your sequences using tf.keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/
MAXLEN = 50
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
train_sent_pad = pad_sequences(train_sent_int, MAXLEN)
dev_sent_pad = pad_sequences(dev_sent_int, MAXLEN)
test_sent_pad = pad_sequences(test_sent_int, MAXLEN)

## 4.2 - Design and train your model

In [39]:
# 3 - Design your encoder + classifier using tensorflow.keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this container : the lookup-table, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.
#     Note that the embedding layer is initialized randomly and does not take advantage of pre-trained word embeddings.


# ADAPT CODE BELOW


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = 50000  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

In [40]:
# 4 - Define your loss/optimizer/metrics

# MODIFY CODE BELOW

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'Adam' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          1600000   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 5)                 325       
Total params: 1,625,157
Trainable params: 1,625,157
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
# 5 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set
#     Keras expects y_train and y_dev to be one-hot encodings of the labels, i.e. with shape=(n_samples, 5)
from sklearn.preprocessing import OneHotEncoder

x_train = train_sent_pad
x_dev = dev_sent_pad
enc = OneHotEncoder(n_values=5)
y_tr = enc.fit_transform(y_train.reshape(-1, 1))
y_dv = enc.transform(y_dev.reshape(-1, 1))
# ADAPT CODE BELOW
bs = 64
n_epochs = 6

history = model.fit(x_train, y_tr, batch_size=bs, nb_epoch=n_epochs, validation_data=(x_dev, y_dv))



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 8544 samples, validate on 1101 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [42]:
# 6 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

pred = model.predict_classes(test_sent_pad)

pred_path = 'logreg_lstm_y_test_sst.txt'
with open(pred_path, 'w') as f:
    f.writelines('\n'.join([str(int(p)) for p in pred]))

## 4.3 - innovate !

In [44]:
# 7 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation, Conv1D

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = 50000  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(Conv1D(, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

TypeError: __init__() missing 1 required positional argument: 'kernel_size'

In [None]:
loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'Adam' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

In [None]:
# 5 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set
#     Keras expects y_train and y_dev to be one-hot encodings of the labels, i.e. with shape=(n_samples, 5)
from sklearn.preprocessing import OneHotEncoder

x_train = train_sent_pad
x_dev = dev_sent_pad
enc = OneHotEncoder(n_values=5)
y_tr = enc.fit_transform(y_train.reshape(-1, 1))
y_dv = enc.transform(y_dev.reshape(-1, 1))
# ADAPT CODE BELOW
bs = 64
n_epochs = 6

history = model.fit(x_train, y_tr, batch_size=bs, nb_epoch=n_epochs, validation_data=(x_dev, y_dv))