# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [49]:
import io
import os
import numpy as np
import scipy

import pandas as pd
import urllib

# Keras imports
import keras
from keras.preprocessing import text
from keras.preprocessing import sequence 
from keras.preprocessing.text import Tokenizer

# For modelling
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout
from keras.utils import to_categorical

# From string manipulation
from string import punctuation
from collections import Counter

# Sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# For plotting
import matplotlib.pyplot as plt

In [2]:
PATH_TO_DATA = "/Users/.../semester_2/deep_learning/nlp_project/data"

# 1) Monolingual (English) word embeddings 

In [4]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
        self.word2id = dict.fromkeys(self.word2vec.keys())
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.embeddings = np.array(self.word2vec.values())
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors \n' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        # Dict for storing words and similarities
        self.sim_words = {}
        
        # For each element in the dict keys
        for i in list(self.word2vec.keys()):
            # Compute the similarity between the two vectors
            cos = self.score(w, i)
            
            # When the lenght of the dictionary is lower than 5.. auto update
            if len(self.sim_words) < K:
                self.sim_words[i] = cos
                
            # If the computed score is greater than the minimal best score
            elif cos > min(self.sim_words.values()):
                # Delete the key to the minimum best score word
                del self.sim_words[min(self.sim_words, 
                                          key=self.sim_words.get)] 
                # update dict
                self.sim_words[i] = cos
                
            else:
                pass
            
        return sorted(self.sim_words, key=self.sim_words.get, reverse=True)

    def score(self, w1, w2):
        # cosine similarity: np.dot  -  np.linalg.norm
        w1_vec, w2_vec = self.word2vec[w1], self.word2vec[w2]
        
        # Simple cosine calculations
        self.cos = np.dot(w1_vec, w2_vec)/(np.linalg.norm(w1_vec)*np.linalg.norm(w2_vec))
        return self.cos


In [7]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=200000)

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
    
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 200000 pretrained word vectors 

cat dog 0.6716836662792491
dog pet 0.6842064029669219
dogs cats 0.7074389328052404
paris france 0.7775108541288561
germany berlin 0.7420295235998392
['cat', 'cats', 'kitty', 'kitten', 'feline']
['dog', 'dogs', 'puppy', 'Dog', 'doggie']
['dogs', 'dog', 'pooches', 'Dogs', 'doggies']
['paris', 'france', 'Paris', 'parisian', 'london']
['germany', 'austria', 'europe', 'german', 'berlin']


In [5]:
class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
        self.idf_dict = {}
        
    def encode(self, sentences, idf=False):
        # takes a list of sentences, outputs a numpy array of sentence embeddings
        # see TP1 for help
        sentemb = []
        for sent in sentences:
            if idf is False:
                # mean of word vectors
                sent_vec = [self.w2v.word2vec[w] for w in sent if w in self.w2v.word2vec]
                
            else:
                # idf-weighted mean of word vectors
                sent_vec = [self.w2v.word2vec[w]*self.idf_dict[w] for w in sent 
                            if (w in self.w2v.word2vec) and (w in self.idf_dict)]
        
            sentemb.append(np.mean(sent_vec, axis=0))   
        
        if len(sentences) is 1:
            return sentemb[0]
        return np.vstack(sentemb)

    def most_similar(self, s, sentences, idf=False, K=5):
        # get most similar sentences and **print** them
        self.sim_sent, query = {}, self.encode([s], idf)
        
        # Normalizing the vector.
        query = query/np.linalg.norm(query, 2)
            
        for sent in sentences:
            # Rule out the case where it is the same sentence. 
            if sent != s:
                sent_vec = self.encode([sent], idf)

                # Normalizing the vector & Computing the cosine similarity.
                sent_vec = sent_vec/np.linalg.norm(sent_vec)
                cos = np.dot(sent_vec, query)/(np.linalg.norm(sent_vec)*np.linalg.norm(query))

                # When the lenght of the dictionary is lower than 5.. auto update
                if len(self.sim_sent) < K:
                    self.sim_sent[' '.join(sent)] = cos

                # If the computed score is greater than the minimal best score
                elif cos > min(self.sim_sent.values()):
                    # Delete the key to the minimum best score word
                    del self.sim_sent[min(self.sim_sent, 
                                              key=self.sim_sent.get)] 
                    # update dict
                    self.sim_sent[' '.join(sent)] = cos

                else:
                    pass
        
        similarity_dict = sorted(self.sim_sent, key=self.sim_sent.get, reverse=True)
        
        # Printing out.
        print("The {} most similar sentences to '{}' are: \n ".format(K, ' '.join(s)))
        for i in range(K):
            print('{}) {}'.format(i, similarity_dict[i]))
        print('\n')
        return None

    def score(self, s1, s2, idf=False):
        # cosine similarity: use   np.dot  and  np.linalg.norm
        s1_vec, s2_vec = self.encode([s1], idf), self.encode([s2], idf)
        
        # Need to normalize both sentence vectors
        s1_vec, s2_vec = s1_vec/np.linalg.norm(s1_vec), s2_vec/np.linalg.norm(s2_vec)
        sim = np.dot(s1_vec.T, s2_vec)
    
        print("Sentence similarity between '{}' and '{}' is {} \n".format(' '.join(s1), ' '.join(s2), sim))
        return None
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        for sent in sentences:
            for w in set(sent):
                # Default set to 0 or add 1 if present
                self.idf_dict[w] = self.idf_dict.get(w, 0) + 1
        
        # Update all values of the dict by the idf value max(1, np.log10(doc_size/term_freq))
        self.idf_dict = dict([(
            k, max(1, np.log10(len(sentences)/v))
        ) 
                    for (k, v) in self.idf_dict.items()])
        
        return self.idf_dict

In [389]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=200000)
s2v = BoV(w2v)

# Load sentences in "PATH_TO_DATA/sentences.txt"
sentences = []
with open(PATH_TO_DATA+'/sentences.txt') as f:
    for i, line in enumerate(f):
        sent = line.rstrip().split()
        sentences.append(sent)

# Build idf scores for each word
idf_dict = s2v.build_idf(sentences)

# You will be evaluated on the output of the following:
s2v.most_similar('' if not sentences else sentences[10], sentences)  # BoV-mean
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13])


s2v.most_similar('' if not sentences else sentences[10], sentences, idf=True)  # BoV-idf
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf=True)

Loaded 200000 pretrained word vectors 

The 5 most similar sentences to '1 smiling african american boy .' are: 
 
0) an african american man smiling .
1) a little african american boy and girl looking up .
2) an afican american woman standing behind two small african american children .
3) an african american man is sitting .
4) a girl in black hat holding an african american baby .


Sentence similarity between '1 man singing and 1 man playing a saxophone in a concert .' and '10 people venture out to go crosscountry skiing .' is 0.5726258859719606 

The 5 most similar sentences to '1 smiling african american boy .' are: 
 
0) an african american man smiling .
1) an african american man is sitting .
2) a little african american boy and girl looking up .
3) an afican american woman standing behind two small african american children .
4) a girl in black hat holding an african american baby .


Sentence similarity between '1 man singing and 1 man playing a saxophone in a concert .' and 

# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [4]:
# 1 - Download and load 50k first vectors of
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec

# TYPE CODE HERE
def download_to_dict(url, n_first=50000):
    re = urllib.request.urlopen(url)
    word_embed_dict = {}
    for i, l in enumerate(re):
        # Limit download size to default value
        if i > n_first: break
        elif i is 0: pass
        else:
            word, embed = l.decode('utf-8').split(' ',maxsplit=1)
            # Convert to a numpy array
            embed = np.array([float(i) for i in embed.split(' ') if i is not '\n'])
            word_embed_dict[word] = embed
    print('Finished downloading {} vectors for url: {} \n'.format(len(word_embed_dict), url))
    return word_embed_dict

eng_word_dict = download_to_dict('https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec')
french_word_dict = download_to_dict('https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec')

Finished downloading 50000 vectors for url: https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec 

Finished downloading 50000 vectors for url: https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec 



In [305]:
# 2 - Get words that appear in both vocabs (= identical character strings)
#     Use it to create the matrix X and Y (of aligned embeddings for these words)

# TYPE CODE HERE
common_keys = list(set(eng_word_dict.keys()) & set(french_word_dict.keys()))

def create_mat(word_dict, common_keys):
    vect_word_list = []
    for k in common_keys:
        vect = [k] + list(word_dict[k])
        vect_word_list.append(np.array(vect))
    # Return the vertical stacking of the list of numpy array. 
    return np.vstack(vect_word_list)

Y, X = create_mat(eng_word_dict, common_keys), create_mat(french_word_dict, common_keys)

In [306]:
# 3 - Solve the Procrustes using the scipy package and: scipy.linalg.svd() and get the optimal W
#     Now W*French_vector is in the same space as English_vector

X_words, Y_words = X[:, 0].reshape((X.shape[0], 1)), Y[:, 0].reshape((Y.shape[0], 1))
Y, X = Y[:, 1:].astype(float), X[:, 1:].astype(float)


U, sig, V_T = np.linalg.svd(np.dot(Y, X.T))
W = np.dot(U, V_T)
X_W = np.dot(W, X)

# Add the word column to X_W
X_W, Y = np.hstack((X_words, X_W)), np.hstack((Y_words, Y))

In [312]:
# 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
#     You will be evaluated on that part and the code above

# TYPE CODE HERE
def nn_words(words, X_W, Y, french_to_eng=True, n=10, verbose=True):
    # Check that we have a list. Make a dict containing all the word under consideration.
    if type(words) is str:
        words = [words]
    nn_dict = {}
    nn_dict['french_to_eng'] = french_to_eng
    
    for word in words:
        # Make key pointing to a dict for word in consideration
        nn_dict[word] = {}
        
        if french_to_eng is True:
            # Get and Normalize vector
            word_v = X_W[np.array(np.where(X_W == word))[0], 1:].astype(float)
            word_v = word_v/np.linalg.norm(word_v)
            
            for i in np.arange(Y.shape[0]):
                # Get the minimum sim value up to know when dict is not empty
                if len(nn_dict[word]) is not 0:
                    best_min_sim = min(list(nn_dict[word].keys()))
                sim = np.dot(
                word_v, Y[i, 1:].astype(float)
                )[0]
                
                
                # When dict did not fill up to user-provided limit
                if len(nn_dict[word]) < n:
                    nn_dict[word][sim] = Y[i, 0]
                    
                # If filled up, start selection
                elif sim > best_min_sim:
                    del nn_dict[word][best_min_sim]
                    nn_dict[word][sim] = Y[i, 0]
                    
                else:pass
                    
        else:
            # Get and normalize vector. Same as previous, starting matrix is simply Y instead of WX.
            word_v = Y[np.array(np.where(Y == word))[0], 1:].astype(float)
            word_v = word_v/np.linalg.norm(word_v)
            
            for i in np.arange(X_W.shape[0]):
                if len(nn_dict[word]) is not 0:
                    best_min_sim = min(list(nn_dict[word].keys()))
                sim = np.dot(
                word_v, X_W[i, 1:].astype(float)
                )[0]
                
                if len(nn_dict[word]) < n:
                    nn_dict[word][sim] = X_W[i, 0]
                    
                elif sim > best_min_sim:
                    del nn_dict[word][best_min_sim]
                    nn_dict[word][sim] = X_W[i, 0]
                else:pass
        
    if verbose is True: 
        # For printing results
        print("Mode french_to_eng is {} \n".format(french_to_eng))
        for word in words:
            print("For '{}', the {} most similar words are: ".format(word, n))
            for i, key in enumerate(sorted(nn_dict[word], reverse=True)):
                print("{}) '{}', similarity: {}".format(i, nn_dict[word][key], key))
            print('\n')
            
    # Returns dict, whose keys are considered words and values dict to the n most similar words.
    return nn_dict

print(nn_words('automobile', X_W, Y, french_to_eng=False, n=20), nn_words('maison', X_W, Y))

Mode french_to_eng is False 

For 'automobile', the 20 most similar words are: 
0) 'automobile', similarity: 3.60475435273381
1) 'automobiles', similarity: 2.7896719427395182
2) 'automotive', similarity: 2.548950634310399
3) 'auto', similarity: 2.336638025740818
4) 'motor', similarity: 2.258243193359415
5) 'peugeot', similarity: 2.1870605957286005
6) 'suv', similarity: 2.0405373875784356
7) 'car', similarity: 2.0299109279698118
8) 'chrysler', similarity: 1.9407918065894374
9) 'bugatti', similarity: 1.9138707825451826
10) 'citroën', similarity: 1.8975421240174004
11) 'chevrolet', similarity: 1.8890310202169673
12) 'motors', similarity: 1.886694080433701
13) 'roadster', similarity: 1.846398040556708
14) 'buick', similarity: 1.8323967516072264
15) 'bicycle', similarity: 1.81310302140496
16) 'volkswagen', similarity: 1.7796050990111425
17) 'maserati', similarity: 1.7795186108185654
18) 'lancia', similarity: 1.7692075618333871
19) 'cars', similarity: 1.7659284474350994


Mode french_to_eng 

If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [22]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

# TYPE CODE HERE
list_data_file = ['stsa.fine.dev', 'stsa.fine.test.X', 'stsa.fine.train']

def load_sst_data(list_data_file):
    dev_set = {'label':[], 'sentence':[]}
    test_set = {'sentence':[]}
    train_set = {'label':[], 'sentence':[]}
    for i, f in enumerate(list_data_file):
        
        with open(PATH_TO_DATA+'/SST/'+f, 'r') as data:
            for l in data:
                if i is 0: 
                    label, sentence = l.split(maxsplit=1)
                    dev_set['label'].append(label), dev_set['sentence'].append(sentence)
                elif i is 1:
                    # Test set has no label
                    test_set['sentence'].append(l)
                else:
                    label, sentence = l.split(maxsplit=1)
                    train_set['label'].append(label), train_set['sentence'].append(sentence)
    return pd.DataFrame(train_set), pd.DataFrame(dev_set), pd.DataFrame(test_set)

# Get data
df_train, df_dev, df_test = load_sst_data(list_data_file)

In [23]:
# 2 - Encode sentences with the BoV model above

# TYPE CODE HERE
s2v = BoV(w2v)

# Function for adding the bov features to the df
def add_bov_features(df, s2v, weighted=False):
    sent_embeds = []
    
    if weighted is False:
        for i in range(df.shape[0]):
            # For each row, calculate the sentence embedding. 
            embed = s2v.encode([df.loc[i, 'sentence']])
            sent_embeds.append(embed)

        embeds_df = pd.DataFrame(sent_embeds)

        # update dataframe (concatenation)
        df = pd.concat([df, embeds_df], axis=1)
        return df
    
    else:

        # Now do the BoV embedding, each weighted by the tf-idf of the word
        for i in range(df.shape[0]):
            # For each row, calculate the sentence embedding.
            embed = s2v.encode([df.loc[i, 'sentence']], idf=True)
            sent_embeds.append(embed)

        embeds_df = pd.DataFrame(sent_embeds)

        # update dataframe (concatenation)
        df = pd.concat([df, embeds_df], axis=1)
        return df        
        

def clean_s(s, translate_table):
    s = s.translate(translate_table)
    return s

translate_table = str.maketrans(dict.fromkeys(punctuation, None))

df_train['sentence'] = df_train['sentence'].apply(clean_s, translate_table=translate_table)  
df_dev['sentence'] = df_dev['sentence'].apply(clean_s, translate_table=translate_table)  
df_test['sentence'] = df_test['sentence'].apply(clean_s, translate_table=translate_table)  


# Taking the same dfs but with weighted bov.
dfs_idf_dict = s2v.build_idf(sentences_dfs)

df_dev_noweight, df_test_noweight, df_train_noweight = add_bov_features(df_dev, s2v), add_bov_features(df_test, s2v), add_bov_features(df_train, s2v)
df_dev_withweight, df_test_withweight, df_train_withweight = add_bov_features(df_dev, s2v, weighted=True), add_bov_features(df_test, s2v, weighted=True), add_bov_features(df_train, s2v, weighted=True)


# Need to rebuild the idf dict for the bov
sentences_dfs = pd.concat([df_train_withweight["sentence"], df_test_withweight["sentence"],
                           df_dev_withweight['sentence']], axis=0).tolist()
sentences_dfs = [s.strip().split() for s in sentences_dfs]




In [30]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)



# TYPE CODE HERE
log_clf = LogisticRegression()
param_grid = {'C':np.arange(0.01, 2, 0.1)}
gridcv = GridSearchCV(log_clf, param_grid)
gridcv.fit(df_dev_noweight.iloc[:, 2:], df_dev_noweight.loc[:, 'label'])

# Now fitting the best parameters
log_clf = LogisticRegression(**gridcv.best_params_)
log_fit = log_clf.fit(df_train_noweight.iloc[:, 2:], 
                      df_train_noweight.loc[:, 'label'])

accuracy_train = accuracy_score(log_fit.predict(df_train_noweight.iloc[:, 2:]), 
                                df_train_noweight.loc[:, 'label'])


preds = log_fit.predict(df_dev_noweight.iloc[:, 2:])
accuracy_dev = accuracy_score(df_dev_noweight.loc[:, 'label'], preds)
print('Accuracy on train set: {}\nAccuracy on dev. set: {}\n'.format(accuracy_train, 
                                                                   accuracy_dev))




log_clf = LogisticRegression(**gridcv.best_params_)
log_fit = log_clf.fit(df_train_withweight.iloc[:, 2:], df_train_withweight.loc[:, 'label'])
accuracy_train_weighted = accuracy_score(log_fit.predict(df_train_withweight.iloc[:, 2:]),  
                                                        df_train_withweight.loc[:, 'label'])

preds = log_fit.predict(df_dev_withweight.iloc[:, 2:])
accuracy_dev_weighted = accuracy_score(df_dev_withweight.loc[:, 'label'], preds)
                                        
print('For weighted BoV,\nAccuracy on train set: {}\nAccuracy on dev. set: {}'.format(accuracy_train_weighted, 
                                                           accuracy_dev_weighted))



Accuracy on train set: 0.30067883895131087
Accuracy on dev. set: 0.3169845594913715

For weighted BoV,
Accuracy on train set: 0.3026685393258427
Accuracy on dev. set: 0.3151680290644868


In [430]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.

test_preds = pd.DataFrame.from_dict({'preds':log_fit.predict(df_test_noweight.iloc[:, 1:])})
test_preds.to_csv(PATH_TO_DATA+'/logreg_bov_y_test_sst.txt', header=None, index=None, sep=' ', mode='a')

# TYPE CODE HERE

In [563]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

# TYPE CODE HERE
# Preprocess text into sequences
sentences_dfs = df_train['sentence'].tolist() + df_dev['sentence'].tolist() + df_test['sentence'].tolist()

def longest_sentence_voc_size(sent_list):
    # Simple function for returning the biggest sentence length and the voc size
    max_sent = 10
    for s in sent_list:
        if len(s.split()) > max_sent:
            max_sent = len(s.split())
        else: pass
        
    # Now get the voc size
    whole_text = ' '.join(sent_list)
    voc_size = len(Counter(whole_text.split()))
    
    return max_sent, voc_size

max_sent, voc_size = longest_sentence_voc_size(sentences_dfs)

def get_sentences_sequences(df_list, voc_size, max_sent):
    df_padded_sequences = []
    # Get the sentence into a properly padded sequence
    for df in df_list:
        # One hot encoding
        df['one_hot'] = df['sentence'].apply(lambda x: text.one_hot(x, n=voc_size))
        
        # Pad the sequences
        padded_sequences = sequence.pad_sequences(df['one_hot'], max_sent)
        df_padded_sequences.append(padded_sequences)
        
    return df_padded_sequences

train_seq, dev_seq, test_seq = get_sentences_sequences([df_train, 
                                                        df_dev, 
                                                        df_test], voc_size, max_sent)

n_neurons, n_classes = 100, len(np.unique(df_train.loc[:, 'label']))
embed_dim = 300

# Build the model
bonus_model = Sequential()
bonus_model.add(Embedding(voc_size, 300))
bonus_model.add(LSTM(n_neurons, return_sequences=True, 
                     dropout_U=0.5, dropout_W=0.5))
bonus_model.add(LSTM(n_neurons))

# Going to the dense part. 
bonus_model.add(Dropout(0.4))
bonus_model.add(Dense(n_neurons, activation='relu'))
bonus_model.add(Dense(n_classes, activation='softmax'))

# Compiling
bonus_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [571]:
bonus_model.fit(train_seq, target_train, epochs=5, batch_size=60, 
                validation_data=(dev_seq, target_dev))

Train on 8544 samples, validate on 1101 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x4243bbc18>

In [573]:
preds = bonus_model.predict(test_seq)
preds_labels = preds.argmax(axis=-1)
pd.DataFrame(preds_labels).to_csv(PATH_TO_DATA+'LSTM_bov_y_test_sst.txt', index=False)

# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [41]:
# 1 - Load train/dev/test sets of SST
# TYPE CODE HERE


df_train, df_dev, df_test = load_sst_data(list_data_file)


In [42]:
# 2 - Transform text to integers using keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

# TYPE CODE HERE

punctuation = punctuation + '\n'
trans_table = str.maketrans(dict.fromkeys(punctuation))

def process_sentence(sentence, trans_table):
    # For text preprocessing.
    sentence = sentence.translate(trans_table)
    sentence = sentence.strip()
    return sentence

# Use voc size after having pre process the sentence
def voc_size(sentence_list):
    voc = []
    max_sentence_length = 10
    # Simply appending a list of unique words if not in list. 
    for s in sentence_list:
        tokenized_s = s.split()
        # Keep track of the largest sentence
        if len(tokenized_s) > 10: 
            max_sentence_length = len(tokenized_s)
        for w in s.split():
            if w not in voc:
                voc.append(w)
            else:pass
    # Returns integer, size of list of unique words.
    return len(voc)
    
def return_with_padded_seq(df, maxlen):
    # returns the dataframe with the padded one-hot sequences.
    return pd.concat(
        [df, 
         pd.DataFrame(sequence.pad_sequences(df["one_hot_seq"], maxlen=maxlen))
        ], axis=1
    )

def max_seq_size(df):
    df["seq_len"] = df["one_hot_seq"].apply(lambda x:len(x))
    max_seq = max(df["seq_len"])
    del df["seq_len"]
    return max_seq

# Process train sentences, get vocab size and max sent length, encode in one hot and get padded vect.
df_train["sentence"] = df_train["sentence"].apply(process_sentence, trans_table=trans_table)
train_voc_size = voc_size(df_train["sentence"])
df_train["one_hot_seq"] = df_train["sentence"].apply(lambda x: text.one_hot(x, n=train_voc_size))

# Get the maximum lenght of sequence.
max_train_seq_size = max_seq_size(df_train)


**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [43]:
# 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/

# TYPE CODE HERE
df_train = return_with_padded_seq(df_train, maxlen=max_train_seq_size)

# Same for dev, using the train numbers. 
df_dev["sentence"] = df_dev["sentence"].apply(process_sentence, trans_table=trans_table)
df_dev["one_hot_seq"] = df_dev["sentence"].apply(lambda x: text.one_hot(x, n=train_voc_size))
df_dev = return_with_padded_seq(df_dev, maxlen=max_train_seq_size)

# Finally simply preprocess test_df
df_test["sentence"] = df_test["sentence"].apply(process_sentence, trans_table=trans_table)
df_test["one_hot_seq"] = df_test["sentence"].apply(lambda x: text.one_hot(x, n=train_voc_size))
df_test = return_with_padded_seq(df_test, maxlen=max_train_seq_size)

## 4.2 - Design and train your model

In [44]:
# 4 - Design your encoder + classifier using keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.


# ADAPT CODE BELOW


embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = train_voc_size # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(n_classes, activation='sigmoid'))




In [46]:
# 5 - Define your loss/optimizer/metrics

# MODIFY CODE BELOW

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'RMSprop' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          526432    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 551,589
Trainable params: 551,589
Non-trainable params: 0
_________________________________________________________________
None


In [47]:
# 6 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set

x_train, x_val = df_train.iloc[:, 3:], df_dev.iloc[:, 3:]
y_train, y_val = to_categorical(df_train.iloc[:, 0]), to_categorical(df_dev.iloc[:, 0])

# ADAPT CODE BELOW
bs = 64
n_epochs = 6

history = model.fit(x_train, y_train, batch_size=bs, 
                    nb_epoch=n_epochs, validation_data=(x_val, y_val))



Train on 8544 samples, validate on 1101 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [53]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
def get_label(preds):
    # Simple function to get column label for the predicted label.
    df_preds = pd.DataFrame(preds)
    df_preds['label'] = 0
    for i in range(df_preds.shape[0]):
        index = list(df_preds.loc[i, :]).index(max(list(df_preds.loc[i, :])))
        df_preds.loc[i, 'label'] = index
    return df_preds

df_preds = get_label(model.predict(df_test.iloc[:, 2:]))
df_preds['label'].to_csv(PATH_TO_DATA+"logreg_lstm_y_test_sst.txt", header=None, index=False)

## 4.3 -- innovate !

In [245]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE

# Starting the tokenization of data
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), 
               nmax=500000)
df_train, df_dev, df_test = load_sst_data(list_data_file)

# Concatenating the dfs together only for the embedding matrix
df_sentences = pd.concat([df_train, df_dev, df_test])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_sentences.loc[:, 'sentence'])
sequences = tokenizer.texts_to_sequences(df_sentences.loc[:, 'sentence'])

# Some functions for proper padding and initialize the weight matrix

def max_seq(list_seq):
    for l in list_seq:
        yield len(l)

def load_embedding_matrix(embedding_dim, tokenizer):
    # Embedding matrix is of size (voc, embed_dim).
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 
                                 embedding_dim))
    for w, v in tokenizer.word_index.items():
        try:
            word_vector = w2v.word2vec[w]
            embedding_matrix[v] = word_vector
        except:
            # Vector is initialized to zeros if not present in dict.
            continue
    return embedding_matrix

# Get the biggest sequence
max_seq_length = max(max_seq(sequences))

# Padding the sequences
padded_sequences = sequence.pad_sequences(sequences, 
                                          maxlen=max_seq_length)

# Get training and test data
x_train, y_train = padded_sequences[:df_train.shape[0]], to_categorical(df_train.loc[:, 'label'])
x_dev, y_dev = padded_sequences[df_train.shape[0]:df_train.shape[0]+df_dev.shape[0]], to_categorical(df_dev.loc[:, 'label'])
x_test = padded_sequences[df_train.shape[0]+df_dev.shape[0]:df_train.shape[0]+df_dev.shape[0]+df_test.shape[0]]

# Make the matrix for the embedding layer
embed_dim = w2v.word2vec['hey'].shape[0]
embed_mat = load_embedding_matrix(embed_dim, tokenizer)



embedding_layer = Embedding(len(tokenizer.word_index)+1, embed_dim,
                            weights=[embed_mat], 
                            trainable=False)

model = Sequential()

# Add the pretrained embedding layer
model.add(embedding_layer)
model.add(LSTM(80, dropout_W=0.2, 
               dropout_U=0.2, 
               return_sequences=True))
model.add(LSTM(80))
model.add(Dropout(0.3))

# Use softmax instead of sigmoid
model.add(Dense(n_classes, activation='softmax'))

# Compile the model
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())


In [299]:
# Fit model
model.fit(x_train, y_train, batch_size=bs, 
          epochs=10, validation_data=(x_dev, y_dev))

Train on 8544 samples, validate on 1101 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x177aafef0>

In [462]:
# Get predictions out in csv format

preds = model.predict(x_test)
preds_labels = preds.argmax(axis=-1)

preds_labels = pd.DataFrame(preds_labels)
preds_labels.to_csv(PATH_TO_DATA+'DEEP_LSTM_y_test_sst.txt', index=False)