#### Licenses
GloVe
Released under the Open Data Commons Public Domain Dedication and License (PDDL) v1.0, whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/

facebookresearch / fastText word embeddings
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}

License: Creative Commons Attribution-ShareAlike 3.0 (https://creativecommons.org/licenses/by-sa/3.0/)

In [1]:
# optional - plays a sound when a cell has completed
# note: for some reason this has to be executed after the keras imports

from time import time
from IPython import get_ipython
from IPython.display import Audio, display


class InvisibleAudio(Audio):
    """Audio player rendered inside a hidden container; the player removes itself once playback ends."""

    def _repr_html_(self):
        """Return the parent's HTML with an ``onended`` self-removal hook, wrapped in a hidden ``<div>``."""
        markup = super()._repr_html_()
        self_removing = markup.replace('<audio', '<audio onended="this.parentNode.removeChild(this)"')
        return '<div style="display:none">' + self_removing + '</div>'

class Beeper:
    """
    Plays a sound after a cell whose execution lasted longer than ``threshold`` seconds.

    ``pre_execute`` / ``post_execute`` are meant to be registered as IPython event callbacks.
    Extra keyword arguments are forwarded to ``InvisibleAudio`` when the sound is played.
    """

    def __init__(self, threshold, **audio_kwargs):
        self.audio = audio_kwargs
        self.threshold = threshold
        # timestamp (seconds) of the running cell's start, or None when no cell is being timed
        self.start_time = None

    def pre_execute(self):
        """Remember when the cell started, unless a start time is already recorded."""
        if self.start_time:
            return
        self.start_time = time()

    def post_execute(self):
        """Play the sound if the cell ran longer than the threshold, then clear the start time."""
        finished_at = time()
        ran_too_long = bool(self.start_time) and (finished_at - self.start_time) > self.threshold
        if ran_too_long:
            display(InvisibleAudio(**self.audio, autoplay=True))
        self.start_time = None


# Beep after any cell whose execution took more than 5 seconds.
# The `url` keyword is forwarded to InvisibleAudio (an IPython Audio subclass) when the sound is played.
beeper = Beeper(5, url='http://www.soundjay.com/button/beep-07.wav')

# Hook the beeper on the kernel's cell-execution events so every cell gets timed.
ipython = get_ipython()
ipython.events.register('pre_execute', beeper.pre_execute)
ipython.events.register('post_execute', beeper.post_execute)

In [2]:
import os
import io
import pickle
import numpy as np
from keras.utils import to_categorical

class DataManager:
    """
    Data-access helper of this notebook: pickled training data, pre-trained word embeddings
    (fastText / GloVe) and their alignment on a task vocabulary.
    """

    # Directory under which the fastText embedding files are looked up;
    # replaced by the current working directory when an instance is created.
    root_dir_ = '.'

    def __init__(self):
        self.root_dir_ = os.getcwd()

    def load_dummy_data(self):
        """
        This method makes available some dummy training data.

        The five pickles are read from the relative ``data/`` directory. A file that cannot be
        read yields ``False`` in its slot (see ``pickle_load``).

        Returns
        -------

        tuple
            ``(X, Y, vocab_mots, vocab_pdd, vocab_liaisons)``, in that order.
        """
        X = self.pickle_load("data/fr_X.pkl")
        Y = self.pickle_load("data/fr_Y.pkl")
        vocab_mots = self.pickle_load("data/fr_vocab_mots.pkl")
        vocab_pdd = self.pickle_load("data/fr_vocab_pdd.pkl")
        vocab_liaisons = self.pickle_load("data/fr_vocab_liaisons.pkl")
        return X, Y, vocab_mots, vocab_pdd, vocab_liaisons

    # some utilities for saving results
    def safe_pickle_dump(self, filename, obj):
        """
        Serializes an object to file, creating directory structure if it does not exist.

        Parameters
        ----------

        filename: str
            Destination path; a bare file name (no directory part) is written as given.

        obj: object
            Any picklable object.

        Returns
        -------

        bool
            True if the object was written, False if anything failed (the error is printed).
        """
        name = filename
        print("Saving "+name)
        try:
            directory = os.path.dirname(name)
            # A bare file name has an empty dirname, which os.makedirs rejects.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(name, "wb") as f:
                pickle.dump(obj, f)
        except Exception as e:
            # best effort: report the failure to the caller instead of raising
            print(e)
            return False

        return True

    def pickle_load(self, filename):
        """
        Deserialize an object from a file created with pickle.dump.
        Returns False if this failed.

        Only use this on trusted files: unpickling can execute arbitrary code.
        """
        name = filename
        print("Loading "+name)
        try:
            with open(name, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            print(e)
            return False

    def load_glove_embeddings(self):
        """
        Reads the GloVe vectors stored in ``Gloves/dataset/glove.6B.100d.txt``.

        Returns
        -------

        embeddings_index: dict
            Maps each word (first field of a line) to its vector, a float32 numpy array built
            from the remaining fields.
        """
        GLOVE_DIR = 'Gloves/dataset'

        embeddings_index = {}
        with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf8") as f:
            for line in f:
                values = line.split()
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
        print('Found %s word vectors.' % len(embeddings_index))
        return embeddings_index

    def load_embeddings(self, lang, type='fasttext'):
        """
        Loads an embeddings file depending on its type and language.

        Parameters
        ----------

        type: str
            Only "fasttext" is supported. The value is currently ignored: fastText embeddings
            are loaded whatever is passed.

        lang: str
            See load_fasttext_embeddings(lang)

        Returns
        -------

        dict
            See load_fasttext_embeddings(lang)

        """

        return self.load_fasttext_embeddings(lang)

    def load_fasttext_embeddings(self, lang):
        """
        Loads a fasttext embedding, chosen depending on lang parameter provided.
        File expected as root_dir_/data/embeddings/facebookresearch/wiki.{lang}.vec
        (or as root_dir_/data/embeddings/facebookresearch/wiki.{lang}.vec.pkl if already loaded once)

        If the cached pickle exists but cannot be read, the original ``.vec`` file is parsed
        again and the cache is rewritten.

        Parameters
        ----------

        lang: str
            One of 'fr', 'ja', 'en', 'nl' (or additional ones depending on embeddings present on disk).

        Returns
        -------

        data_dict: dict
            Maps each word to its vector, a list of floats.

        """
        embeddings_dir = os.path.join(self.root_dir_, 'data', 'embeddings', 'facebookresearch')
        pickle_ffname = os.path.join(embeddings_dir, "wiki.{lang}.vec.pkl".format(lang=lang))

        data_dict = None
        if os.path.isfile(pickle_ffname):
            cached = self.pickle_load(pickle_ffname)
            if cached is not False:
                data_dict = cached
                print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=pickle_ffname))
            else:
                print("Could not read cached embeddings {fname}, reading original file instead"
                      .format(fname=pickle_ffname))

        if data_dict is None:
            data_file = os.path.join(embeddings_dir, "wiki.{lang}.vec".format(lang=lang))

            data_dict = {}
            with io.open(data_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
                # first line is a header "<number of words> <dimension>"; parsed only to validate it
                _n_words, _dim = map(int, fin.readline().split())

                for line in fin:
                    tokens = line.rstrip().split(' ')
                    data_dict[tokens[0]] = list(map(float, tokens[1:]))
            print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=data_file))
            # save embeddings as pickle to improve speed next time
            self.safe_pickle_dump(pickle_ffname, data_dict)

        print("Read ", len(data_dict), " words vectors")
        return data_dict

    def get_words_to_match_for_embeddings(self, word):
        """
        Returns the spellings tried, by priority, when looking a word up in the embeddings:
        the word itself, lowercased, lowercased without dashes, then the latter with the
        text ``\\xa0`` replaced by a space.
        """
        lowered = word.lower()
        undashed = lowered.replace('-', '')
        # NOTE(review): '\\xa0' is the four-character text backslash-x-a-0, not the non-breaking
        # space character -- confirm which one was intended.
        return [word, lowered, undashed, undashed.replace('\\xa0', ' ')]

    def align_embeddings(self, vocab, embeddings):
        """
        Generates aligned embeddings from original embeddings, and a vocabulary of words.
        Words from vocabulary may not exist in original embeddings, in this case a random vector is generated.
        Words matching is done as (by priority) : exact match, then case insensitively, then with dash ('-') removed.

        Parameters
        ----------

        vocab: array
            An array containing each word in the vocabulary.

        embeddings: dict
            A dict object with words as keys and their embeddings (as a vector array) as values.
            Must not be empty.

        Returns
        -------

        aligned_embeddings: ndarray
            An array of shape (len(vocab), embedding dimension) containing:
          - for words from vocab found in embeddings, the corresponding embedding at same index as in vocab.
          - for words not found in embeddings, a random vector, at same index as in vocab.

        words_not_found: list
            An array containing indices (based on vocab) of words not found in embeddings and replaced by random
            values.

        words_matched: list
            An array of strings of words based on vocab, as they were matched in embeddings.
            For example if lowercased word from vocab was matched, then lowercase version of this word will be found
            in this table (whereas the original case sensitive word will remain as is in vocab array)

        Raises
        ------

        ValueError
            If embeddings is empty (the embedding dimension cannot be determined).

        """
        if not embeddings:
            raise ValueError("embeddings must contain at least one word vector")

        # the dimension is taken from any one vector, without copying the (possibly huge) key list
        dim_embedding = len(next(iter(embeddings.values())))
        aligned_embeddings = np.zeros((len(vocab), dim_embedding))
        words_not_found = []
        words_matched = [None] * len(vocab)
        for idx_mot, mot in enumerate(vocab):
            for word_to_match in self.get_words_to_match_for_embeddings(mot):
                if word_to_match in embeddings:
                    aligned_embeddings[idx_mot] = embeddings[word_to_match]
                    words_matched[idx_mot] = word_to_match
                    break
            else:
                # none of the candidate spellings is known: fall back to a random vector
                words_not_found.append(idx_mot)
                words_matched[idx_mot] = mot
                aligned_embeddings[idx_mot] = np.random.rand(dim_embedding)

        return aligned_embeddings, words_not_found, words_matched
    
# Notebook-wide DataManager instance; the following cells load data and embeddings through it.
dm = DataManager()

In [3]:
from keras.utils import to_categorical
# Load the pickled dummy data set and its vocabularies.
# NOTE: the names defined in this cell (X_, cats_pdds_1, cats_pdds_2, cats_dist, X__, Y__, ...)
# are reused by later cells, so the cell must run before them.
print(dm.root_dir_)
X, Y, vocab_mots, vocab_pdd, vocab_liaisons = dm.load_dummy_data()

# Quick look at the targets and at one sample, first as raw indices then decoded through the vocabularies.
print("|Y| ", len(Y))
print("|{Y}|", len(np.unique(Y)))
x = X[50025]
print("A sample : ")
# a sample holds 5 values: two indices into vocab_mots, two indices into vocab_pdd, and a fifth value
# printed as-is (one-hot encoded below as `cats_dist` -- presumably a signed distance, TODO confirm)
print(x[0], x[1], x[2], x[3], x[4], " --> ", Y[50025])
print(vocab_mots[x[0]], vocab_mots[x[1]], vocab_pdd[x[2]], vocab_pdd[x[3]], x[4], " --> ", vocab_liaisons[Y[50025]])
print("vocab size: ", len(vocab_mots))

# treat data
X = np.array(X)
pdds = X[:,2]
print(X.shape)
print(pdds)
X_ = np.array(X)
# one-hot encode the two vocab_pdd index columns
cats_pdds_1 = to_categorical(X_[:, 2], num_classes=len(np.unique(vocab_pdd)))
cats_pdds_2 = to_categorical(X_[:, 3], num_classes=len(np.unique(vocab_pdd)))
# shift column 4 by |min| so that its smallest value maps to class 0, then one-hot encode it on 15 classes
min_dist = np.abs(np.min(X_[:, 4]))
cats_dist = to_categorical(X_[:, 4] + min_dist, num_classes=15)
print(cats_pdds_1)
print(cats_pdds_2.shape)
print(len(vocab_pdd))
print(np.unique(X_[:, 4]))
print(cats_dist.shape)

# one-hot encoded targets
Y__ = to_categorical(Y)

# final input matrix: the two word-index columns followed by the three one-hot blocks
X__ = np.column_stack((X_[:, 0], X_[:, 1], cats_pdds_1, cats_pdds_2, cats_dist))
print(X__.shape)
print(X__[50025])
#dm.safe_pickle_dump(os.path.join(dm.root_dir_, 'data', 'embeddings', 'facebookresearch', 'wiki.fr.pkl'), fr_embeddings)

#print(vocab_mots[vocab_mots])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


C:\IAAA\tal-github
Loading data/fr_X.pkl
Loading data/fr_Y.pkl
Loading data/fr_vocab_mots.pkl
Loading data/fr_vocab_pdd.pkl
Loading data/fr_vocab_liaisons.pkl
|Y|  446113
|{Y}| 93
A sample : 
17764 1036 11 4 -3  -->  77
Ibn dit PROPN VERB -3  -->  RIGHT_nsubj
vocab size:  42278
(446113, 5)
[9 9 9 ... 8 8 8]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(446113, 19)
19
[-7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7]
(446113, 15)
(446113, 55)
[1.7764e+04 1.0360e+03 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00

In [4]:
# Show the one-hot feature columns (everything after the two word-index columns) and the target shape.
print(X__[:, 2:])
print(Y__.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(446113, 93)


In [5]:
# Load the French fastText vectors as a dict mapping each word to its vector.
word_vectors_fr = dm.load_embeddings('fr')
print("Loaded ", len(word_vectors_fr), " embeddings")

Loading C:\IAAA\tal-github\data\embeddings\facebookresearch\wiki.fr.vec.pkl
Embedding for fr loaded from C:\IAAA\tal-github\data\embeddings\facebookresearch\wiki.fr.vec.pkl
Read  1152449  words vectors
Loaded  1152449  embeddings


In [6]:

# Align the pre-trained vectors on the task vocabulary; words without a match get a random vector.
aligned_embeddings_fr, words_not_found_fr, words_matched_fr = dm.align_embeddings(
    vocab_mots, word_vectors_fr
)

print(
    "{found} words not found in pre-trained embeddings upon {total} from vocab".format(
        total=len(vocab_mots), found=len(words_not_found_fr)
    )
)
print()
print("embeddings: ", aligned_embeddings_fr.shape)
print()
print("words not found:")
# words_not_found_fr holds indices into vocab_mots: turn them back into words
print(list(map(vocab_mots.__getitem__, words_not_found_fr)))


4546 words not found in pre-trained embeddings upon 42278 from vocab

embeddings:  (42278, 300)

words not found:
["qu'", "L'", 'vis-à-vis', "n'", "c'", 'petit-déjeuner', "d'", ';', "l'", 'abanbonné', "aujourd'hui", 'à-pic', ':', '1915', '93', '30e', '--', '155', '39', "s'", '105,3', '5633', '2010', '17e', '«', 'sous-espèce', '1527', 'Ymbrechts', "D'", 'A887', 'A87', 'Glenmoriston', '1986', 'Montsinéry-Tonnegrande', '...', '27', '2011', 'non-lieu', 'Paydrètes', 'celle-ci', 'ki-Moon', 'non-convexe', 'U59', "C'", "jusqu'", 'mi-août', '1984', '1983', '10 000', '48', '1962', 'États-Unis', 'Nouveau-Brunswick', 'Nouvelle-Angleterre', '1972', '846', '临济', 'Línjì', 'Yìxuán', '义', '玄', '869', '1991', '55', '1974', '1975', 'lui-même', '2', '1', 'Hadadah', '6', '10', '500', '1964', '1970', '1976', '1982', '1988', '1994', '2000', '2006', '69', '31', '50', '1936', '1987', '25', '322', '1934', 'mi-lourds', '1937', '1676', 'Crochais', '13', '81', '2004', '90', 'Gdeim', 'recludere', 'reclaudare', 'inc

In [7]:
# Check whether 'contre-feux' exists as such in the pre-trained vectors, then list every
# pre-trained word starting with 'contre' to see which spellings are available.
print(vocab_mots.index('contre-feux'))
print('contre-feux' in word_vectors_fr)
res = [w for w in word_vectors_fr if w.startswith('contre')]

print(sorted(res))

41239
False
['contre', 'contre,', 'contre/', 'contre/neutres', 'contre/pour', 'contre_plongeant_lethwei', 'contrebalance', 'contrebalancement', 'contrebalancent', 'contrebalancer', 'contrebalancera', 'contrebalancerait', 'contrebalancé', 'contrebalancée', 'contrebalancées', 'contrebalancés', 'contrebalançaient', 'contrebalançait', 'contrebalançant', 'contreband', 'contrebande', 'contrebandes', 'contrebandier', 'contrebandiers', 'contrebandiers\xa0»', 'contrebandière', 'contrebandières', 'contrebas', 'contrebasse', 'contrebasse,', 'contrebasses', 'contrebassine', 'contrebassiste', 'contrebassistes', 'contrebassite', 'contrebasson', 'contrebassons', 'contrebatterie', 'contrebattre', 'contrebia', 'contrebombarde', 'contrebutant', 'contrebutement', 'contrebutements', 'contrebutent', 'contrebuter', 'contrebuté', 'contrebutée', 'contrebutées', 'contrebutés', 'contrecarra', 'contrecarraient', 'contrecarrait', 'contrecarrant', 'contrecarre', 'contrecarrent', 'contrecarrer', 'contrecarrèrent', 

In [8]:

# Free some memory: the original embeddings should no longer be needed now that they have been
# aligned on the vocabulary. Cells reading `word_vectors_fr` cannot be re-run after this one
# without reloading the embeddings.
del word_vectors_fr


In [13]:
import os
import pickle
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Concatenate, Embedding, concatenate, Flatten, Dropout
from keras.engine.input_layer import Input
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

class RNNManager:
    """
    Registry of TAL models (a language, a feature set and its vocabularies) and of the Keras
    networks built for them.

    Attributes
    ----------

    path_: str
        Directory where models and networks are saved.

    models_: dict
        Registered model descriptions, by model name. Owned by each instance.

    networks_: dict
        Compiled Keras networks, by network name. Owned by each instance.

    dm_: DataManager
        Helper used to pickle / unpickle model descriptions.
    """

    # number of one-hot slots used for the distance feature (dist is restricted to [-7 ... 7])
    NB_DIST_CLASSES_ = 15
    # suffixes of the files written by save_model / save_network
    MODEL_SUFFIX_ = '-model.pkl'
    NETWORK_SUFFIX_ = '-net.h5'

    path_ = '.'

    def __init__(self, path='.'):
        """
        Parameters
        ----------

        path: str
            Directory where models and networks are saved; created if it does not exist.
        """
        if path != '.':
            # create the directory itself (not only its parent) so that saving into it works
            os.makedirs(path, exist_ok=True)
        self.path_ = path
        self.dm_ = DataManager()
        # per-instance registries: mutable class-level dicts would be shared by all managers
        self.models_ = {}
        self.networks_ = {}

    def get_model(self, model_name):
        """
        Returns an existing TAL model, or None.

        Parameters
        ----------

        model_name: str
            Name of the model.
        """
        if model_name not in self.models_:
            print("Model {m} not found".format(m=model_name))
            return None
        return self.models_[model_name]

    def create_model(self, model_name, lang, featureset, vocabs):
        """
        Creates a new TAL model.

        The model is registered only if no model with this name exists yet; the newly built
        description is returned in both cases.

        Parameters
        ----------

        model_name: str
            Name of this model.

        lang: str
            'fr', 'ja', 'nl', 'en'

        featureset: str
            'f1', 'f2' or 'f3'

        vocabs: dict
            Vocabs for learning task, with keys 'WORDS', 'POS', 'MORPHO' and/or 'LEMMA' (or 'DEPS' for targets).

        Returns
        -------

        model: dict
            With keys 'name', 'lang', 'featureset' and 'vocabs'.

        """
        model = {'name': model_name,
                 'lang': lang,
                 'featureset': featureset,
                 'vocabs': vocabs}
        if model_name not in self.models_:
            self.models_[model_name] = model

        return model

    def remove_model(self, model_name):
        """
        Unregisters a model and returns its description, or None if it was not registered.
        """
        return self.models_.pop(model_name, None)

    def save_model(self, model_name):
        """
        Pickles a registered model description to ``<path_>/<model_name>-model.pkl``.
        Prints a message if the model is not registered.
        """
        if model_name in self.models_:
            self.dm_.safe_pickle_dump(os.path.join(self.path_, model_name + self.MODEL_SUFFIX_),
                                      self.models_[model_name])
        else:
            print("Model " + model_name + " not found")

    def load_model(self, model_fname):
        """
        Loads a model description written by save_model and registers it.

        Parameters
        ----------

        model_fname: str
            Either the model name given to save_model, or the resulting file name
            (``<model_name>-model.pkl``), relative to ``path_``.

        Returns
        -------

        model: dict, or None
            The model description, or None if the file could not be read.
        """
        suffix = self.MODEL_SUFFIX_
        fname = model_fname if model_fname.endswith(suffix) else model_fname + suffix
        model = self.dm_.pickle_load(os.path.join(self.path_, fname))
        if not model:
            return None
        return self.create_model(model_name=model['name'], lang=model['lang'],
                                 featureset=model['featureset'], vocabs=model['vocabs'])

    def _features_dim(self, featureset, vocabs):
        """Returns the width of the one-hot feature vector expected for a feature set."""
        nb_pos = len(vocabs['POS'])
        if featureset == 'f1':
            # S.0.POS, B.0.POS, DIST
            return nb_pos * 2 + self.NB_DIST_CLASSES_
        # 'f2': S.0.POS, S.0.LEMMA, S.0.MORPHO, S.-1.POS,
        #       B.0.POS, B.0.LEMMA, B.0.MORPHO, B.-1.POS, B.1.POS, DIST
        # any other feature set has the same size as 'f2'
        return nb_pos * 4 + len(vocabs['LEMMA']) * 2 + len(vocabs['MORPHO']) * 2 + self.NB_DIST_CLASSES_

    @staticmethod
    def _embedded_word_input(embeddings, name):
        """Builds one word-index input and its flattened embedding, initialised with `embeddings`."""
        word_input = Input(shape=(1,), dtype='int32', name=name)
        embedded = Embedding(input_dim=embeddings.shape[0],
                             output_dim=embeddings.shape[1],
                             weights=[embeddings],
                             input_length=1)(word_input)
        return word_input, Flatten()(embedded)

    def create_network(self, network_name, model_name, nb_classes, embeddings=None, dropout=False):
        """
        Builds, compiles and registers a feed-forward classifier for a registered model:
        two hidden Dense(128) + relu layers followed by a softmax over nb_classes.

        Parameters
        ----------

        network_name: str
            Name under which the network is registered in networks_.

        model_name: str
            Name of a registered model; its feature set and vocabs define the input size.

        nb_classes: int
            Size of the softmax output.

        embeddings: None, or ndarray
            If embeddings is None, then no embedding layer is added at input of the network.
            If an array is passed, it is considered the weights of embedding layer; its first
            dimension must equal the size of the model's 'WORDS' vocab when that vocab is given.

        dropout: bool
            If True, a Dropout(0.15) follows each hidden layer.

        Returns
        -------

        net_model: keras Model, or None
            None if the model is unknown or if vocab and embeddings sizes differ.

        """
        if model_name not in self.models_:
            print("Model {m} not found".format(m=model_name))
            return None

        model = self.models_[model_name]
        featureset = model['featureset']
        vocabs = model['vocabs']

        if embeddings is not None:
            # check against the model's own vocabulary rather than a notebook global
            words = vocabs.get('WORDS')
            if words is not None and len(words) != embeddings.shape[0]:
                print("Vocab size {v} must equal embeddings length {e}".format(v=len(words), e=embeddings.shape[0]))
                return None

            # one embedding layer per input word
            # note: we could also use input_length=2 then share embeddings for both input words...
            input_word1, embeddings_1 = self._embedded_word_input(embeddings, 'main_input_1')
            input_word2, embeddings_2 = self._embedded_word_input(embeddings, 'main_input_2')

        dim_features = self._features_dim(featureset, vocabs)
        print("  expecting {n} features dim".format(n=dim_features))

        # define input for features
        features_input = Input(shape=(dim_features,))

        if embeddings is not None:
            # concatenate features and embeddings
            hidden = concatenate([embeddings_1, embeddings_2, features_input])
        else:
            hidden = features_input

        for _ in range(2):
            hidden = Dense(128)(hidden)
            hidden = Activation('relu')(hidden)
            if dropout:
                hidden = Dropout(0.15)(hidden)

        out = Activation('softmax')(Dense(nb_classes)(hidden))

        if embeddings is not None:
            net_model = Model([input_word1, input_word2, features_input], out)
        else:
            net_model = Model(features_input, out)

        # not sure where to compile the model ...
        net_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        self.networks_[network_name] = net_model
        return net_model

    def save_network(self, network_name):
        """
        Saves a registered network to ``<path_>/<network_name>-net.h5``.
        Prints a message if the network is not registered.
        """
        if network_name in self.networks_:
            self.networks_[network_name].save(os.path.join(self.path_, network_name + self.NETWORK_SUFFIX_))
        else:
            print("Net " + network_name + " not found")

    def preprocess_data(self, model_name, X, Y, split=False):
        """
        Turns raw samples into network inputs, for the 'f1' feature set.

        Parameters
        ----------

        model_name: str
            Name of a registered model.

        X: array-like, 2 dimensions
            One row per sample: columns 0 and 1 are kept as-is, columns 2 and 3 are one-hot
            encoded on the size of the 'POS' vocab, column 4 is shifted by |min| and one-hot
            encoded on 15 classes.

        Y: array-like
            Target class indices, one-hot encoded.

        split: bool
            If True, the data are split with sklearn's train_test_split.

        Returns
        -------

        (X, Y), or (x_train, x_test, y_train, y_test) if split is True; None if the model is
        unknown.

        Raises
        ------

        NotImplementedError
            If the model's feature set is not 'f1'.
        """
        if model_name not in self.models_:
            print("Model {m} not found".format(m=model_name))
            return None
        model = self.models_[model_name]
        featureset = model['featureset']
        vocabs = model['vocabs']

        if featureset != 'f1':
            raise NotImplementedError(
                "preprocess_data only supports featureset 'f1', got {f!r}".format(f=featureset))

        # treat data
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        nb_pos = len(np.unique(vocabs['POS']))
        cats_pos1 = to_categorical(X[:, 2], num_classes=nb_pos)
        cats_pos2 = to_categorical(X[:, 3], num_classes=nb_pos)
        # NOTE(review): the shift is derived from the data passed in, so it equals 7 only when
        # these data contain the minimal distance -7 -- confirm a fixed shift is not preferable.
        min_dist = np.abs(np.min(X[:, 4]))
        cats_dist = to_categorical(X[:, 4] + min_dist, num_classes=self.NB_DIST_CLASSES_)
        features = np.column_stack((X[:, 0], X[:, 1], cats_pos1, cats_pos2, cats_dist))

        targets = to_categorical(Y)

        if split:
            x_train, x_test, y_train, y_test = train_test_split(features, targets)
            return x_train, x_test, y_train, y_test

        return features, targets
    

In [10]:
# Vocabularies of the learning task: words, POS tags, and dependency labels for the targets.
vocabs = {"WORDS" : vocab_mots, "POS": vocab_pdd, "DEPS": vocab_liaisons}
# one output unit per column of the one-hot encoded targets
nb_classes = Y__.shape[1]


# testing the RNNManager class: register a model, build a network for it, save the model description
rnn = RNNManager()
model = rnn.create_model("test", "fr", "f1", vocabs)
net = rnn.create_network("nn-test-1", "test", nb_classes=nb_classes, embeddings=aligned_embeddings_fr, dropout=True)

rnn.save_model("test")
print(rnn.networks_)

  expecting 53 features dim
Saving .\test-model.pkl
{'nn-test-1': <keras.engine.training.Model object at 0x000001C84A0EA3C8>}


In [12]:
# Train on the two word-index columns plus the block of one-hot features.
history = net.fit(
    [X__[:, 0], X__[:, 1], X__[:, 2:]],
    Y__,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
