#### Licenses
GloVe
Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/

Facebook Research / fastText word embeddings
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}

License: Creative Commons Attribution-ShareAlike 3.0 (https://creativecommons.org/licenses/by-sa/3.0/)

In [2]:
# Optional: plays a sound when a cell finishes executing.
# Note: for some reason this should be executed after the keras imports.

from time import time
from IPython import get_ipython
from IPython.display import Audio, display


class InvisibleAudio(Audio):
    """Audio display object whose HTML is hidden and removes itself after playing."""

    def _repr_html_(self):
        """Return the parent's markup, wrapped in a hidden div.

        The ``<audio`` tag gets an ``onended`` handler that detaches the
        element from its parent once playback has finished.
        """
        markup = super()._repr_html_()
        self_removing_tag = '<audio onended="this.parentNode.removeChild(this)"'
        hidden_markup = markup.replace('<audio', self_removing_tag)
        return '<div style="display:none">' + hidden_markup + '</div>'

class Beeper:
    """Plays a sound when the time between two hooks exceeds a threshold.

    ``pre_execute`` records a start time and ``post_execute`` displays an
    autoplaying ``InvisibleAudio`` if more than ``threshold`` seconds elapsed.
    """

    def __init__(self, threshold, **audio_kwargs):
        """Store the threshold (seconds) and the keyword arguments for the audio object."""
        self.audio = audio_kwargs
        self.threshold = threshold
        # Timestamp in seconds of the pending execution, or None when idle.
        self.start_time = None

    def pre_execute(self):
        """Remember when execution started, unless a start time is already pending."""
        if self.start_time:
            return
        self.start_time = time()

    def post_execute(self):
        """Beep if the pending execution took longer than the threshold, then reset."""
        finished_at = time()
        started_at = self.start_time
        if started_at and finished_at - started_at > self.threshold:
            display(InvisibleAudio(**self.audio, autoplay=True))
        self.start_time = None


# Beeper with a threshold of 5 (compared against elapsed seconds) and the
# sound to play.
beeper = Beeper(5, url='http://www.soundjay.com/button/beep-07.wav')

# Register the beeper's callbacks on IPython's 'pre_execute' and
# 'post_execute' events.
ipython = get_ipython()
ipython.events.register('pre_execute', beeper.pre_execute)
ipython.events.register('post_execute', beeper.post_execute)

In [32]:
import os
import io
import pickle
import numpy as np

class DataManager:
    """Loads pickled training data and pre-trained word embeddings.

    fastText embedding paths are resolved against ``root_dir_``, which is the
    current working directory at construction time.
    """

    root_dir_ = '.'

    def __init__(self):
        self.root_dir_ = os.getcwd()

    def load_dummy_data(self):
        """
        This method makes available some dummy training data.

        The five pickles are read from ``data/`` relative to the current
        working directory. Each item is whatever ``pickle_load`` returns,
        i.e. ``False`` for a file that could not be read.

        Returns
        -------

        tuple
            ``(X, Y, vocab_mots, vocab_pdd, vocab_liaisons)``
        """
        X = self.pickle_load("data/fr_X.pkl")
        Y = self.pickle_load("data/fr_Y.pkl")
        vocab_mots = self.pickle_load("data/fr_vocab_mots.pkl")
        vocab_pdd = self.pickle_load("data/fr_vocab_pdd.pkl")
        vocab_liaisons = self.pickle_load("data/fr_vocab_liaisons.pkl")
        return X, Y, vocab_mots, vocab_pdd, vocab_liaisons

    # some utilities for saving results
    def safe_pickle_dump(self, filename, obj):
        """
        Serializes an object to file, creating directory structure if it does not exist.

        Returns
        -------

        bool
            True when the object was written, False (after printing the error) otherwise.
        """
        name = filename
        print("Saving "+name)
        try:
            parent = os.path.dirname(name)
            # os.makedirs('') raises, so only create directories when the
            # filename actually has a directory component.
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(name, "wb") as f:
                pickle.dump(obj, f)
        except Exception as e:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(e)
            return False

        return True

    def pickle_load(self, filename):
        """
        Deserialize an object from a file created with pickle.dump.
        Returns False if this failed.

        Note that a failure is indistinguishable from a pickled ``False``.
        SECURITY: unpickling can execute arbitrary code; only load files that
        were produced locally (e.g. by ``safe_pickle_dump``).
        """
        name = filename
        print("Loading "+name)
        try:
            with open(name, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            print(e)
            return False

    def load_glove_embeddings(self, glove_dir='Gloves/dataset', filename='glove.6B.100d.txt'):
        """
        Loads GloVe vectors from a text file with one "word v1 v2 ..." entry per line.

        Parameters
        ----------

        glove_dir: str
            Directory containing the GloVe file.

        filename: str
            Name of the GloVe text file inside glove_dir.

        Returns
        -------

        embeddings_index: dict
            Maps each word to its vector as a float32 numpy array.
        """
        embeddings_index = {}
        with open(os.path.join(glove_dir, filename), encoding="utf8") as f:
            for line in f:
                values = line.split()
                if not values:
                    # blank line: nothing to index
                    continue
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
        print('Found %s word vectors.' % len(embeddings_index))
        return embeddings_index

    def load_embeddings(self, lang, type='fasttext'):
        """
        Loads an embeddings file depending on its type and language.

        Parameters
        ----------

        type: str
            Only "fasttext" is supported.

        lang: str
            See load_fasttext_embeddings(lang)

        Raises
        ------

        ValueError
            If type is anything other than "fasttext".
        """
        if type != 'fasttext':
            raise ValueError("Unsupported embeddings type: {t!r} (only 'fasttext' is supported)".format(t=type))
        return self.load_fasttext_embeddings(lang)

    def load_fasttext_embeddings(self, lang):
        """
        Loads a fasttext embedding, chosen depending on lang parameter provided.
        File expected as root_dir_/data/embeddings/facebookresearch/wiki.{lang}.vec
        (or as root_dir_/data/embeddings/facebookresearch/wiki.{lang}.vec.pkl if already loaded once)

        If the pickled cache exists but cannot be read, the text file is
        parsed again and the cache is rewritten.

        Parameters
        ----------

        lang: str
            One of 'fr', 'ja', 'en', 'nl' (or additional ones depending on embeddings present on disk).

        Returns
        -------

        data_dict: dict
            Maps each word to its vector as a list of floats.
        """
        embeddings_dir = os.path.join(self.root_dir_, 'data', 'embeddings', 'facebookresearch')
        data_file = os.path.join(embeddings_dir, "wiki.{lang}.vec".format(lang=lang))
        pickle_ffname = os.path.join(embeddings_dir, "wiki.{lang}.vec.pkl".format(lang=lang))

        data_dict = None
        if os.path.isfile(pickle_ffname):
            cached = self.pickle_load(pickle_ffname)
            # pickle_load returns False on failure; only accept a real dict
            if isinstance(cached, dict):
                data_dict = cached
                print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=pickle_ffname))

        if data_dict is None:
            data_dict = self._read_fasttext_vec(data_file)
            print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=data_file))
            # cache the parsed embeddings to improve speed next time
            self.safe_pickle_dump(pickle_ffname, data_dict)

        print("Read ", len(data_dict), " words vectors")
        return data_dict

    @staticmethod
    def _read_fasttext_vec(path):
        """Parse a fastText .vec text file into a dict of word -> list of floats."""
        vectors = {}
        with io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            # The first line is a "<count> <dimension>" header; parsing it
            # checks the file format, the values themselves are not needed.
            _count, _dim = map(int, fin.readline().split())
            for line in fin:
                tokens = line.rstrip().split(' ')
                vectors[tokens[0]] = list(map(float, tokens[1:]))
        return vectors

    def align_embeddings(self, vocab, embeddings, dim=None):
        """
        Generates aligned embeddings from original embeddings, and a vocabulary of words.
        Words from vocabulary may not exist in original embeddings, in this case a random vector is generated.
        Words are looked up lower-cased, then lower-cased with hyphens removed.

        Parameters
        ----------

        vocab: array
            An array containing each word in the vocabulary.

        embeddings: dict
            A dict object with words as keys and their embeddings (as a vector array) as values.

        dim: int, optional
            Size of each embedding vector. When omitted it is taken from the
            first entry of embeddings (300 if embeddings is empty).

        Returns
        -------

        aligned_embeddings: ndarray of shape (len(vocab), dim)
            An array containing:
          - for words from vocab found in embeddings, the corresponding embedding at same index as in vocab.
          - for words not found in embeddings, a random vector, at same index as in vocab.

        words_not_found: list
            An array containing indices (based on vocab) of words not found in embeddings and replaced by random
            values.

        """
        if dim is None:
            dim = len(next(iter(embeddings.values()))) if embeddings else 300

        aligned_embeddings = np.zeros((len(vocab), dim))
        words_not_found = []
        for idx_mot, mot in enumerate(vocab):
            # TODO: check whether matching should ignore the case of the word.
            key = mot.lower()
            key_no_hyphen = key.replace('-', '')
            if key in embeddings:
                aligned_embeddings[idx_mot] = embeddings[key]
            elif key_no_hyphen in embeddings:
                aligned_embeddings[idx_mot] = embeddings[key_no_hyphen]
            else:
                words_not_found.append(idx_mot)
                aligned_embeddings[idx_mot] = np.random.rand(dim)

        return aligned_embeddings, words_not_found
    
# Module-level DataManager instance; its root_dir_ is the current working directory.
dm = DataManager()

In [None]:
# Show the data root, then load the pickled dummy dataset and its vocabularies.
print(dm.root_dir_)
X, Y, vocab_mots, vocab_pdd, vocab_liaisons = dm.load_dummy_data()

# Peek at the first sample and label, the number of labels, and the number
# of distinct label values.
print(X[0])
print(Y[0])
print("|Y| ", len(Y))
print("|{Y}|", len(np.unique(Y)))
# First entry of each vocabulary.
print(vocab_mots[0])
print(vocab_pdd[0])
print(vocab_liaisons[0])
# Decode one sample: fields 0-1 index vocab_mots, fields 2-3 index vocab_pdd,
# field 4 indexes vocab_liaisons, and its label in Y indexes vocab_liaisons.
x = X[50025]
print(vocab_mots[x[0]], vocab_mots[x[1]], vocab_pdd[x[2]], vocab_pdd[x[3]], vocab_liaisons[x[4]], " --> ", vocab_liaisons[Y[50025]])


#dm.safe_pickle_dump(os.path.join(dm.root_dir_, 'data', 'embeddings', 'facebookresearch', 'wiki.fr.pkl'), fr_embeddings)

#print(vocab_mots[vocab_mots])

C:\IAAA\tal-github
Loading data/fr_X.pkl


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\jerem\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jerem\AppData\Local\Continuum\anaconda3\lib\genericpath.py", line 19, in exists
    os.stat(path)
FileNotFoundError: [WinError 3] Le chemin d’accès spécifié est introuvable: 'zmq/backend/cython/checkrc.pxd'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jerem\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "C:\Users\jerem\AppData\Local\Continuum\anaconda3\lib\site-packa

KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
    PyErr_CheckSignals()
KeyboardInterrupt


Loading data/fr_Y.pkl
Loading data/fr_vocab_mots.pkl
Loading data/fr_vocab_pdd.pkl
Loading data/fr_vocab_liaisons.pkl
[73, 109, 9, 9, -1]
47
|Y|  446113
|{Y}| 93
Les
DET
LEFT_acl
Ibn dit PROPN VERB RIGHT_vocative  -->  RIGHT_nsubj
Loading C:\IAAA\tal-github\data\embeddings\facebookresearch\wiki.fr.vec.pkl


In [None]:
# Load the 'fr' fastText embeddings as a dict mapping each word to its vector.
word_vectors_fr = dm.load_embeddings('fr')

In [36]:
# Align the pre-trained vectors with vocab_mots; words missing from the
# embeddings get a random vector and their vocab indices are returned.
aligned_embeddings_fr, words_not_found_fr = dm.align_embeddings(vocab_mots, word_vectors_fr)

# NOTE: the `found` format key below carries the number of words NOT found.
print("{found} words not found in pre-trained embeddings upon {total}".format(found=len(words_not_found_fr), 
                                                                              total=len(vocab_mots)))
print()
# List the vocabulary words that had no pre-trained vector.
print([vocab_mots[w] for w in words_not_found_fr])

4546 words not found in pre-trained embeddings upon 42278

["qu'", "L'", 'vis-à-vis', "n'", "c'", 'petit-déjeuner', "d'", ';', "l'", 'abanbonné', "aujourd'hui", 'à-pic', ':', '1915', '93', '30e', '--', '155', '39', "s'", '105,3', '5633', '2010', '17e', '«', 'sous-espèce', '1527', 'Ymbrechts', "D'", 'A887', 'A87', 'Glenmoriston', '1986', 'Montsinéry-Tonnegrande', '...', '27', '2011', 'non-lieu', 'Paydrètes', 'celle-ci', 'ki-Moon', 'non-convexe', 'U59', "C'", "jusqu'", 'mi-août', '1984', '1983', '10 000', '48', '1962', 'États-Unis', 'Nouveau-Brunswick', 'Nouvelle-Angleterre', '1972', '846', '临济', 'Línjì', 'Yìxuán', '义', '玄', '869', '1991', '55', '1974', '1975', 'lui-même', '2', '1', 'Hadadah', '6', '10', '500', '1964', '1970', '1976', '1982', '1988', '1994', '2000', '2006', '69', '31', '50', '1936', '1987', '25', '322', '1934', 'mi-lourds', '1937', '1676', 'Crochais', '13', '81', '2004', '90', 'Gdeim', 'recludere', 'reclaudare', 'includare', 'retrudere', 'recloore', 'reclore', '1990', '2

In [26]:
# Check one hyphenated vocabulary word: its index in vocab_mots and whether
# the exact spelling exists among the embedding keys.
print(vocab_mots.index('contre-feux'))
print('contre-feux' in word_vectors_fr.keys())
# Collect every embedding key that starts with 'contre' to see which
# spellings are available.
res = []
for w in word_vectors_fr.keys():
    if w.startswith('contre'):
        res.append(w)

print(sorted(res))

41239
False
['contre', 'contre,', 'contre/', 'contre/neutres', 'contre/pour', 'contre_plongeant_lethwei', 'contrebalance', 'contrebalancement', 'contrebalancent', 'contrebalancer', 'contrebalancera', 'contrebalancerait', 'contrebalancé', 'contrebalancée', 'contrebalancées', 'contrebalancés', 'contrebalançaient', 'contrebalançait', 'contrebalançant', 'contreband', 'contrebande', 'contrebandes', 'contrebandier', 'contrebandiers', 'contrebandiers\xa0»', 'contrebandière', 'contrebandières', 'contrebas', 'contrebasse', 'contrebasse,', 'contrebasses', 'contrebassine', 'contrebassiste', 'contrebassistes', 'contrebassite', 'contrebasson', 'contrebassons', 'contrebatterie', 'contrebattre', 'contrebia', 'contrebombarde', 'contrebutant', 'contrebutement', 'contrebutements', 'contrebutent', 'contrebuter', 'contrebuté', 'contrebutée', 'contrebutées', 'contrebutés', 'contrecarra', 'contrecarraient', 'contrecarrait', 'contrecarrant', 'contrecarre', 'contrecarrent', 'contrecarrer', 'contrecarrèrent', 

In [127]:

# Free memory: the original embeddings dict should no longer be needed here.
del word_vectors_fr


NameError: name 'words_not_found_fr' is not defined

In [13]:
import os
import pickle
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Concatenate, Embedding


class RNNManager:
    """Keeps named model descriptions and the Keras networks built from them.

    A "model" is only a description (language, feature set, vocabularies)
    stored by ``create_model``. A "network" is the Keras object built by
    ``create_network`` for one of those descriptions.
    """

    # Class-level defaults. __init__ gives every instance its own registries
    # so that two managers never share (and overwrite) each other's entries.
    models_ = {}
    networks_ = {}
    path_ = '.'

    def __init__(self, path='.'):
        """
        Parameters
        ----------

        path: str
            Directory in which networks are saved; created if it does not exist.
        """
        if path != '.':
            # `path` is itself the target directory, so create it rather
            # than only its parent.
            os.makedirs(path, exist_ok=True)
        self.path_ = path
        self.models_ = {}
        self.networks_ = {}
        self.dm_ = DataManager()

    def create_model(self, model_name, lang, featureset, vocabs):
        """
        Creates a new TAL model.

        An existing model registered under the same name is left untouched.

        Parameters
        ----------

        model_name: str
            Name of this model.

        lang: str
            'fr', 'ja', 'nl', 'en'

        featureset: str
            'f1', 'f2' or 'f3'

        vocabs: dict
            Vocabs for learning task, with keys 'mots', 'pdds', 'liaisons'

        Returns
        -------

        dict
            The description registered under model_name (the existing one
            if the name was already taken).
        """
        model = {'lang': lang,
                 'featureset': featureset,
                 'vocabs': vocabs}
        return self.models_.setdefault(model_name, model)

    def remove_model(self, model_name):
        """Unregister model_name and return its description, or None if unknown."""
        return self.models_.pop(model_name, None)

    def create_network(self, network_name, model_name, input_dim, nb_classes, embeddings=None, dropout=False):
        """
        Builds a dense softmax classifier for a registered model and stores it
        under network_name.

        Parameters
        ----------

        network_name: str
            Key under which the network is stored in networks_.

        model_name: str
            Name of a model previously registered with create_model.

        input_dim: int
            Number of input features of the first dense layer.

        nb_classes: int
            Number of output classes (size of the softmax layer).

        embeddings: None, or ndarray
            If embeddings is None, no embedding layer is created.
            If an array is passed, it is considered the weights of the embedding
            layers; its first dimension must equal the size of the 'mots' vocab.
            TODO: the embedding layers are created but not yet connected.

        dropout: bool
            Whether to add a Dropout(0.15) layer after each hidden activation.

        Returns
        -------

        The Keras network, or None when the model is unknown or the embeddings
        do not match the vocabulary size.
        """
        if model_name not in self.models_:
            print("Model {m} not found".format(m=model_name))
            return None

        # Imported here because Dropout is not among the layers imported at
        # the top of the notebook cell.
        from keras.layers import Dropout

        model = self.models_[model_name]
        featureset = model['featureset']
        vocab_mots = model['vocabs']['mots']

        print("debug create mlp")
        net_model = Sequential()

        if embeddings is not None:

            if len(vocab_mots) != embeddings.shape[0]:
                print("Vocab size {v} must equal embeddings length {e}".format(v=len(vocab_mots), e=embeddings.shape[0]))
                return None

            # One embedding layer per input word, both initialised from the
            # same pre-trained matrix.
            # TODO: these layers are not yet connected to net_model.
            # note: we could also use input_length=2 then share embeddings for both input words
            embeddings_1 = Embedding(input_dim=embeddings.shape[0],
                                     output_dim=embeddings.shape[1],
                                     weights=[embeddings],
                                     input_length=1)

            embeddings_2 = Embedding(input_dim=embeddings.shape[0],
                                     output_dim=embeddings.shape[1],
                                     weights=[embeddings],
                                     input_length=1)

        # TODO: define the inputs for the additional features of each feature
        # set and concatenate them with the embedding outputs. The branches
        # below are placeholders for now.
        if featureset == 'f1':
            pass
        elif featureset == 'f2':
            pass
        else:
            pass

        net_model.add(Dense(128, input_dim=input_dim))
        net_model.add(Activation('relu'))
        if dropout:
            net_model.add(Dropout(0.15))
        net_model.add(Dense(128))
        net_model.add(Activation('relu'))
        if dropout:
            net_model.add(Dropout(0.15))
        net_model.add(Dense(nb_classes))
        net_model.add(Activation('softmax'))

        self.networks_[network_name] = net_model

        return net_model

    def save_network(self, network_name):
        """Save the named network as <path_>/<network_name>.h5, or report that it is unknown."""
        if network_name in self.networks_:
            self.networks_[network_name].save(os.path.join(self.path_, network_name + '.h5'))
        else:
            print("Net " + network_name + " not found")
            
            
            
    

In [14]:
# Smoke test of the RNNManager class: register a model description, build a
# network for it, save it and list the registered networks.
rnn = RNNManager()
# NOTE(review): 'fr' / 'f1' are assumed from the French dummy data; adjust if needed.
rnn.create_model("test", "fr", "f1",
                 {'mots': vocab_mots, 'pdds': vocab_pdd, 'liaisons': vocab_liaisons})
rnn.create_network("test", "test", 10, len(np.unique(Y)))
rnn.save_network("test")
print(rnn.networks_)

type  mlp
debug create mlp
{'test': <keras.engine.sequential.Sequential object at 0x0000018BE7768518>}


In [None]:
# Scratch example of a three-input network built with the Keras functional API.
# The imports are local to this cell because the cells above do not import
# Input, concatenate, Model or an optimizer.
from keras.layers import Input, Dense, concatenate
from keras.models import Model
from keras.optimizers import Adagrad

first_input = Input(shape=(2, ))
first_dense = Dense(1, )(first_input)

second_input = Input(shape=(2, ))
second_dense = Dense(1, )(second_input)

merge_one = concatenate([first_dense, second_dense])

third_input = Input(shape=(1, ))
merge_two = concatenate([merge_one, third_input])

# NOTE(review): `ada_grad` was never defined; assumed to be a default Adagrad optimizer.
ada_grad = Adagrad()

model = Model(inputs=[first_input, second_input, third_input], outputs=merge_two)
model.compile(optimizer=ada_grad, loss='binary_crossentropy',
               metrics=['accuracy'])