#### Licenses
GloVe
Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/

Facebookresearch / FastText words embeddings
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}

License Creative Commons Attribution-Share-Alike License 3.0 (https://creativecommons.org/licenses/by-sa/3.0/)

In [14]:
# optional - plays a sound when a cell completed
# note: for any reason this should be executed after keras imports

from time import time
from IPython import get_ipython
from IPython.display import Audio, display


class InvisibleAudio(Audio):
    def _repr_html_(self):
        audio = super()._repr_html_()
        audio = audio.replace('<audio', f'<audio onended="this.parentNode.removeChild(this)"')
        return f'<div style="display:none">{audio}</div>'

class Beeper:

    def __init__(self, threshold, **audio_kwargs):
        self.threshold = threshold
        self.start_time = None    # time in sec, or None
        self.audio = audio_kwargs

    def pre_execute(self):
        if not self.start_time:
            self.start_time = time()

    def post_execute(self):
        end_time = time()
        if self.start_time and end_time - self.start_time > self.threshold:
            audio = InvisibleAudio(**self.audio, autoplay=True)
            display(audio)
        self.start_time = None


beeper = Beeper(5, url='http://www.soundjay.com/button/beep-07.wav')

ipython = get_ipython()
ipython.events.register('pre_execute', beeper.pre_execute)
ipython.events.register('post_execute', beeper.post_execute)

In [1]:
import os
import io
import pickle
import numpy as np
from keras.utils import to_categorical

class DataManager:
    
    root_dir_ = '.'
    
    
    def __init__(self):
        self.root_dir_ = os.getcwd()
        
    def load_dummy_data(self):
        """
        This method makes available some dummy training data.
        """
        X = self.pickle_load("data/fr_X.pkl")
        Y = self.pickle_load("data/fr_Y.pkl")
        vocab_mots = self.pickle_load("data/fr_vocab_mots.pkl")
        vocab_pdd = self.pickle_load("data/fr_vocab_pdd.pkl")
        vocab_liaisons = self.pickle_load("data/fr_vocab_liaisons.pkl")
        return X, Y, vocab_mots, vocab_pdd, vocab_liaisons
        
    def load_dummy_data_2(self):
        """
        This method makes available some dummy training data.
        """
        data = self.pickle_load("data/f1_fr_project_ok_bool.pkl")
        return data['X'], data['Y'], data['vocab_mots'], data['vocab_pdd'], data['vocab_liaisons'] 
    
    def load_data(self, phase='train', lang='fr', featureset='f1'):
        """
        Loads a dataset for a specific lang and feature set, and phase (train/dev/test).
        
        Parameters
        ----------
        
        phase: str
            'train', 'dev' or 'test'
            
        lang: str
        
        featureset: str
            'f1', 'f2' or 'f3'
        
        """
        name = "{featureset}_{lang}-{phase}".format(lang=lang, featureset=featureset, phase=phase)
        fname = os.path.join('data', name)
        data = self.pickle_load(fname)
        if data:
            vocabs = {}
            X = np.array(data['X'])
            Y = np.array(data['Y'])
            vocabs['WORDS'] = data['vocab_mots']
            vocabs['POS'] = data['vocab_pdd']
            vocabs['LABELS'] = data['vocab_liaisons']
            if isinstance(vocabs['WORDS'], np.ndarray):
                vocabs['WORDS'] = vocabs['WORDS'].tolist()
            if isinstance(vocabs['POS'], np.ndarray):
                vocabs['POS'] = vocabs['POS'].tolist()
            if isinstance(vocabs['LABELS'], np.ndarray):
                vocabs['LABELS'] = vocabs['LABELS'].tolist()
                
            if featureset == 'f2' or featureset == 'f3':
                vocabs['MORPHO'] = data['vocab_morpho']
                vocabs['LEMMA'] = data['vocab_lemma']
                if isinstance(vocabs['MORPHO'], np.ndarray):
                    vocabs['MORPHO'] = vocabs['MORPHO'].tolist()
                if isinstance(vocabs['LEMMA'], np.ndarray):
                    vocabs['LEMMA'] = vocabs['LEMMA'].tolist()
                       
            print("load_data: loaded X = ", X.shape, ", Y = ", Y.shape, ", vocabs = ", 
                  (''.join("{key} ({len}), ".format(key=k, len=len(vocabs[k])) for k in vocabs.keys())))
            return X, Y, vocabs
        
        return None
    
    def merge_vocabs(self, vocab1, vocab2, data, columns, verbose=False):
        """
        Merges vocab2 into vocab1, updating accordingly words references in data columns.
        For all words in vocab2 with index idx_vocab2:
        - if it exists in vocab1, then words features of data in columns refering to idx_vocab2 will be replaced by idx_vocab1
        - if it does not exist in vocab1, it will be appended to vocab1 then replacement will be done as above in data
        
        Parameters
        ----------
        
        vocab1: list(str)
            A list of words from original vocabulary.
                        
        vocab2: list(str)
            A list of words from vocabulary to be merged into vocab1 (used to build X2).
            
        data: array
            Array containing the words indices to be updated by merge of vocabularies.
            Usually this should be an array with samples as first dimension (rows), then columns for features.

        columns: tuple(int)
            Indices of columns to be updated in data.
            
        Returns:
        
        vocab1: list(str)
            A new vocabulary with missing words appended to vocab1.
            Note: vocab1 is also update in place, meaning this function modified original vocab1.
        
        ! data is updated in place.
        
        """
        
        vocab1_ = vocab1.copy()
        data_ = data.copy()
        
        for idx_vocab2, w in enumerate(vocab2):
            # treat vocabs
            if w in vocab1:
                if verbose:
                    print("word [{i}]={w} found in vocab1 at {j}".format(i=idx_vocab2, w=w, j=vocab1.index(w)))
                idx_vocab1 = vocab1.index(w)
            else:
                if verbose:
                    print("word [{i}]={w} not found in vocab1".format(i=idx_vocab2, w=w))
                vocab1.append(w)
                idx_vocab1 = len(vocab1) - 1
                
            # replace all references in data
            for i in range(len(data_)):
                if len(data_.shape) == 1:
                    if data_[i] == idx_vocab2:
                        if verbose:
                            print("Replacing word [{i}]= {idx}, {w} index {fro} to {to}"
                                    .format(i=i, idx=idx_vocab2, w=vocab2[idx_vocab2], 
                                            fro=idx_vocab2, to=idx_vocab1))
                        data[i] = idx_vocab1
                        #Replacing word [796]= 1, LEFT_advcl index 1 to 2
                        
                else:
                    for idx, j in enumerate(columns):
                        if data_[i][j] == idx_vocab2:
                            if verbose:
                                print("Replacing word [{i},{j}]= {idx}, {w} index {fro} to {to}"
                                      .format(i=i, j=j, idx=idx_vocab2, w=vocab2[idx_vocab2], 
                                              fro=idx_vocab2, to=idx_vocab1))
                            data[i][j] = idx_vocab1

        return vocab1, data
        

    # some utilities for saving results
    def safe_pickle_dump(self, filename, obj):
        """
        Serializes an object to file, creating directory structure if it does not exist.
        """
        name = filename
        print("pickle.dump "+name)
        try:
            os.makedirs(os.path.dirname(name), exist_ok=True)
            f = open(name,"wb")
            pickle.dump(obj,f)
            f.close()
        except:
            return False
    
        return True
    
    def pickle_load(self, filename):
        """
        Deserialize an object from a file created with pickle.dump.
        Returns False if this failed.
        """
        name = filename
        print("pickle.load "+name)
        try:
            f = open(name,"rb")
            obj = pickle.load(f)
            f.close()
            return obj
        except Exception as e:
            print(e)
            return None
      
        return None
        
    def load_embeddings(self, lang, type='fasttext'):
        """
        Loads an embeddings file depending on its type and language.
        
        Parameters
        ----------
        
        type: str
            Only "fasttext" is supported.
            
        lang: str
            See load_fasttext_embeddings(lang)
        
        """
        
        return self.load_fasttext_embeddings(lang)
        
    def load_fasttext_embeddings(self, lang):
        """
        Loads a fasttext embedding, chosen depending on lang parameter provided.
        File expected as root_dir_/data/embeddings/facebookresearch/wiki.{lang}.vec
        (or as root_dir_/cache/wiki.{lang}.vec.pkl if already loaded once)
        
        Parameters
        ----------
        
        lang: str
            One of 'fr', 'ja', 'en', 'nl' (or additional ones depending on embeddings present on disk).
        
        """
        data_dict = {}
        
        pickle_fname = "wiki.{lang}.vec.pkl".format(lang=lang)
        pickle_ffname = os.path.join(self.root_dir_, 'cache', pickle_fname)
        
        if os.path.isfile(pickle_ffname):
            data_dict = self.pickle_load(pickle_ffname)
            print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=pickle_ffname))
        else: 
            fname = "wiki.{lang}.vec".format(lang=lang)
            data_file = os.path.join(self.root_dir_, 'data', 'embeddings', 'facebookresearch', fname)
        
            fin = io.open(data_file, 'r', encoding='utf-8', newline='\n', errors='ignore')
            n, d = map(int, fin.readline().split())
            
            for line in fin:
                tokens = line.rstrip().split(' ')
                data_dict[tokens[0]] = list(map(float, tokens[1:]))
            print("Embedding for {lang} loaded from {fname}".format(lang=lang, fname=data_file))
            # save embeddings as array format to improve speed next time
            self.safe_pickle_dump(pickle_ffname, data_dict)

        print("Read ", len(data_dict), " words vectors")
        return data_dict
    
    def get_words_to_match_for_embeddings(self, word):
        """
        Returns a list with same word, word lowercased, word lowercased and dashes removed, 
        then all this plus \xa0 removed.
        
        Parameters
        ----------
        
        word: str
            Word to be transformed.
            
        Returns: list
            List of transformed word forms.
        """
        return [word, word.lower(), word.lower().replace('-', ''), word.lower().replace('-', '').replace('\\xa0', ' ')]
    
    def align_embeddings(self, vocab, embeddings, augment_vocab=True, max_vocab_size=-1):
        """
        Generates aligned embeddings from original embeddings, and a vocabulary of words.
        Words from vocabulary may not exist in original embeddings, in this case a random vector is generated.
        Words matching is done as (by priority) : exact match, then case insensitively, then with dash ('-') removed.
        
        Parameters
        ----------
        
        vocab: array
            An array containing each word in the vocabulary.
            
        embeddings: dict
            A dict object with words as keys and their embeddings (as a vector array) as values.
            
        augment_vocab: boolean
            If True, then all words from embeddings not existing in vocab, are appended to vocab, after
            alignment is done.
            
        max_vocab_size: int
            Maximum length of resulting vocab if augment_vocab is True and if max_vocab_size < original vocab length.
            If -1 then there is no limit.
            Note: resulting vocab size can't be < original vocab size, whatever the value of max_vocab_size.
            
        Returns
        -------
        
        aligned_embeddings: list
            An array containing:
          - for words from vocab found in embeddings, the corresponding embedding at same index as in vocab.
          - for words not found in embeddings, a random vector, at same index as in vocab.
          - if augment_vocab is True, all remaining words from embeddings (not found in vocab) are added after
            len(vocab)
        
        words_not_found: list
            An array containing indices (based on vocab) of words not found in embeddings and replaced by random
            values.
            
        words_matched: list
            An array of strings of words based on vocab, as they were matched in embeddings.
            For example if lowercased word from vocab was matched, then lowercase version of this word will be found
            in this table (whereas the original case sensitive word will remain as is in vocab array)
        
        """
        dim_embedding = len(embeddings[list(embeddings.keys())[0]]) # find length from value of 'first' key
        
        print("align_embeddings: aligning embeddings ({elen},{edim}) with vocab ({vlen}) using at most {max}"
             .format(elen=len(embeddings), edim=dim_embedding, vlen=len(vocab), max=max_vocab_size))
        
        cur_size = len(vocab) # to avoid computing vocab len at each iteration
        
        # first append missing embeddings to vocab, if required, to limit unknown words
        if augment_vocab:
            for embedding_word in enumerate(embeddings.keys()):
                # adds missing word only up to max vocab size, if used
                if max_vocab_size is not -1 and cur_size > max_vocab_size:
                    break
                elif embedding_word not in vocab:
                    vocab.append(embedding_word)
                    cur_size + cur_size + 1
        
            print("align_embeddings: new vocab size : ", len(vocab))
        
        aligned_embeddings = np.zeros((len(vocab), dim_embedding))
        words_not_found = []
        words_matched = [None] * len(vocab)
        for idx_mot, mot in enumerate(vocab):
            words_to_match = self.get_words_to_match_for_embeddings(mot)
            for word_to_match in words_to_match:
                if word_to_match in embeddings.keys():
                    aligned_embeddings[idx_mot] = embeddings[word_to_match]
                    words_matched[idx_mot] = word_to_match
                    break
            if words_matched[idx_mot] is None:
                words_not_found.append(idx_mot)
                words_matched[idx_mot] = mot
                aligned_embeddings[idx_mot] = np.random.rand(dim_embedding)
        
        print("aligned_embeddings new embeddings shape {shap}, words not found {wnf}, words found {wf}"
             .format(shap=aligned_embeddings.shape, wnf=len(words_not_found), wf=len(words_matched)))
        return aligned_embeddings, words_not_found, words_matched

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [17]:
import os
import pickle
import numpy as np
import pandas as pd

import keras
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import Dense, Activation, Concatenate, Embedding, concatenate, Flatten, Dropout
from keras.engine.input_layer import Input
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

class RNNManager:
    
    UNKNOWN_WORD = '<UNK>' #@TODO handle unknown word (apax)
    
    MAX_VOCAB_SIZE = -1
    
    models_ = {}
    networks_ = {}
    path_ = '.'
    
    embeddings_ = None
    embeddings_for_words_ = None
    embeddings_for_lemmas_ = None
    
    dm_ = DataManager()
    
    def __init__(self, path='.', max_vocab_size=-1):
        """
        Class to handle neural networks for TAL - TBP AE purpose.
        
        Parameters
        ----------
        
        path: str
            Root path to find files (path/.), scripts (path/.), cache (path/cache), embeddings (path/data/embeddings/...) ...
            Current path by default.
            
        max_vocab_size: int
            Maximum length of a vocabulary - hence for an embedding matrix used in a network.
            This can be set to limit amount of memory used by vocabs and embeddings.
            In practice, original pre-trained embeddings vectors are truncated up to this length, considering
            that embeddings used (fasttext) are ordered from most frequent to least frequent word.
            -1 (default value) means no limitation.
        
        """
        if path != '.':
            os.makedirs(os.path.dirname(path), exist_ok=True)
        self.path_ = path
        self.MAX_VOCAB_SIZE = max_vocab_size
        self.dm_ = DataManager()
        
    def get_model(self, model_name):
        """
        Returns an existing TAL model, or None.
        
        Parameters
        ----------
        
        model_name: str
            Name of the model.
        """
        if model_name not in self.models_.keys():
            print("Model {m} not found".format(m=model_name))
            return None
        return self.models_[model_name]
        
    def create_model(self, model_name, lang, featureset, use_forms, vocabs, vocabs_dev, vocabs_test, embeddings_file=None):
        """
        Creates a new TAL model.
        
        Parameters
        ----------
        
        model_name: str
            Name of this model.
            
        lang: str
            'fr', 'ja', 'nl', 'en'
            
        featureset: str
            'f1', 'f2' or 'f3'
            
        add_forms: boolean
            If true then both words forms are added on top of the features.
            
        vocabs: dict
            Vocabs for learning task, with keys 'WORDS', 'POS', 'MORPHO' and/or 'LEMMA' (or 'LABELS' for targets).
        
        """
        model = {'name': model_name,
                'lang': lang,
                'featureset': featureset,
                'use_forms': use_forms,
                'vocabs': vocabs,
                'vocabs_dev': vocabs_dev,
                'vocabs_test': vocabs_test,
                'embeddings_file': embeddings_file}
        if model_name not in self.models_.keys():
            self.models_[model_name] = model
            
        return model
            
    def remove_model(self, model_name):
        """
        """
        if model_name in self.models_.keys():
            return self.models_.pop(model_name, None)
    
    def save_model(self, model_name):
        """
        """
        if model_name in self.models_:
            self.dm_.safe_pickle_dump(os.path.join(self.path_, model_name + '-model.pkl'), self.models_[model_name])
        else:
            print("Model " + model_name + " not found")
            
    def load_model(self, model_fname):
        """
        """
        model_conf = self.dm_.pickle_load(os.path.join(self.path_, model_name, '-model.pkl'))
        if model:
            self.create_model(model_name=model['name'], lang=model['lang'], featureset=model['featureset'], 
                              vocabs=model['vocabs'])
            return 
            
        
    def create_network(self, network_name, model_name, nb_classes, dropout=False):
        """
        
        
        Parameters
        ----------
        
        language: 'fr', 'ja', 'nl', 'en'
            For french, japanese, dutch, or english. 
        
        featureset: 'f1', 'f2', 'f3'
            
        """
        net_model = None
        
        if model_name not in self.models_.keys():
            print("Model {m} not found".format(m=model_name))
            return None
        
        model = self.models_[model_name]
        featureset = model['featureset']
        use_forms = model['use_forms']
        lang = model['lang']
        vocabs = model['vocabs']
        
        net_model = Model()
                       
        #if embeddings != None:
            
        embedsw = self.embeddings_for_words_
        embedsl = self.embeddings_for_lemmas_
        
        if embedsw is not None:
            dim_embeddings = embedsw.shape[1]
        else:
            dim_embeddings = 300
        
        if use_forms:
            if embedsw is not None:
            
                if len(vocabs['WORDS']) != embedsw.shape[0]:
                    print("Words vocab size {v} must equal embeddings length {e}".format(v=len(vocabs['WORDS']), 
                                                                               e=embedsw.shape[0]))
                    return None
            
                # Pretrained Embedding layer for 2 words
                input_word1 = Input(shape=(1,), dtype='int32', name='word_input_1')
                embeddings_w1 = Embedding(
                    input_dim=len(vocabs['WORDS']), 
                    output_dim=dim_embeddings, 
                    weights=[embedsw], 
                    input_length=1)(input_word1)
                embeddings_w1 = Flatten()(embeddings_w1)
                
                # Embedding layer for second word
                input_word2 = Input(shape=(1,), dtype='int32', name='word_input_2')
                embeddings_w2 = Embedding(input_dim=len(vocabs['WORDS']), 
                                     output_dim=dim_embeddings, 
                                     weights=[embedsw], 
                                     input_length=1)(input_word2)
                #embeddings_2 = embeddings_layer(input_word2) # sharing weights between both words embeddings
                embeddings_w2 = Flatten()(embeddings_w2)
            
            else:
                # Embedding layer for 2 words
                input_word1 = Input(shape=(1,), dtype='int32', name='word_input_1')
                embeddings_w1 = Embedding(
                    input_dim=len(vocabs['WORDS']), 
                    output_dim=dim_embeddings, 
                    input_length=1)(input_word1)
                embeddings_w1 = Flatten()(embeddings_w1)
                
                # Embedding layer for second word
                input_word2 = Input(shape=(1,), dtype='int32', name='word_input_2')
                embeddings_w2 = Embedding(
                    input_dim=len(vocabs['WORDS']), 
                    output_dim=dim_embeddings, 
                    input_length=1)(input_word2)
                embeddings_w2 = Flatten()(embeddings_w2)
            
        if featureset is not 'f1' and embedsl is not None:
            # we must define also inputs and embeddings for lemmas
            if len(vocabs['LEMMA']) != embedsl.shape[0]:
                print("Lemma vocab size {v} must equal embeddings length {e}".format(v=len(vocabs['LEMMA']), 
                                                                               e=embedsl.shape[0]))
                return None
            
            # Pretrained Embedding layer for 2 lemmas
            input_lemma1 = Input(shape=(1,), dtype='int32', name='lemma_input_1')
            embeddings_l1 = Embedding(
                input_dim=len(vocabs['LEMMA']), 
                output_dim=dim_embeddings, 
                weights=[embedsl], 
                input_length=1)(input_lemma1)
            embeddings_l1 = Flatten()(embeddings_l1)
                
            # Embedding layer for second word
            input_lemma2 = Input(shape=(1,), dtype='int32', name='lemma_input_2')
            embeddings_l2 = Embedding(
                input_dim=len(vocabs['LEMMA']), 
                output_dim=dim_embeddings, 
                weights=[embedsl], 
                input_length=1)(input_lemma2)
            embeddings_l2 = Flatten()(embeddings_l2)
            
        elif featureset is not 'f1':
            
            # Embedding layer for 2 lemmas
            input_lemma1 = Input(shape=(1,), dtype='int32', name='lemma_input_1')
            embeddings_l1 = Embedding(
                input_dim=len(vocabs['LEMMA']), 
                output_dim=dim_embeddings, 
                input_length=1)(input_lemma1)
            embeddings_l1 = Flatten()(embeddings_l1)
                
            # Embedding layer for second word
            input_lemma2 = Input(shape=(1,), dtype='int32', name='lemma_input_2')
            embeddings_l2 = Embedding(
                input_dim=len(vocabs['LEMMA']), 
                output_dim=dim_embeddings, 
                input_length=1)(input_lemma2)
            embeddings_l2 = Flatten()(embeddings_l2)            
            
        
        # define input for additional features
        # note: dist is restricted to [-7 ... 7] so 15 values
        if featureset == 'f1':
            """ S.0.POS
                B.0.POS
                DIST"""
            dim_features = len(vocabs['POS']) * 2 + 15
        elif featureset == 'f2':
            """ S.0.POS
                S.0.LEMMA
                S.0.MORPHO
                S.-1.POS
                B.0.POS
                B.0.LEMMA
                B.0.MORPHO
                B.-1.POS
                B.1.POS
                DIST"""
            dim_features = len(vocabs['POS']) * 4 + len(vocabs['LEMMA']) * 2 + len(vocabs['MORPHO']) * 2 + 15
        else:
            """ S.0.POS
                S.0.LEMMA
                S.0.MORPHO
                S.-1.POS
                B.0.POS
                B.0.LEMMA
                B.0.MORPHO
                B.-2.POS
                B.-1.POS
                B.1.POS
                DIST"""
            # same size as 'f2'
            dim_features = len(vocabs['POS']) * 5 + len(vocabs['LEMMA']) * 2 + len(vocabs['MORPHO']) * 2 + 15
        print("  expecting {n} features dim".format(n=dim_features))
        
        # define input for features
        features_input = Input(shape=(dim_features,))
        
        if embeddings is not None:
            # concatenate features and embeddings
            l = concatenate([embeddings_1, embeddings_2, features_input])
        else:
            l = features_input
        
        # adding dense layers
        
        l = Dense(128)(l)
        l = Activation('relu')(l)
        if dropout:
            l = Dropout(0.15)(l)
        l = Dense(128)(l)
        l = Activation('relu')(l)
        if dropout:
            l = Dropout(0.15)(l)
        
        l = Dense(nb_classes)(l)
        out = Activation('softmax')(l)
        
        if embeddings is not None:
            net_model = Model([input_word1, input_word2, features_input], out)
        else:
            net_model = Model(features_input, out)
        
        # not sure where to compile the model ...
        net_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            
        if net_model:
            self.networks_[network_name] = net_model
            # save initial state of this network (this also saves embedding layers, so we remove them)
            save_network(network_name)
            #del self.embeddings_for_words_
            #del self.embeddings_for_lemmas_
        
        return net_model

    def save_network(self, network_name):
        if name in self.networks_:
            self.networks_[network_name].save(os.path.join(self.path_, '/', network_name, '-net.h5'))
        else:
            print("Net " + network_name + " not found")
            
    def load_network(self, network_name):
        """
        Loads a keras network model (.h5, architecture and weights) from disk.
        """
        self.networks_[network_name] = load_model(os.path.join(self.path_, network_name, '-net.h5'))
        
        return net_model            

    def remove_network(self, network_name):
        """
        Removes a network model from rnnmanager (not from disk !), if it exists it is returned by this method.
        """
        if network_name in self.networks_.keys():
            return self.networks_.pop(model_name, None)    
    
    def preprocess_embeddings(self, model_name, augment_vocabs=True):
        """
        Preprocess embeddings for training a network for this model.
        Notes:
        - should be run once BEFORE creating a network, if you want to use pre-trained embeddings to train this network, 
          if not embeddings will be completely learned during training
        - loaded embeddings should be deleted before loading/running a new network to avoid exhausting memory.
        - if called again for another model embeddings will be REPLACED
        - for already created network embedded remain and will be saved along with the network (no need to call this if
          you load a saved network)
            - at network creation, once net model is saved to disk with keras model.save(), loaded embeddings are deleted
              to free memory
            - to manually remove loaded embeddings, delete attributes embeddings_, embeddings_for_words_ and embeddings_for_lemmas_
        
        Parameters
        ----------
        
        model_name: str
            Name of the TAL model.
            
        Returns: nothing
        
        """
        
        if model_name not in self.models_.keys():
            print("Model {m} not found".format(m=model_name))
            return None
        model = self.models_[model_name]
        featureset = model['featureset']
        use_forms = model
        lang = model['lang']
        vocabs = model['vocabs']
        
        if (self.embeddings_for_words_ is None and use_forms) or self.embeddings_for_lemmas_ is None:
        
            if self.embeddings_ is None:
                print("preprocess_embeddings: loading original embeddings ...")
                self.embeddings_ = self.dm_.load_embeddings(lang)
            
            faligned = os.path.join(self.path_, 'cache', model_name + '-embeddings-forms-aligned.pkl')
            if os.path.isfile(faligned):
                self.embeddings_for_words_ = self.dm_.pickle_load(faligned)
            if self.embeddings_for_words_ is None:
                print("preprocess_embeddings: aligning embeddings for forms ...")
                self.embeddings_for_words_, words_not_found, words_matched = self.dm_.align_embeddings(
                    vocabs['WORDS'], self.embeddings_, augment_vocab=augment_vocabs)
                self.dm_.safe_pickle_dump(faligned, self.embeddings_for_words_)
                    
            # process vocab for lemmas even if not 'f1', so it can be reused for f2 or f3 for same language
            faligned = os.path.join(self.path_, 'cache', model_name + '-embeddings-lemmas-aligned.pkl')
            if os.path.isfile(faligned):
                self.embeddings_for_lemmas_ = self.dm_.pickle_load(faligned)
            if self.embeddings_for_lemmas_ is None:
                print("preprocess_embeddings: aligning embeddings for lemmas ...")
                self.embeddings_for_lemmas_, words_not_found, words_matched = self.dm_.align_embeddings(
                    vocabs['LEMMA'], self.embeddings_, augment_vocab=augment_vocabs)
                self.dm_.safe_pickle_dump(faligned, self.embeddings_for_lemmas_)
       
        # free some memory - original embeddings should be useless now
        del self.embeddings_
        
        return
    
            
    def preprocess_data(self, model_name, X_train, y_train, X_dev, y_dev, X_test, y_test):
        """
        Preprocess data for training a network for this model.
        - merges vocabs (WORDS) of X_dev into X_train
        - merges vocabs (LEMMA) of X_dev into X_train, if any
        - merges vocabs (MORPHO) of X_dev into X_train, if any
        - merges vocabs (labels) of Y_dev and Y_test into Y_train
        - converts to categorical (one-hot), features of X_* (POS, MORPHO, DIST)
        
        Note: arrays and vocabs are updated in place, so keep a copy of originals if required.
        
        Parameters
        ----------
        
        model_name: str
            TAL model to use for treating data (defines features and vocabs).
            
        X_train, y_train, X_dev, y_dev, X_test, y_test: arrays
            Training, validation and test data to process.
            
        Returns
        -------
        
        X_train, y_train, X_dev, y_dev, X_test, y_test
            
        """
        
        if model_name not in self.models_.keys():
            print("Model {m} not found".format(m=model_name))
            return None
        model = self.models_[model_name]
        featureset = model['featureset']
        use_forms = model['use_forms']
        lang = model['lang']
        vocabs = model['vocabs']
        vocabs_dev = model['vocabs_dev']
        vocabs_test = model['vocabs_test']
        
        # Treat vocabularies
        
        if use_forms:
            print("preprocess_data: merging WORDS vocabs from dev into train ...")
            self.dm_.merge_vocabs(vocab1=vocabs['WORDS'], vocab2=vocabs_dev['WORDS'], data=X_dev, columns=(0, 1))
            # ... vocabs_dev['WORDS'] is now useless
        
        if 'POS' in vocabs:
            nb_classes_pos = len(np.unique(vocabs['POS']))
        if 'MORPHO' in vocabs:
            print("preprocess_data: merging and aligning MORPHO vocabs from dev into train ...")
            self.dm_.merge_vocabs(vocab1=vocabs['MORPHO'], vocab2=vocabs_dev['MORPHO'], data=X_dev, columns=(4, 8))
            # ... vocabs_dev['MORPHO'] is now useless
            nb_classes_morpho = len(np.unique(vocabs['MORPHO']))
        if 'LEMMA' in vocabs:
            print("preprocess_data: merging and aligning LEMMA vocabs from dev into train ...")
            self.dm_.merge_vocabs(vocab1=vocabs['LEMMA'], vocab2=vocabs_dev['LEMMA'], data=X_dev, columns=(3, 7))
            # ... vocabs_dev['LEMMA'] is now useless
        
        print(vocabs.keys())
        print(vocabs_dev.keys())
        print("preprocess_data: merging and aligning LABEL vocabs from dev into train ...")
        self.dm_.merge_vocabs(vocab1=vocabs['LABELS'], vocab2=vocabs_dev['LABELS'], data=y_dev, columns=(0,))
        print("preprocess_data: merging and aligning LABEL vocabs from test into train ...")
        self.dm_.merge_vocabs(vocab1=vocabs['LABELS'], vocab2=vocabs_test['LABELS'], data=y_test, columns=(0,))
        nb_classes = len(vocabs['LABELS'])
            
        # convert some features to one-hot encoding
        
        nb_classes_dist = 8
        
        
        print("preprocess_data: converting {f} features to one-hot encoding...".format(f=featureset))
        
        if featureset == 'f1':
            cats_pos1 = to_categorical(X_train[:, 2], num_classes=nb_classes_pos)
            cats_pos2 = to_categorical(X_train[:, 3], num_classes=nb_classes_pos)
            # dist is positive for arc eager
            cats_dist = to_categorical(np.abs(X_train[:, 4]), num_classes=nb_classes_dist)
            X_train = np.column_stack((X_train[:, 0], X_train[:, 1], cats_pos1, cats_pos2, cats_dist))
            
            cats_pos1 = to_categorical(X_dev[:, 2], num_classes=nb_classes_pos)
            cats_pos2 = to_categorical(X_dev[:, 3], num_classes=nb_classes_pos)
            # dist is positive for arc eager
            cats_dist = to_categorical(np.abs(X_dev[:, 4]), num_classes=nb_classes_dist)
            X_dev = np.column_stack((X_dev[:, 0], X_dev[:, 1], cats_pos1, cats_pos2, cats_dist))
            
            cats_pos1 = to_categorical(X_test[:, 2], num_classes=nb_classes_pos)
            cats_pos2 = to_categorical(X_test[:, 3], num_classes=nb_classes_pos)
            # dist is positive for arc eager
            cats_dist = to_categorical(np.abs(X_test[:, 4]), num_classes=nb_classes_dist)
            X_test = np.column_stack((X_test[:, 0], X_test[:, 1], cats_pos1, cats_pos2, cats_dist))
            
        else: # 'f2' and 'f3' featuresets have same structure
            """ 
            S.0.POS
            S.0.LEMMA
            S.0.MORPHO
            S.-1.POS (f2, or S.1.POS for f3)
            B.0.POS
            B.0.LEMMA
            B.0.MORPHO
            B.-1.POS
            B.1.POS
            DIST"""                
            cats_pos1    = to_categorical(X_train[:, 2], num_classes=nb_classes_pos)
            cats_morpho1 = to_categorical(X_train[:, 4], num_classes=nb_classes_morpho)
            cats_pos2    = to_categorical(X_train[:, 5], num_classes=nb_classes_pos)
            cats_pos3    = to_categorical(X_train[:, 6], num_classes=nb_classes_pos)
            cats_morpho2 = to_categorical(X_train[:, 8], num_classes=nb_classes_morpho)
            cats_pos4    = to_categorical(X_train[:, 9], num_classes=nb_classes_pos)
            cats_pos5    = to_categorical(X_train[:, 10], num_classes=nb_classes_pos)
            cats_dist    = to_categorical(np.abs(X_train[:, 11]), num_classes=nb_classes_dist)
            X_train = np.column_stack((X_train[:, 0], X_train[:, 1], cats_pos1, X_train[:, 3],
                                      cats_morpho1, cats_pos2, cats_pos3,
                               X_train[:, 7], cat_trains_morpho2, cats_pos4, cats_pos5, cats_dist))
            
            cats_pos1    = to_categorical(X_dev[:, 2], num_classes=nb_classes_pos)
            cats_morpho1 = to_categorical(X_dev[:, 4], num_classes=nb_classes_morpho)
            cats_pos2    = to_categorical(X_dev[:, 5], num_classes=nb_classes_pos)
            cats_pos3    = to_categorical(X_dev[:, 6], num_classes=nb_classes_pos)
            cats_morpho2 = to_categorical(X_dev[:, 8], num_classes=nb_classes_morpho)
            cats_pos4    = to_categorical(X_dev[:, 9], num_classes=nb_classes_pos)
            cats_pos5    = to_categorical(X_dev[:, 10], num_classes=nb_classes_pos)
            cats_dist    = to_categorical(np.abs(X_dev[:, 11]), num_classes=nb_classes_dist)
            X_dev = np.column_stack((X_dev[:, 0], X_dev[:, 1], cats_pos1, X_dev[:, 3],
                                      cats_morpho1, cats_pos2, cats_pos3,
                               X_dev[:, 7], cat_trains_morpho2, cats_pos4, cats_pos5, cats_dist))
            

            cats_pos1    = to_categorical(X_test[:, 2], num_classes=nb_classes_pos)
            cats_morpho1 = to_categorical(X_test[:, 4], num_classes=nb_classes_morpho)
            cats_pos2    = to_categorical(X_test[:, 5], num_classes=nb_classes_pos)
            cats_pos3    = to_categorical(X_test[:, 6], num_classes=nb_classes_pos)
            cats_morpho2 = to_categorical(X_test[:, 8], num_classes=nb_classes_morpho)
            cats_pos4    = to_categorical(X_test[:, 9], num_classes=nb_classes_pos)
            cats_pos5    = to_categorical(X_test[:, 10], num_classes=nb_classes_pos)
            cats_dist    = to_categorical(np.abs(X_test[:, 11]), num_classes=nb_classes_dist)
            X_test = np.column_stack((X_test[:, 0], X_test[:, 1], cats_pos1, X_test[:, 3],
                                      cats_morpho1, cats_pos2, cats_pos3,
                               X_test[:, 7], cats_morpho2, cats_pos4, cats_pos5, cats_dist))
                        
        print("preprocess_data: converting LABELs to one-hot encoding ...")
        y_train = to_categorical(y_train, num_classes=nb_classes)
        y_dev = to_categorical(y_dev, num_classes=nb_classes)
        y_test = to_categorical(y_test, num_classes=nb_classes)
                
        return X_train, y_train, X_dev, y_dev, X_test, y_test
    

In [18]:
del rnn

# define a manager with hard limit of 100000 for vocabularies lengths
rnn = RNNManager(path='.', max_vocab_size=100000)

In [None]:
import time

t = time.time()

print()

dm = DataManager()
X_train, y_train, vocabs_train = dm.load_data()
X_dev, y_dev, vocabs_dev = dm.load_data(phase='dev')
X_test, y_test, vocabs_test = dm.load_data(phase='test')

print("loaded data in ", time.time() - t)

nb_classes = len(vocabs_train['LABELS'])

lang = 'fr'
featureset = 'f1'
forms = False
model_name = "{featureset}_{lang}".format(featureset=featureset, lang=lang)



# create a TAL model for this combination, using forms
rnn.remove_model(model_name)
rnn.create_model(model_name, lang, featureset, forms, vocabs_train, vocabs_dev, vocabs_test)
rnn.save_model(model_name)

t = time.time()

print()
# preprocess X/Y data
X_train, y_train, X_dev, y_dev, X_test, y_test = rnn.preprocess_data(model_name, X_train, y_train, X_dev, y_dev, X_test, y_test)

print("Preprocessed data in ", time.time() - t)

t = time.time()
print()
# load pretrained embeddings
rnn.preprocess_embeddings(model_name, augment_vocabs=True)

print("Preprocessed embeddings in ", time.time() - t)

t = time.time()
print()
# create a classifier for this TAL model
network_name = "{model}".format(model=model_name)
net = rnn.create_network(network_name, model_name, nb_classes=nb_classes, dropout=True)

print("Created net in ", time.time() - t)

print(rnn.networks_)


Loading data\f1_fr-train
load_data: loaded X =  (679243, 6) , Y =  (679243,) , vocabs =  WORDS (42278), POS (20), LABELS (90), 
Loading data\f1_fr-dev
load_data: loaded X =  (68105, 6) , Y =  (68105,) , vocabs =  WORDS (9275), POS (20), LABELS (74), 
Loading data\f1_fr-test
load_data: loaded X =  (19119, 6) , Y =  (19119,) , vocabs =  WORDS (3285), POS (20), LABELS (68), 
loaded data in  274.11958408355713
pickle.dump .\f1_fr-model.pkl

dict_keys(['WORDS', 'POS', 'LABELS'])
dict_keys(['WORDS', 'POS', 'LABELS'])
preprocess_data: merging and aligning LABEL vocabs from dev into train ...
preprocess_data: merging and aligning LABEL vocabs from test into train ...
preprocess_data: converting f1 features to one-hot encoding...
preprocess_data: converting LABELs to one-hot encoding ...
Preprocessed data in  2.1951284408569336

preprocess_embeddings: loading original embeddings ...
pickle.load C:\IAAA\tal-github\cache\wiki.fr.vec.pkl


In [12]:
history = net.fit([X_train[:, 0], X_train[:, 1], X_train[:, 2:X_train.shape[1]]], 
                  y_train,
                  validation_data=(
                      [X_dev[:, 0], X_dev[:, 1], X_dev[:, 2:X_dev.shape[1]]], 
                      y_dev)
                  epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
rnn.save_network(network_name)

- il n'y a pas les mots dans les features (seulement les lemmes)

- ajouter les mots uniquement pour l'expérience en plus

- embeddings pour les lemmes: il faudrait ajouter tous les mots des embeddings ! (et pas les enlever lors de l'alignement)

