In [20]:
import warnings

import numpy as np
import word2vec
from keras.layers import Input, Embedding, LSTM, Activation, Dense, Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, one_hot
from keras.utils import to_categorical

In [2]:
def load_equations(path):
    """Read a tab-separated equations file.

    Each usable line has exactly two tab-separated fields: the equation text
    and its integer result. Lines with any other number of fields (including
    empty lines) are ignored.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded text file.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(inputs, targets)``: equation texts and their results, in file order.

    Raises
    ------
    ValueError
        If the second field of a two-field line is not a valid integer.
    """
    with open(path, encoding="utf8") as handle:
        rows = [raw.split('\t') for raw in handle.read().split('\n')]

    # Keep only well-formed "<text>\t<value>" rows.
    pairs = [row for row in rows if len(row) == 2]

    inputs = [text for text, _ in pairs]
    targets = [int(value) for _, value in pairs]
    return (inputs, targets)

In [3]:
# Read the tab-separated equations file: `inputs` holds the equation texts,
# `targets` the matching integer results.
inputs, targets = load_equations('./equations.txt')

In [4]:
# Fit a Keras Tokenizer on the equation texts to build the word <-> index vocabulary.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(inputs)

In [5]:
# Encode every equation as a list of integer word indices.
tokens=tokenizer.texts_to_sequences(inputs)

In [6]:
# Peek at the first few encoded equations instead of dumping all 1000 rows.
tokens[:5]

[[2, 1, 9, 4, 2, 1, 24],
 [13, 2, 6, 2, 1, 11, 5, 7, 9],
 [10, 6, 13],
 [10, 11, 4, 16],
 [14, 6, 2, 1, 7, 12]]

In [7]:
# Convert the list of integer results into a NumPy array to use as the training target.
target=np.array(targets)

In [8]:
# Show the learned vocabulary (index -> word); the indices start at 1.
tokenizer.index_word

{1: 'vingt',
 2: 'quatre',
 3: 'soixante',
 4: 'moins',
 5: 'plus',
 6: 'fois',
 7: 'dix',
 8: 'quarante',
 9: 'huit',
 10: 'cinquante',
 11: 'neuf',
 12: 'sept',
 13: 'trente',
 14: 'un',
 15: 'deux',
 16: 'trois',
 17: 'six',
 18: 'cinq',
 19: 'et',
 20: 'douze',
 21: 'quatorze',
 22: 'treize',
 23: 'quinze',
 24: 'onze',
 25: 'seize',
 26: 'cent',
 27: 'zéro'}

In [9]:
# Pad the variable-length index sequences into a single 2-D array of uniform length.
inputs_pad=pad_sequences(tokens)

In [10]:
# Sanity check: (number of equations, padded sequence length).
inputs_pad.shape

(1000, 13)

In [11]:
# Load the pretrained word2vec vectors (frWiki .bin file) from disk.
# `word2vec` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
model_emb = word2vec.load('./convertvec/frWiki_no_phrase_no_postag_700_cbow_cut100.bin')

In [12]:
# Sanity-check the embeddings: query the entries closest to 'zéro' by cosine
# similarity and format the (indexes, scores) result as (word, score) pairs.
indexes,scores=model_emb.cosine('zéro')
model_emb.generate_response(indexes,scores).tolist()

[('adrienne', 0.3597169864147758),
 ('paulin', 0.3526609766799178),
 ('diviseur', 0.3348377761810345),
 ('compteur', 0.33382486971660164),
 ('recommencer', 0.32622177383600376),
 ('nul', 0.31250234747562766),
 ('cal', 0.3093413112138357),
 ('promu', 0.30614208029938256),
 ('affilée', 0.30229481378877815),
 ('numération', 0.2838326885507386)]

In [13]:
# Inspect the raw pretrained vector of one vocabulary word ('vingt' is index 1 in the tokenizer).
model_emb.get_vector('vingt')

array([ 6.49361685e-02,  4.74088974e-02,  2.64277030e-02, -9.85254627e-03,
        2.19576675e-02, -3.07958797e-02,  9.37409422e-05, -7.02790692e-02,
       -8.05385485e-02, -4.98326961e-03, -1.99786592e-02, -2.21430440e-03,
        3.08932140e-02, -5.74806035e-02, -2.09862143e-02, -7.21693132e-03,
       -1.58714317e-02, -1.42607642e-02,  4.06044945e-02,  2.12558992e-02,
        2.98322812e-02, -1.13926819e-02, -7.27641908e-03,  1.14099486e-02,
       -1.30566815e-02, -4.96807881e-02,  5.20690233e-02, -6.24756934e-03,
       -3.35043408e-02,  3.53618562e-02, -1.14390552e-01,  3.40101235e-02,
        4.31646267e-03, -5.36387321e-03, -4.37185206e-02,  6.08459115e-03,
       -1.74392350e-02,  5.85248061e-02, -3.37524489e-02,  5.87566104e-03,
       -2.74813455e-03,  3.23909447e-02,  3.04225441e-02,  6.86018961e-03,
        4.04522084e-02, -1.94166917e-02, -1.18606966e-02, -1.39788091e-02,
       -1.68824885e-02,  4.70676720e-02,  8.06638598e-02,  9.68034007e-03,
        4.66484837e-02,  

In [14]:
# Keep the tokenizer's index -> word mapping; it drives the embedding-matrix construction.
index_word=tokenizer.index_word

In [15]:
def pretrained_embedding_layer(model_emb, index_word):
    """Build a frozen Keras ``Embedding`` layer initialised from pretrained vectors.

    Parameters
    ----------
    model_emb : word2vec.WordVectors
        Pretrained model exposing ``vectors`` (2-D array, one row per word)
        and ``get_vector(word)``.
    index_word : dict[int, str]
        Mapping from token index to word (e.g. ``Tokenizer.index_word``).

    Returns
    -------
    keras.layers.Embedding
        Non-trainable layer with ``len(index_word) + 1`` rows. Row ``i`` holds
        the pretrained vector of ``index_word[i]``; rows that are never
        assigned (index 0 when the mapping starts at 1, and any word missing
        from the pretrained vocabulary) stay all-zero.

    Warns
    -----
    UserWarning
        If one or more words have no pretrained vector.
    """
    # One extra row because the token indices in `index_word` start at 1.
    vocab_len = len(index_word) + 1
    # Read the dimensionality from the model itself rather than probing a
    # hard-coded word that might not exist in another embedding file.
    emb_dim = model_emb.vectors.shape[1]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    missing = []
    for index, word in index_word.items():
        try:
            emb_matrix[index, :] = model_emb.get_vector(word)
        except KeyError:
            # Unknown to the pretrained model: leave the zero row in place.
            missing.append(word)

    if missing:
        warnings.warn(
            "No pretrained vector for {} word(s), using zeros: {}".format(
                len(missing), ", ".join(sorted(missing))
            )
        )

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    # The layer must be built before its weights can be overwritten.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [16]:
# Symbolic model input: one padded sequence of token indices per sample.
# NOTE(review): this rebinds `inputs`, which until this cell held the list of
# equation strings returned by load_equations(); re-running the tokenizer
# cells after this one would no longer see the text data. Consider a
# distinct name for the Keras tensor.
inputs = Input(shape=(inputs_pad.shape[1],))

In [17]:
# Build the frozen embedding layer from the pretrained vectors and the tokenizer vocabulary.
embedding_layer=pretrained_embedding_layer(model_emb, index_word)

In [18]:
# Map each token index of the input sequence to its pretrained vector.
embeddings=embedding_layer(inputs)

In [21]:
# First bidirectional LSTM (20 units per direction); return_sequences=True keeps
# the per-timestep outputs so the next recurrent layer can consume them.
x=Bidirectional(LSTM(20, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True))(embeddings)

In [22]:
# Second bidirectional LSTM (200 units per direction); return_sequences=False
# collapses the sequence into a single vector per sample.
x=Bidirectional(LSTM(200, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=False))(x)

In [23]:
# Single linear output unit: the network regresses the numeric result of the equation.
x=Dense(1,activation='linear')(x)

In [24]:
# Assemble the functional model from the input tensor to the regression output.
model = Model(inputs=inputs, outputs=x)

In [25]:
# Regression setup: the target is the integer value of the equation, so track
# mean absolute error. Classification 'accuracy' is not meaningful with an MSE
# loss on a linear output. The keyword arguments that were passed as None are
# compile()'s defaults and are therefore omitted.
model.compile(optimizer='rmsprop', loss='MSE', metrics=['mae'])

In [26]:
target.shape, inputs_pad.shape

((1000,), (1000, 13))

In [27]:
# Train on the full padded data set for 50 epochs with mini-batches of 32.
# NOTE(review): no validation data is passed, so only training-set figures
# are reported and over-fitting cannot be detected from this log.
model.fit(x=inputs_pad, y=target, batch_size=32, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f4b02ab24a8>

In [None]:
# Display a layer-by-layer summary of the assembled model.
model.summary()