In [1]:
# Allow recursive reloading: expose IPython's deepreload as `builtins.reload`
# so that reloading `app` also reloads the submodules it imports
# (see the "Reloading app.ML..." lines printed by this cell).
import builtins
from IPython.lib import deepreload
builtins.reload = deepreload.reload
import app
# Reload `app` so that edits made to the package since it was first imported
# are picked up without restarting the kernel.
builtins.reload(app)

Reloading app.ML
Reloading app.ML.model
Reloading app.ML.in_out


<module 'app' from 'app/__init__.py'>

## Import useful libraries

In [2]:
import numpy as np
import pandas as pd
import pickle
from keras.models import Model
from keras.layers import Input, LSTM, Dense

Using TensorFlow backend.


## Import the data

We get the data from http://www.manythings.org/anki/

In [3]:
# Number of sentence pairs kept from the top of the file.
N_SENTENCE_PAIRS = 10000

# `nrows` stops parsing after the rows we actually use, instead of reading the
# whole file and slicing it afterwards. It also makes `donnees` a standalone
# frame rather than a slice of a larger one, which is safer for the column
# assignments performed in later cells.
donnees = pd.read_csv(
    'Data/fra-eng/fra.txt',
    sep='\t',
    header=None,
    names=['anglais', 'français'],
    nrows=N_SENTENCE_PAIRS,
)
donnees.head()

Unnamed: 0,anglais,français
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Who?,Qui ?


In [4]:
# Show the last five sentence pairs that were kept.
donnees.tail(n=5)

Unnamed: 0,anglais,français
9995,Don't overdo it.,N'en fais pas trop.
9996,Don't play dumb!,Ne fais pas l'imbécile.
9997,Don't remind me.,Ne me le rappelle pas !
9998,Don't remind me.,Ne me le rappelez pas !
9999,Don't resist us.,Ne nous résiste pas !


## Preprocessing

In [5]:
def add_sequence_markers(sentence):
    """Wrap a French sentence in a leading tab and a trailing newline.

    The tab is the first character of every decoder input and the newline is
    the last one. A sentence that already carries both markers is returned
    unchanged, so re-running this cell does not stack extra markers.
    """
    if sentence.startswith('\t') and sentence.endswith('\n'):
        return sentence
    return '\t' + sentence + '\n'


donnees['français'] = donnees["français"].apply(add_sequence_markers)

The next step is to transform the sequences into NumPy arrays.<br>
We will use character-level one-hot encoding.<br>
We will build three types of vectors:
     - Encoder input
     - Decoder input
     - Decoder labels (the decoder input shifted one time step ahead)

In [6]:
class Vocable():
    """Character-level vocabulary used to one-hot encode and decode sentences.

    After `train`, every distinct character of the corpus is mapped to an
    integer index (characters are indexed in sorted order) and the length of
    the longest sentence is remembered. Sentences are encoded as arrays of
    shape (max_sentence_size, voc_size).
    """

    def __init__(self):
        self._char_to_int = dict()   # character -> index
        self._int_to_char = []       # index -> character
        self.voc_size = 0            # number of distinct characters
        self.max_sentence_size = 0   # length of the longest training sentence

    def train(self, L):
        """Build the vocabulary from an iterable of sentences.

        Args:
            L: iterable of strings (e.g. a list or a pandas Series).
        """
        all_char = set()
        longest = 0
        for s in L:
            all_char.update(s)
            longest = max(longest, len(s))

        chars = sorted(all_char)
        self.max_sentence_size = longest
        self.voc_size = len(chars)
        self._int_to_char = chars
        self._char_to_int = {char: index for index, char in enumerate(chars)}

    def getChar(self, i):
        """Return the character stored at index `i`."""
        return self._int_to_char[i]

    def getInt(self, c):
        """Return the index of character `c` (KeyError if `c` is unknown)."""
        return self._char_to_int[c]

    def vectoriseSetence(self, sentence):
        """One-hot encode `sentence`.

        Returns:
            Array of shape (max_sentence_size, voc_size). Row `i` has a 1 in
            the column of the i-th character; rows past the end of the
            sentence stay all-zero (padding).

        Raises:
            IndexError: if the sentence is longer than `max_sentence_size`.
            KeyError: if the sentence contains a character not seen by `train`.
        """
        if len(sentence) > self.max_sentence_size:
            # Same exception type NumPy would raise when writing past the last
            # row, but raised up front with an explicit message.
            raise IndexError(
                "sentence of length {} exceeds max_sentence_size {}".format(
                    len(sentence), self.max_sentence_size))
        res = np.zeros((self.max_sentence_size, self.voc_size))
        for position, char in enumerate(sentence):
            res[position, self.getInt(char)] = 1
        return res

    # Correctly spelled alias; the original name is kept for existing callers.
    vectoriseSentence = vectoriseSetence

    def decodeVec(self, vec, skip_padding=False):
        """Decode a 2-D array of per-character scores back into a string.

        Each row is decoded to the character with the highest value.

        Args:
            vec: array whose rows are one-hot (or score) vectors.
            skip_padding: when False (default, historical behaviour) an
                all-zero row decodes to the character at index 0, because
                argmax of a zero row is 0. When True, all-zero rows are
                ignored so padded sentences decode without trailing filler.
        """
        decoded = []
        for i in range(vec.shape[0]):
            row = vec[i]
            if skip_padding and not np.any(row):
                continue
            decoded.append(self.getChar(np.argmax(row)))
        return "".join(decoded)
    
        

In [7]:
# Learn the character set and the longest sentence length of the English column.
eng_voc = Vocable()
eng_voc.train(L=donnees['anglais'])

In [8]:
# Sanity check: one-hot encode a short sentence and display the matrix.
v = eng_voc.vectoriseSetence(sentence="Hey")
v

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Round-trip check. All-zero padding rows decode to the character at index 0,
# which is why the result carries trailing filler characters.
eng_voc.decodeVec(vec=v)

'Hey             '

In [10]:
# Learn the character set and the longest sentence length of the French column.
fra_voc = Vocable()
fra_voc.train(L=donnees['français'])

In [11]:
def shift_one_step_ahead(vec):
    """Drop the first row of `vec` and append an all-zero row (same shape)."""
    padding_row = np.zeros_like(vec[0]).reshape((1, -1))
    return np.vstack((vec[1:], padding_row))


# One-hot encode both languages; the decoder labels are the decoder inputs
# shifted by one character.
english_sentences = donnees["anglais"]
french_sentences = donnees["français"]
donnees['input_encoder'] = english_sentences.apply(eng_voc.vectoriseSetence)
donnees['input_decoder'] = french_sentences.apply(fra_voc.vectoriseSetence)
donnees['output_decoder'] = donnees['input_decoder'].apply(shift_one_step_ahead)

In [12]:
# Stack the per-sentence 2-D matrices of each column into one 3-D array.
input_encoder = np.array(donnees['input_encoder'].tolist())
input_decoder = np.array(donnees['input_decoder'].tolist())
output_decoder = np.array(donnees['output_decoder'].tolist())

In [13]:
# Persist the three arrays in one compressed archive, keyed by name.
arrays_to_save = {
    'input_encoder': input_encoder,
    'input_decoder': input_decoder,
    'output_decoder': output_decoder,
}
np.savez_compressed('Data/fra-eng/processed_data', **arrays_to_save)

In [14]:
# Pickle both vocabularies next to the application code.
for pickle_path, vocabulary in (("app/ML/dico_eng.pickle", eng_voc),
                                ("app/ML/dico_fra.pickle", fra_voc)):
    with open(pickle_path, "wb") as f:
        pickle.dump(vocabulary, f)

## Load the data

In [15]:
# Reload the vocabularies written by the saving cell of this notebook.
# pickle stores classes by reference, so the Vocable class must already be
# defined in this session. Only unpickle files you produced yourself:
# pickle.load can execute arbitrary code.
with open("app/ML/dico_eng.pickle","rb") as f:
    eng_voc = pickle.load(f)
with open("app/ML/dico_fra.pickle","rb") as f:
    fra_voc = pickle.load(f)

# np.load on an .npz archive returns a lazily-read NpzFile that holds the file
# open; the context manager closes it once the arrays have been extracted.
with np.load('Data/fra-eng/processed_data.npz') as loaded:
    input_encoder = loaded['input_encoder']
    input_decoder = loaded['input_decoder']
    output_decoder = loaded['output_decoder']

In [17]:
# Check the first label sequence. All-zero padding rows decode to the
# character at index 0, hence the run of repeated characters after the sentence.
fra_voc.decodeVec(vec=output_decoder[0])

'Va !\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t'

## Build the model