In [1]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import pandas as pd
np.random.seed(1)

Using TensorFlow backend.


In [2]:
# Load the ';'-separated, header-less dataset file.
# Alternative files used during experiments: '../../d0.csv' and '../../dataset.csv'.
bd = pd.read_csv(
    '../../data_600.csv',
    delimiter=';',
    header=None,
)
# Column 0 is used as the model input, column 1 as the target tag.
X_train, Y_train = bd[0], bd[1]

In [3]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by the
    components of its vector. Blank lines are skipped.

    Args:
        glove_file: path to the embeddings text file (decoded as UTF-8).

    Returns:
        A tuple (words_to_index, index_to_words, word_to_vec_map):
        - words_to_index: dict mapping each word to an integer index. Indices
          start at 1 and follow the sorted order of the words, so index 0 is
          never assigned to a word.
        - index_to_words: the inverse mapping, index -> word.
        - word_to_vec_map: dict mapping each word to a float64 numpy array.
    """
    word_to_vec_map = {}
    # An explicit encoding keeps the result independent of the platform's
    # default locale (non-ASCII tokens would otherwise fail to decode there).
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line (e.g. at the end of the file) carries no word.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Start at 1 so that 0 stays free for padding positions.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [4]:
# Vocabulary + GloVe embeddings.
# The vocabulary holds about 400,000 words.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../../glove.embeddings.txt')

# Y_train_unique: sorted array of the distinct tags.
# Y_train_converted_to_numbers: for every sample, the position of its tag in
# Y_train_unique.
# np.unique produces both at once; it replaces a per-sample np.where scan whose
# int(<1-element array>) conversion is deprecated in recent NumPy releases.
Y_train_unique, Y_train_converted_to_numbers = np.unique(
    np.asarray(Y_train), return_inverse=True
)
Y_train_converted_to_numbers = Y_train_converted_to_numbers.astype(int)


In [5]:
# Sanity check: show the embedding vector stored for the word 'hello'.
hello_vector = word_to_vec_map['hello']
print(hello_vector)

[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]


In [6]:
# Show the sorted distinct tags, then how many there are (one item per line).
print(Y_train_unique, len(Y_train_unique), sep='\n')

['accretion-discs' 'asteroids' 'astronomy' 'astrophotography'
 'astrophysics' 'atmosphere' 'callisto' 'cloud' 'comets' 'coordinate'
 'core' 'cosmology' 'distances' 'dust' 'earth' 'eclipse' 'exoplanet'
 'expansion' 'flare' 'formation' 'galaxy' 'gravity' 'hole' 'horizon'
 'impact' 'inclination' 'intergalactic' 'interstellar' 'jupiter' 'life'
 'light' 'mars' 'matter' 'meteor' 'meteorite' 'milky' 'moon' 'naming'
 'navigation' 'nebula' 'neptune' 'neutrinos' 'neutron' 'observation'
 'observatory' 'observing' 'orbit' 'orbital' 'photography' 'planet'
 'planetary' 'pluto' 'pulsar' 'redshift' 'relativity' 'rotation'
 'satellites' 'saturn' 'size' 'solar' 'space' 'spectra' 'spectroscopy'
 'star' 'stellar' 'sun' 'supernova' 'telescope' 'temperature'
 'terminology' 'theory' 'time' 'titan' 'units' 'universe' 'venus' 'water'
 'waves']
78


In [7]:
# Compare the numeric class index with the original tag for two samples:
# the number is the position of the tag in the sorted list of distinct tags.
for sample_position in (5, 37):
    print(Y_train_converted_to_numbers[sample_position])
    print(Y_train[sample_position])

36
moon
22
hole


In [8]:
def sentences_to_indices(X, word_to_index, max_len, unknown_index=1):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Args:
        X: array of sentences (strings), indexable by position, of size m.
        word_to_index: dict mapping each known word to its vocabulary index.
        max_len: number of columns of the result. Sentences with more words
            are truncated to their first max_len words.
        unknown_index: index written for words missing from word_to_index.
            Defaults to 1. NOTE(review): when the mapping comes from
            read_glove_vecs, index 1 belongs to a real word (the first one in
            sorted order), not to a dedicated unknown token - confirm that
            this is intended.

    Returns:
        A float numpy array of shape (m, max_len). Row i holds the indices of
        the lower-cased words of sentence i; unused trailing positions are 0.
    """
    m = X.shape[0]

    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Truncate: writing past column max_len - 1 would raise IndexError.
        sentence_words = X[i].lower().split()[:max_len]

        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index.get(w, unknown_index)

    return X_indices

In [9]:
# Demo: turn a few short sentences (the longest has 5 words) into padded index rows.
demo_sentences = [
    "hot sun",
    "funny lol",
    "lets play football",
    "food is ready for you",
]
X1 = np.array(demo_sentences)
X1_indices = sentences_to_indices(X1, word_to_index, max_len=5)
for label, value in (("X1 =", X1), ("X1_indices =", X1_indices)):
    print(label, value)


X1 = ['hot sun' 'funny lol' 'lets play football' 'food is ready for you']
X1_indices = [[182186. 347345.      0.      0.      0.]
 [155345. 225122.      0.      0.      0.]
 [220930. 286375. 151266.      0.      0.]
 [151204. 192973. 302254. 151349. 394475.]]


In [10]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a Keras Embedding layer initialised with pretrained word vectors.

    Args:
        word_to_vec_map: dict mapping each word to its vector (1-D numpy array).
        word_to_index: dict mapping each word to its row index in the
            embedding matrix.

    Returns:
        A built Keras Embedding layer whose weight matrix has
        len(word_to_index) + 1 rows; row word_to_index[w] holds the vector of
        w and rows without a word (such as row 0) stay at zero.

    Raises:
        ValueError: if word_to_vec_map is empty.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty: cannot infer the embedding dimension")

    # +1 because word indices start at 1 and row 0 must exist for padding.
    vocab_len = len(word_to_index) + 1
    # Read the dimension from any stored vector instead of a hard-coded probe
    # word, which raised KeyError for vocabularies that do not contain it.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # NOTE(review): the layer is created with Keras defaults; pass
    # trainable=False here if the pretrained vectors are meant to stay frozen.
    embedding_layer = Embedding(vocab_len, emb_dim)

    # The layer must be built before its weights can be replaced.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [11]:
# Sanity check of the pretrained embedding layer: print one entry of the weight
# matrix and the number of weight arrays held by the layer.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
embedding_weights = embedding_layer.get_weights()
# The label now names the entry that is actually printed (it said [0][1][3]).
print("weights[0][7][5] =", embedding_weights[0][7][5])
print(len(embedding_weights))

weights[0][1][3] = -0.74187
1


In [12]:

def Tagger_LSTM(input_shape, word_to_vec_map, word_to_index, num_classes=None):
    """Build the stacked-LSTM tag classifier.

    Args:
        input_shape: shape of one input sample, (max_len,), where max_len is
            the number of words of the longest sentence.
        word_to_vec_map: dict mapping each word to its GloVe vector.
        word_to_index: dict mapping each word to its index in the vocabulary.
        num_classes: number of output classes. Defaults to the number of
            distinct tags in the notebook-level Y_train_unique array.

    Returns:
        An uncompiled Keras Model mapping a batch of index sequences to a
        probability distribution over the classes.
    """
    if num_classes is None:
        num_classes = Y_train_unique.shape[0]

    sentence_indices = Input(shape=input_shape, dtype=np.int32)

    # Look up the pretrained embedding of every word index.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # Four sequence-returning LSTM layers, with dropout after every second one.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.5)(X)

    X = LSTM(128, return_sequences=True)(X)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.5)(X)

    # The last LSTM returns only its final output, one vector per sentence.
    X = LSTM(128)(X)

    # Softmax must be applied exactly once. The Dense layer used to carry its
    # own softmax on top of the Activation below; a softmax of probabilities is
    # nearly uniform, so the model could never output a confident prediction.
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    model = Model(sentence_indices, X)

    return model


In [13]:

# Size, in number of words, of the longest sentence.
# The count is taken over every sentence: max(X_train, key=len) selects the
# longest sentence by characters, which is not necessarily the one with the
# most words, and a too-small maxLen overflows the index matrix later on.
maxLen = max(len(sentence.split()) for sentence in X_train)

model = Tagger_LSTM((maxLen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 456)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 456, 50)           20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 456, 128)          91648     
_________________________________________________________________
lstm_2 (LSTM)                (None, 456, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 456, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 456, 128)          131584    
_________________________________________________________________
lstm_4 (LSTM)                (None, 456, 128)          131584    
__________

In [14]:
# Print maxLen, then the word count of every training sentence, one per line.
print('maxLen = ',maxLen,'tailles de toutes les phrases', '\n------------------------------------')
word_counts = [len(sentence.split()) for sentence in X_train]
for count in word_counts:
    print(count)

maxLen =  456 tailles de toutes les phrases 
------------------------------------
10
20
49
46
146
84
59
32
16
115
33
17
23
184
46
30
150
25
42
25
104
38
42
44
86
15
63
77
15
14
55
42
19
47
74
21
20
85
141
35
34
43
76
44
24
42
55
51
140
53
25
84
150
23
24
130
36
63
9
57
84
31
50
40
20
24
165
9
48
19
7
27
31
22
17
88
20
74
36
19
27
17
25
174
53
41
18
21
37
46
61
90
76
98
83
54
48
34
21
66
55
146
44
52
44
21
65
199
33
41
19
23
23
67
84
27
60
141
52
30
42
26
44
52
26
86
20
57
127
26
15
32
19
47
61
28
29
18
40
89
15
65
25
79
54
32
19
78
23
82
23
14
23
29
28
22
29
54
83
28
36
55
130
40
38
127
61
61
82
18
22
86
52
59
40
80
70
33
133
80
138
39
79
30
83
19
27
27
20
35
42
12
32
11
61
32
22
19
7
34
51
16
11
13
25
34
55
21
22
16
79
62
88
68
112
19
84
55
12
11
34
21
21
50
65
282
38
41
34
24
63
35
196
56
38
101
53
60
64
162
16
62
63
38
22
21
13
51
40
36
75
29
14
44
51
54
23
32
42
47
23
45
28
53
278
28
24
45
23
38
15
65
18
13
21
29
27
15
52
34
77
87
18
11
44
69
57
28
21
55
43
47
82
28
271
29
34
103
1

In [15]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (the targets are one-hot rows), accuracy reported as metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [16]:
def convert_to_one_hot(Y, C):
    """Return the one-hot encoding of the integer class ids in Y.

    Args:
        Y: numpy array of integer class ids; it is flattened first.
        C: number of classes, i.e. the width of each one-hot row.

    Returns:
        An integer numpy array of shape (Y.size, C) whose row i has a single 1
        in column Y.reshape(-1)[i].
    """
    class_ids = Y.reshape(-1)
    identity = np.eye(C, dtype=int)
    # Row k of the identity matrix is exactly the one-hot vector of class k.
    return identity[class_ids]

In [17]:
# Confirm that 'sun' is part of the vocabulary, then show its index as the
# cell's output value.
sun_is_known = 'sun' in word_to_index
if sun_is_known:
    print('ok')
word_to_index['sun']

ok


347345

In [18]:
# Encode the training data: sentences become a padded index matrix and
# tags become one-hot rows; then print the number of classes.
num_classes = Y_train_unique.shape[0]
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(Y_train_converted_to_numbers, C=num_classes)
print(num_classes)

78


In [None]:
# Train for 5 epochs on shuffled mini-batches of 64 samples.
model.fit(
    X_train_indices,
    Y_train_oh,
    epochs=5,
    batch_size=64,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-102b25e087cf>", line 1, in <module>
    model.fit(X_train_indices, Y_train_oh, epochs = 5, batch_size = 64, shuffle=True)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1705, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1235, in _fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 2478, in __call__
    **self.session_kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 905, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1140, in _run
    feed_dict_tensor, options, run_metadat

KeyboardInterrupt: 

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 421, in execute_request
    self._abort_queues()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 636, in _abort_queues
    self._abort_queue(stream)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 661, in _abort_queue
    pol

In [None]:

# The model is evaluated on the very data it was fitted on, so the figure
# below is the training accuracy; it says nothing about unseen sentences.
loss, acc = model.evaluate(X_train_indices, Y_train_oh)
print()
print("Train accuracy = ", acc)

In [None]:
# Display the sorted array of distinct tags; a class index predicted by the
# model is translated back to a tag by indexing into this array.
Y_train_unique

In [None]:
# Predict the tag of a single query sentence.
# Other queries tried: 'why the moon has phases',
# 'why the atmosphere is essential for life'.
x = np.array(['sun rise'])
indices = sentences_to_indices(x, word_to_index, maxLen)
# The index of the highest probability selects the tag in the sorted tag array.
predicted_class = np.argmax(model.predict(indices))
print(x[0] +' -convert_to_one_hot-- '+  Y_train_unique[predicted_class])