In [1]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import pandas as pd
np.random.seed(1)

Using TensorFlow backend.


In [2]:
# bd = pd.read_csv('../../d0.csv', delimiter=';', header=None)
# bd = pd.read_csv('../../data_600.csv', delimiter=';', header=None)
bd = pd.read_csv('../../dataset.csv', delimiter=';', header=None)
X_train = bd[0]
Y_train = bd[1]

In [3]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [4]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../../glove.6B.50d.txt')
#list of all tags (unique and sorted)
Y_train_unique = np.sort(Y_train.unique())
#Y_train represented by the index of each tag in the unique sorted list
Y_train_converted_to_numbers = np.zeros(shape=(Y_train.shape[0],)).astype(int)
for index, element in enumerate(Y_train):
    Y_train_converted_to_numbers[index] = int(np.where(Y_train_unique == element)[0])


In [5]:
print(word_to_vec_map['hello'])

[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]


In [6]:
print(Y_train_unique)
print(len(Y_train_unique))

['9th-planet' 'abiogenesis' 'absolute-magnitude' 'accretion'
 'accretion-discs' 'adaptive-optics' 'age' 'algorithm' 'alignment'
 'amateur-observing' 'angular-resolution' 'antimatter'
 'apparent-magnitude' 'artificial-satellite' 'ascension' 'asteroid-belt'
 'asteroids' 'astrobiology' 'astrochemistry' 'astrometry'
 'astrophotography' 'astrophysics' 'astropy' 'atmosphere' 'aurora'
 'azimuth' 'big-bang-theory' 'binary-star' 'binoculars' 'biosignatures'
 'black-hole' 'brown-dwarf' 'callisto' 'celestial-mechanics' 'cepheids'
 'ceres' 'chemistry' 'cmb' 'comets' 'constellations' 'coordinate' 'core'
 'coronal-mass-ejection' 'cosmic-ray' 'cosmological-inflation' 'cosmology'
 'crater' 'dark-energy' 'dark-matter' 'data-analysis' 'date-time'
 'declination' 'deep-sky-observing' 'disk' 'distances'
 'dobsonian-telescope' 'doppler-effect' 'dust' 'dwarf-planets'
 'dwarf-star' 'early-universe' 'earth' 'earth-like-planet'
 'eccentric-orbit' 'eclipse' 'ecliptic' 'elemental-abundances'
 'ephemerides' 'ephem

In [7]:
#switch between index and tags
print(Y_train_converted_to_numbers[5])
print(Y_train[5])
print(Y_train_converted_to_numbers[37])
print(Y_train[37])

136
moon
98
hole


In [8]:
# GRADED FUNCTION: sentences_to_indices
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in
    the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure
    4).
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no
    longer than this.
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m,
    max_len)
    """
    m = X.shape[0]
    # number of training examples
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # loop over training examples
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in word_to_index.keys():
                X_indices[i, j] = word_to_index[w]
            else:
                X_indices[i, j] = 1; # 1 = <unknown>
            # Increment j to j + 1
            j = j + 1
    ### END CODE HERE ###
    return X_indices

In [9]:
X1 = np.array(["hot sun", "funny lol", "lets play football", "food is ready for you"])
X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =", X1_indices)


X1 = ['hot sun' 'funny lol' 'lets play football' 'food is ready for you']
X1_indices = [[182186. 347345.      0.      0.      0.]
 [155345. 225122.      0.      0.      0.]
 [220930. 286375. 151266.      0.      0.]
 [151204. 192973. 302254. 151349. 394475.]]


In [10]:
# GRADED FUNCTION: pretrained_embedding_layer
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001
    words)
    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    vocab_len = len(word_to_index) + 1 # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0] # define dimensionality of your GloVe word vectors (= 50)
    
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False.
    embedding_layer = Embedding(vocab_len, emb_dim)
    ### END CODE HERE ###
    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [11]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][7][5])
print(len(embedding_layer.get_weights()))

weights[0][1][3] = -0.74187
1


In [12]:
# GRADED FUNCTION: Emojify_V2
def Tagger_LSTM(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
    Returns:
    model -- a model instance in Keras
    """
    
    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape=input_shape, dtype=np.int32)
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)
    
    
#     Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
#     Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
#     Add dropout with a probability of 0.5
#     X = Dropout(0.5)(X)
#     Propagate X trough another LSTM layer with 128-dimensional hidden state
#     Be careful, the returned output should be a single hidden state, not a batch of sequences.


    X = LSTM(128, return_sequences=True)(X)
    #Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)

#     X = LSTM(128, return_sequences=True)(X)
#     X = Dropout(0.5)(X)
#     X = LSTM(128, return_sequences=True)(X)
#     X = Dropout(0.5)(X)
    X = LSTM(128)(X)
#     X = Dropout(0.5)(X)

    #Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.

    X = Dense(Y_train_unique.shape[0], activation='softmax')(X)
    #Add a softmax activation
    X = Activation('softmax')(X)


    # Create Model instance which converts sentence_indices into X.
    model = Model(sentence_indices, X)
    ### END CODE HERE ###
    return model


In [13]:
maxLen = len(max(X_train, key=len).split())
model = Tagger_LSTM((maxLen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2997)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 2997, 50)          20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 2997, 128)         91648     
_________________________________________________________________
lstm_2 (LSTM)                (None, 2997, 128)         131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 2997, 128)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 269)               34701     
__________

In [14]:
print(maxLen, '\n------------------------------------')
for x in X_train:
    print(len(x.split()))

2997 
------------------------------------
10
20
49
46
146
84
59
32
16
115
33
17
23
184
43
29
150
25
42
25
102
38
42
44
86
15
63
77
15
14
55
42
19
47
74
20
20
85
141
35
34
43
76
41
24
42
55
51
143
53
25
84
150
23
24
130
36
63
9
57
84
31
139
50
40
20
24
159
9
48
19
7
27
31
23
17
88
20
74
36
19
27
17
25
177
53
41
18
21
37
46
61
90
76
98
83
54
48
34
21
67
55
146
43
52
44
21
65
202
33
41
19
23
23
67
84
27
54
141
52
30
42
26
44
52
26
86
20
57
129
26
15
33
19
47
61
28
29
18
40
89
15
65
25
79
54
32
19
78
23
82
23
14
23
29
28
22
29
54
83
28
36
55
130
40
38
127
61
61
82
18
22
86
52
59
40
80
70
33
133
80
138
39
79
30
83
19
27
27
20
35
42
12
32
11
61
32
22
19
7
34
51
16
11
13
28
34
55
21
22
16
79
62
88
68
112
19
84
55
12
11
34
21
21
50
65
282
38
41
34
24
63
35
196
56
38
101
53
60
64
162
16
62
63
38
22
21
13
51
40
36
75
29
14
44
51
54
23
32
42
47
23
45
28
53
278
28
24
45
23
38
15
65
18
13
21
29
27
15
52
34
77
87
18
11
44
69
57
28
21
55
43
47
82
28
271
29
34
103
152
12
27
49
26
89
22
32
51
31
115
1

41
75
23
32
33
54
15
47
23
25
89
146
26
40
66
23
33
54
89
148
29
218
205
160
60
27
87
116
98
39
24
27
46
64
72
48
18
74
20
38
89
89
42
20
57
73
60
57
74
39
103
19
19
81
117
252
47
40
335
27
42
39
14
57
65
54
30
462
31
49
13
11
71
103
42
78
18
28
27
35
126
31
49
35
99
27
141
13
30
138
60
67
114
49
50
115
75
92
208
68
78
40
30
82
56
52
19
33
49
33
118
44
58
22
23
39
15
13
29
36
56
74
44
14
21
52
34
30
20
72
46
45
162
58
30
140
49
39
21
16
25
107
35
16
19
93
64
84
29
24
193
70
76
69
56
46
12
57
36
39
27
14
13
41
33
56
14
16
37
42
28
36
24
147
21
26
68
55
13
27
46
32
42
97
21
61
22
52
30
43
34
56
20
68
18
30
15
2997
190
22
91
128
121
58
166
104
39
48
73
20
38
11
15
49
22
50
53
141
38
57
72
46
47
13
71
27
19
55
80
29
82
43
72
32
39
136
16
23
19
12
61
27
54
63
53
28
54
18
18
46
92
45
67
76
71
48
97
34
21
85
49
87
22
20
11
35
22
168
20
33
46
59
19
44
80
50
54
91
64
39
52
81
15
20
90
126
58
17
162
52
58
52
37
25
41
47
17
126
56
46
15
65
23
24
28
62
22
31
25
23
78
44
65
39
39
49
68
68
73
45
9
1

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C, dtype=int)[Y.reshape(-1)]
    return Y

In [17]:
if 'sun' in word_to_index.keys():
    print('ok')
word_to_index['sun']

ok


347345

In [18]:
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(Y_train_converted_to_numbers, C = Y_train_unique.shape[0])
print(Y_train_unique.shape[0])

269


In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 10, batch_size = 64, shuffle=True)

Epoch 1/10


In [None]:

loss, acc = model.evaluate(X_train_indices, Y_train_oh)
print()
print("Test accuracy = ", acc)

In [None]:
Y_train_unique

In [None]:
# Change the sentence below to see your prediction. Make sure all the words are in the Glove embeddings.  
x1 = np.array(['sun rise'])
x2 = np.array(['why the moon has phases'])
x = np.array(['why the atmosphere is essential for life'])
indices = sentences_to_indices(x1, word_to_index, maxLen)
print(x1[0] +' --- '+  Y_train_unique[np.argmax(model.predict(indices))])