### RNN for atis -- fit

### prepare data

In [101]:
from Teemo.examples.atis import load_data
import numpy as np
from assertpy import assert_that

In [102]:
train_set, valid_set, dicts = load_data.atis()
w2idx, labels2idx = dicts['words2idx'], dicts['labels2idx']
train_x, _, train_y = train_set
valid_x, _, valid_y = valid_set

idx2w = {w2idx[k]:k for k in w2idx}
idx2labels = {labels2idx[k]:k for k in labels2idx}
train_words = [list(map(lambda x: idx2w[x], w)) for w in train_x]
train_labels = [list(map(lambda x: idx2labels[x], w)) for w in train_y]
valid_words = [list(map(lambda x: idx2w[x], w)) for w in valid_x]
valid_labels = [list(map(lambda x: idx2labels[x], w)) for w in valid_y]

In [103]:
def get_max_min_idx(list_of_seq):
    min_seq = [np.min(x) for x in list_of_seq]
    max_seq = [np.max(x) for x in list_of_seq]
    print ('max idx: {0}, min idx: {1}'.format(np.max(max_seq), np.min(min_seq)))
    return np.max(max_seq), np.min(min_seq)

def get_max_seq_length(list_of_seq):
    len_seq = [len(x) for x in list_of_seq]
    max_len = np.max(len_seq)
    print ('max_seq_length: {0}'.format(max_len))
    return max_len

def mask_zero_add_1(list_of_seq):
    new_list_of_seq = []
    for seq in list_of_seq:
        seq += 1
        new_list_of_seq.append(seq)
    return new_list_of_seq

def convert_seq_to_matrix(list_of_seq, max_seq_length=50):
    new_seq = []
    for seq in list_of_seq:
        assert_that(len(seq)).is_less_than(max_seq_length)
        seq = list(seq) + [0] * (max_seq_length-len(seq))
        new_seq.append(seq)        
    return np.vstack(new_seq)

get_max_min_idx(train_x)
get_max_min_idx(valid_x)
get_max_seq_length(train_x)
get_max_seq_length(valid_x)
get_max_min_idx(train_y)
get_max_min_idx(valid_y)

max idx: 571, min idx: 0
max idx: 570, min idx: 0
max_seq_length: 46
max_seq_length: 30
max idx: 126, min idx: 0
max idx: 126, min idx: 0


(126, 0)

In [105]:
max_seq_length = 150

train_x_new = mask_zero_add_1(train_x) ## add 1 to every idx, as mask_zero==True
valid_x_new = mask_zero_add_1(valid_x)
get_max_min_idx(train_x_new)
get_max_min_idx(valid_x_new)

train_x_new = convert_seq_to_matrix(train_x_new)
train_y_new = convert_seq_to_matrix(train_y)
valid_x_new = convert_seq_to_matrix(valid_x_new)
valid_y_new = convert_seq_to_matrix(valid_y)

train_y_new = np.eye(n_classes)[train_y_new]
valid_y_new = np.eye(n_classes)[valid_y_new]

print (train_x_new.shape, train_y_new.shape)
print (valid_x_new.shape, valid_y_new.shape)

max idx: 572, min idx: 1
max idx: 571, min idx: 1
((4978, 50), (4978, 50, 127))
((893, 50), (893, 50, 127))


### build model

In [106]:
voca_size = len(idx2w) + 3
n_classes = len(idx2labels)
word_vec_dim = 100
hidden_dim = 200
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D

In [107]:
def build_model(voca_size, word_vec_dim, hidden_dim, n_classes):
    model = Sequential()
    model.add(Embedding(output_dim=word_vec_dim, input_dim=voca_size, mask_zero=True))
    model.add(Dropout(0.25))
    model.add(SimpleRNN(output_dim=hidden_dim, return_sequences=True))
    model.add(TimeDistributed(Dense(output_dim=n_classes, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    print ('model input_shape (nb_samples, seq_length): {0}'.format(model.input_shape))
    print ('model output_shape (nb_samples, seq_length, output_dim): {0}'.format(model.output_shape))
    return model

In [108]:
model = build_model(voca_size, word_vec_dim, hidden_dim, n_classes)

model input_shape (nb_samples, seq_length): (None, None)
model output_shape (nb_samples, seq_length, output_dim): (None, None, 127)


In [109]:
def fit_model(model, train_x, train_y):
    model.fit(train_x, train_y)

fit_model(model, train_x_new, train_y_new)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [110]:
pred = model.predict(valid_x_new)
pred = np.argmax(pred, axis=-1)
pred_values_list = [x[:len(y)] for x, y in zip(pred, valid_y)]
print ([len(x) for x in pred_values_list[:10]])
print ([len(x) for x in valid_y[:10]])

[19, 16, 13, 16, 17, 16, 11, 17, 17, 13]
[19, 16, 13, 16, 17, 16, 11, 17, 17, 13]


In [111]:
def conlleval_fun(words_list, pred_values_list, true_values_list, idx2labels):
    pred_labels_list = [list(map(lambda x: idx2labels[x], seq)) for seq in pred_values_list]
    true_labels_list = [list(map(lambda x: idx2labels[x], seq)) for seq in true_values_list]
    from Teemo.examples.atis.conlleval import conlleval
    con_dict = conlleval(pred_labels_list, true_labels_list, words_list, 'measure.txt')
    print ('Precision={}, Recall = {}, F1 = {}'.format(con_dict['r'], con_dict['p'], con_dict['f1']))

conlleval_fun(valid_words, pred_values_list, valid_y, idx2labels)

Precision=92.28, Recall = 91.92, F1 = 92.1


In [117]:
y_true = np.hstack(valid_y)
y_pred = np.hstack(pred_values_list)
y_true = np.eye(n_classes)[y_true]
y_pred = np.eye(n_classes)[y_pred]
from Teemo.algorithm.utils.evaluations import classification_evaluate
from Teemo.algorithm.utils.report_funcs import classification_report
res = classification_evaluate(y_pred, y_true)
print (classification_report(res))

            precision   recall      f_measure   support     
class 0     0.8421      0.4848      0.6154      33          
class 1     1.0         0.5588      0.717       34          
class 2     0.9118      0.9208      0.9163      101         
class 3     0.6667      0.4444      0.5333      9           
class 4     0.7778      0.3333      0.4667      21          
class 5     0.0         0.0         0.0         2           
class 6     0.75        0.5455      0.6316      11          
class 7     0.625       0.8333      0.7143      6           
class 8     1.0         0.6667      0.8         6           
class 9     0.0         0.0         0.0         0           
class 10    0.8889      1.0         0.9412      8           
class 11    0.0         0.0         0.0         0           
class 12    0.8571      1.0         0.9231      6           
class 13    1.0         1.0         1.0         8           
class 14    0.9429      0.9706      0.9565      34          
class 15    0.871       