In [85]:
import TrainingDataExtractor as tde
from deep_dynet import support

In [86]:
from os import path
import sys
sys.path.append(path.abspath('./stanford_parser'))

from stanford_parser.parser import Parser

In [87]:
# Dep parsing test
parser = Parser()

dependencies = parser.parseToStanfordDependencies("Most death sentences are for drug @-@ related offenses .")

tupleResult = [(rel, gov.text, dep.text) for rel, gov, dep in dependencies.dependencies]
print tupleResult

[('amod', 'sentences', 'Most'), ('nn', 'sentences', 'death'), ('nsubj', 'are', 'sentences'), ('prep_for', 'are', 'drug'), ('dep', 'drug', '@'), ('dep', 'are', '@'), ('amod', 'offenses', 'related'), ('dep', '@', 'offenses')]


In [None]:
from os import listdir

def read_data(type, dataset=None):
    data = []
    mypath = 'resources/alignments/split/' + type
    print(mypath)
    if dataset is None:
        for f in listdir(mypath):
            mypath_f = mypath + "/" + f
            print(mypath_f)
            data += tde.generate_training_data(mypath_f, False)
    else:
        mypath_f = mypath + "/" + dataset
        print(mypath_f)
        data = tde.generate_training_data(mypath_f, verbose=False, withDependencies=True)
    return data

data = read_data('training', 'deft-p2-amr-r1-alignments-training-dfa.txt')

  0%|          | 0/6455 [00:00<?, ?it/s]

resources/alignments/split/training
resources/alignments/split/training/deft-p2-amr-r1-alignments-training-dfa.txt


 28%|██▊       | 1788/6455 [01:07<01:21, 57.17it/s]

In [5]:
len(data)

1412

In [6]:
data[0]

('date-entity',
 ['SH_date-entity'],
 '(d / date-entity :year 2002~e.0 :month 1~e.2 :day 5~e.4)\n',
 {})

In [7]:
len(data[0])

4

In [8]:
data[20][0]

'The organization has not been officially banned in country .'

In [9]:
data[20][1]

['DN',
 'SH_organization',
 'DN',
 'SH_-',
 'DN',
 'SH_official',
 'SH_ban-01',
 'RL_mod',
 'RL_polarity',
 'RL_ARG1',
 'DN',
 'SH_country',
 'RR_location',
 'DN']

In [10]:
vocab_acts = support.Vocab.from_list(['SH', 'RL', 'RR', 'DN', 'SW'])
action_sequence = support.oracle_actions_to_action_index(data[20][1], vocab_acts)

In [11]:
action_sequence

[action: DN label: None index: 3,
 action: SH label: organization index: 0,
 action: DN label: None index: 3,
 action: SH label: - index: 0,
 action: DN label: None index: 3,
 action: SH label: official index: 0,
 action: SH label: ban-01 index: 0,
 action: RL label: mod index: 1,
 action: RL label: polarity index: 1,
 action: RL label: ARG1 index: 1,
 action: DN label: None index: 3,
 action: SH label: country index: 0,
 action: RR label: location index: 2,
 action: DN label: None index: 3]

In [12]:
data[20][2]

'(b / ban-01~e.9 :polarity~e.6 -~e.6 \n      :ARG1 (o / organization :wiki - \n            :name (n / name :op1 "Al-Rashid"~e.1,3 :op2 "Trust"~e.4)) \n      :mod (o2 / official~e.8) \n      :location~e.10 (c / country :wiki "Pakistan" \n            :name (n2 / name :op1 "Pakistan"~e.11)))\n'

#### Obtain for each training example the following: The sentence, the sequence of action indices, the labels and the dependencies

In [13]:
sentences = [d[0] for d in data]

In [14]:
sentences

['date-entity',
 'country ( SA )',
 'university is based in city .',
 'organization .',
 'date-entity',
 'country ( KP )',
 "The organization tour of the isotope production laboratory will unlikely give a full picture of country 's suspected nuclear weapons ambitions .",
 'Delays have pushed back the finish until at least date-entity .',
 'date-entity',
 'country ( KR )',
 'A country group has announced a campaign backed by food scientists to promote the eating of dog meat .',
 'The game finals are to be co @-@ hosted by country and country this year .',
 'The anti @-@ country group stated that country culture is under attack .',
 'It was ahead of the game which starts date-entity .',
 'Activists contend many slaughterhouses in the countryside are not controlled .',
 'No similar move is planned for the game .',
 'The authorities say the practice is slowly disappearing in country .',
 'date-entity',
 'Religious extremism continues in country despite the banning of militant groups .',
 '

In [15]:
amrs = [d[2] for d in data]

In [16]:
amrs

['(d / date-entity :year 2002~e.0 :month 1~e.2 :day 5~e.4)\n',
 '(c / country :wiki "Saudi_Arabia" \n      :name (n / name :op1 "Saudi"~e.0 :op2 "Arabia"~e.1))\n',
 '(b / base-01~e.7 \n      :ARG1 (u / university :wiki - \n            :name (n / name :op1 "Naif"~e.0 :op2 "Arab"~e.1 :op3 "Academy"~e.2 :op4 "for"~e.3 :op5 "Security"~e.4 :op6 "Sciences"~e.5)) \n      :location~e.8 (c / city :wiki "Riyadh" \n            :name (n2 / name :op1 "Riyadh"~e.9)))\n',
 '(r / run-01~e.7 \n      :ARG0~e.8 (o / organization :wiki - \n            :name (n / name :op1 "Arab"~e.10 :op2 "Interior"~e.11 :op3 "Ministers\'"~e.12,13 :op4 "Council"~e.14)) \n      :ARG1 (u / university :wiki - \n            :name (n2 / name :op1 "Naif"~e.0 :op2 "Arab"~e.1 :op3 "Academy"~e.2 :op4 "for"~e.3 :op5 "Security"~e.4 :op6 "Sciences"~e.5)))\n',
 '(d / date-entity :year 2002~e.0 :month 1~e.2 :day 7~e.4)\n',
 '(c / country :wiki "North_Korea" \n      :name (n / name :op1 "North"~e.0 :op2 "Korea"~e.1))\n',
 '(l / likely-0

In [17]:
actions = [support.oracle_actions_to_action_index(d[1], vocab_acts) for d in data]

In [18]:
action_indices = [[a.index for a in actions_list] for actions_list in actions]

In [19]:
action_indices

[[0],
 [0, 3, 3, 3],
 [0, 3, 0, 1, 3, 0, 2, 3],
 [0, 3],
 [0],
 [0, 3, 3, 3],
 [3,
  0,
  0,
  1,
  3,
  3,
  0,
  0,
  1,
  0,
  1,
  2,
  3,
  0,
  0,
  3,
  0,
  0,
  1,
  3,
  0,
  3,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  2,
  2,
  3,
  4,
  1,
  2],
 [0, 3, 0, 1, 0, 2, 3, 0, 2, 0, 0, 3, 0, 2, 2, 2, 3],
 [0],
 [0, 3, 3, 3],
 [3,
  0,
  0,
  1,
  3,
  0,
  1,
  3,
  0,
  0,
  3,
  0,
  0,
  1,
  2,
  2,
  3,
  0,
  3,
  0,
  3,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  3],
 [3, 0, 0, 1, 3, 3, 3, 3, 3, 0, 1, 3, 0, 0, 1, 0, 2, 2, 0, 0, 1, 2, 3],
 [3, 0, 3, 0, 2, 0, 1, 0, 1, 3, 0, 0, 1, 3, 3, 0, 1, 2, 3],
 [0, 3, 0, 1, 3, 3, 0, 3, 0, 0, 2, 2, 2, 3],
 [0, 0, 1, 0, 0, 1, 3, 3, 0, 2, 3, 0, 0, 1, 1, 2, 3],
 [0, 0, 0, 1, 3, 0, 1, 1, 3, 3, 0, 2, 3],
 [3, 0, 0, 1, 3, 0, 3, 0, 0, 1, 1, 3, 0, 2, 2, 3],
 [0],
 [0, 0, 1, 0, 1, 3, 0, 2, 3, 3, 0, 3, 0, 0, 1, 2, 2, 3],
 [3,
  3,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  3,
  0,
  2,
  2,
  2,
  3,
  3,
  0,
  3,
  3,
  3,
  3,
  0,
  1,
  0,
  2

In [20]:
action_labels = [[a.label for a in actions_list] for actions_list in actions]

In [21]:
action_labels

[['date-entity'],
 ['country', None, None, None],
 ['university', None, 'base-01', 'ARG1', None, 'city', 'location', None],
 ['organization', None],
 ['date-entity'],
 ['country', None, None, None],
 [None,
  'organization',
  'tour-01',
  'ARG0',
  None,
  None,
  'isotope',
  'produce-01',
  'ARG1',
  'laboratory',
  'ARG0-of',
  'ARG1',
  None,
  'likely-01',
  'give-01',
  None,
  'full',
  'picture',
  'mod',
  None,
  'country',
  None,
  'suspect-01',
  'nucleus',
  'weapon',
  'mod',
  'ambition',
  'mod',
  'ARG1-of',
  'poss',
  'consist-of',
  'ARG1',
  None,
  None,
  'ARG0',
  'ARG1'],
 ['delay-01',
  None,
  'push-01',
  'ARG0',
  'back',
  'ARG2',
  None,
  'finish-01',
  'ARG1',
  'until',
  'at-least',
  None,
  'date-entity',
  'op1',
  'op1',
  'time',
  None],
 ['date-entity'],
 ['country', None, None, None],
 [None,
  'country',
  'group',
  'source',
  None,
  'announce-01',
  'ARG0',
  None,
  'campaign-01',
  'back-01',
  None,
  'food',
  'scientist',
  'mod',


In [22]:
dependencies = [d[3] for d in data]

In [23]:
dependencies

[{},
 {2: (0, 'abbrev')},
 {0: (2, 'nsubjpass'), 1: (2, 'auxpass'), 4: (2, 'prep_in')},
 {},
 {},
 {2: (0, 'abbrev')},
 {0: (2, 'det'),
  1: (2, 'nn'),
  2: (10, 'nsubj'),
  4: (7, 'det'),
  5: (7, 'amod'),
  6: (7, 'nn'),
  7: (2, 'prep_of'),
  8: (10, 'aux'),
  9: (10, 'advmod'),
  11: (13, 'det'),
  12: (13, 'amod'),
  13: (10, 'iobj'),
  15: (19, 'poss'),
  17: (19, 'amod'),
  18: (19, 'amod'),
  19: (13, 'prep_of'),
  20: (10, 'dobj')},
 {0: (2, 'nsubj'),
  1: (2, 'aux'),
  3: (2, 'advmod'),
  4: (5, 'det'),
  5: (3, 'dep'),
  6: (3, 'dep'),
  7: (6, 'dep'),
  8: (7, 'dep'),
  9: (2, 'advmod')},
 {},
 {2: (0, 'abbrev')},
 {0: (2, 'det'),
  1: (2, 'nn'),
  2: (4, 'nsubj'),
  3: (4, 'aux'),
  5: (6, 'det'),
  6: (4, 'dobj'),
  7: (6, 'partmod'),
  9: (10, 'nn'),
  10: (7, 'agent'),
  11: (12, 'aux'),
  12: (7, 'xcomp'),
  13: (14, 'det'),
  14: (12, 'dobj'),
  16: (17, 'nn'),
  17: (14, 'prep_of')},
 {0: (2, 'det'),
  1: (2, 'nn'),
  2: (3, 'nsubj'),
  4: (6, 'aux'),
  5: (6, 'auxpa

#### For the initial LSTM training we will have: Input = list of tokens, Output = list of actions. If the action is a reduce or swap, reinsert the same token at input at the next time step

#### Specifically, the LSTM will receive the word embedding of the first two tokens on the stack and the embedding of the current token on the buffer. The output will be an action index. The action index is analyzed and based on it we decide whether the next fed token is the same or the next one.

In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [25]:
tokenizer = Tokenizer(filters="", lower=True, split=" ")

In [26]:
tokenizer.fit_on_texts(sentences)

In [27]:
import numpy as np

In [28]:
sequences = np.asarray(tokenizer.texts_to_sequences(sentences))

In [29]:
sequences

array([[4], [2, 10, 221, 9], [246, 8, 67, 5, 24, 1], ...,
       [3, 175, 8, 357, 15, 371, 23, 68, 1371, 5, 2, 12, 2, 1],
       [3, 75, 35, 29, 1208, 5, 4, 27, 3, 737, 818, 354, 25, 4, 1],
       [3, 62, 69, 2, 385, 388, 47, 119, 136, 1]], dtype=object)

In [30]:
word_index = tokenizer.word_index

In [31]:
word_index

{'limited': 492,
 'narcotic': 850,
 'facilities': 851,
 'protest': 852,
 'controversial': 853,
 'hanging': 854,
 'offenses': 138,
 'violate': 855,
 'electricity': 494,
 'unseating': 856,
 'presents': 857,
 'voted': 495,
 'under': 182,
 'pact': 729,
 'risk': 859,
 'rise': 332,
 'every': 860,
 'affect': 333,
 'vast': 861,
 'school': 862,
 'supports': 792,
 'companies': 496,
 'solution': 863,
 'red': 335,
 'poppy': 336,
 'veritable': 866,
 'force': 497,
 'phones': 867,
 'budget': 1691,
 'likely': 347,
 'street': 870,
 'even': 871,
 'established': 872,
 'contributed': 873,
 'supplied': 337,
 'spokesman': 874,
 'above': 875,
 'conduct': 876,
 'new': 98,
 'officially': 498,
 'told': 499,
 'maintains': 1897,
 'whose': 493,
 'men': 255,
 'drew': 877,
 'atoms': 878,
 'reported': 166,
 'protection': 338,
 'active': 879,
 'obtained': 500,
 'substance': 880,
 'forum': 501,
 'items': 881,
 'study': 882,
 'changed': 883,
 'envoy': 884,
 'reports': 502,
 'credit': 885,
 'stressed': 886,
 'military': 

In [32]:
len(word_index)

2069

#### Shuffle data

In [33]:
indices = np.arange(sequences.shape[0])

In [34]:
np.random.shuffle(indices)

In [35]:
indices

array([ 436, 1405,  926, ...,  677,  940,  719])

In [36]:
sequences = sequences[indices]

In [37]:
actions = np.asarray(action_indices)[indices]

In [46]:
dependencies = [dependencies[i] for i in indices]

In [47]:
dependencies

[{0: (1, 'nsubj'),
  2: (11, 'complm'),
  3: (11, 'csubj'),
  4: (3, 'dobj'),
  6: (4, 'conj_and'),
  8: (4, 'conj_and'),
  9: (11, 'aux'),
  10: (11, 'neg'),
  11: (1, 'ccomp'),
  12: (11, 'xcomp'),
  13: (12, 'dobj')},
 {1: (3, 'det'),
  2: (3, 'nn'),
  3: (7, 'prep_in'),
  4: (6, 'det'),
  5: (6, 'nn'),
  6: (7, 'nsubj'),
  8: (11, 'complm'),
  9: (11, 'nsubj'),
  10: (11, 'cop'),
  11: (7, 'ccomp'),
  12: (13, 'aux'),
  13: (11, 'xcomp'),
  14: (15, 'nn'),
  15: (13, 'dobj'),
  17: (18, 'amod'),
  18: (13, 'prep_for'),
  20: (11, 'conj_and'),
  21: (22, 'aux'),
  22: (20, 'xcomp'),
  23: (24, 'det'),
  24: (22, 'dobj'),
  26: (29, 'poss'),
  28: (29, 'nn'),
  29: (24, 'prep_of')},
 {1: (0, 'partmod')},
 {0: (1, 'amod'), 2: (1, 'dep')},
 {},
 {},
 {},
 {0: (1, 'nsubj'),
  2: (4, 'det'),
  3: (4, 'nn'),
  4: (1, 'dobj'),
  6: (1, 'prep_in')},
 {0: (4, 'nsubjpass'),
  2: (0, 'conj_and'),
  3: (4, 'auxpass'),
  6: (4, 'prep_in')},
 {1: (2, 'det'), 2: (4, 'prep_at'), 3: (4, 'nsubj')},
 

In [48]:
amrs = np.asanyarray(amrs)[indices]

In [49]:
sequences

array([[7, 16, 17, 758, 1994, 19, 378, 12, 1684, 253, 37, 948, 758, 942, 1],
       [5, 11, 578, 1518, 3, 70, 70, 16, 17, 42, 8, 1727, 15, 196, 64, 159, 25, 1954, 48, 12, 835, 15, 466, 3, 192, 6, 2, 18, 72, 104, 1],
       [7, 16, 20], ...,
       [1454, 77, 21, 106, 5, 1483, 3, 140, 591, 1566, 1033, 84, 203, 1],
       [4], [2, 10, 342, 9]], dtype=object)

In [50]:
actions

array([ [0, 0, 1, 3, 0, 0, 3, 0, 0, 1, 1, 0, 2, 2, 3, 0, 0, 1, 1, 0, 0, 2, 2, 2, 3],
       [3, 3, 0, 0, 1, 3, 3, 0, 0, 1, 1, 3, 3, 3, 0, 3, 0, 0, 0, 1, 2, 3, 0, 0, 1, 2, 2, 0, 1, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3],
       [0, 0, 1, 3], ...,
       [0, 0, 1, 3, 0, 3, 0, 3, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 1, 2, 3, 4, 2, 2],
       [0], [0, 3, 3, 3]], dtype=object)

In [51]:
amrs

array(['(d2 / date-entity :day 14~e.4 :month 5~e.2 :year 2008~e.0)\n',
       '(s / say-01~e.2 \n      :ARG1 (t / this~e.0) \n      :location~e.3 (s2 / statement~e.5) \n      :time (a / after~e.6 \n            :op2 (m / meeting~e.12 \n                  :time~e.11 (d / date-entity :year 2002~e.10 :month~e.7 12~e.7 :day 26~e.8))))\n',
       '(c / country :wiki "United_Kingdom" \n      :name (n / name :op1 "United"~e.0 :op2 "Kingdom"~e.1))\n',
       ...,
       '(b / base-02~e.5 :polarity~e.4 -~e.4 \n      :ARG1 (i / information~e.2 \n            :poss~e.1 (g / government-organization :wiki "Danish_Defence_Intelligence_Service" \n                  :name (n / name :op1 "FE"~e.0))) \n      :ARG2~e.6 (s / source~e.8 \n            :mod (c / credible~e.7)))\n',
       '(c2 / costly~e.3 \n      :domain~e.2 (m / modernize-01~e.0))\n',
       '(h / head-01~e.0 \n      :ARG0 (p / person :wiki - \n            :name (n5 / name :op1 "Dosym"~e.12 :op2 "Satpayev"~e.13)) \n      :ARG1 (g / group :wiki

In [55]:
num_train_samples = int(0.95 * sequences.shape[0])

In [61]:
x_train = sequences[:num_train_samples]
y_train = actions[:num_train_samples]
amrs_train = amrs[:num_train_samples]
dependencies_train = dependencies[:num_train_samples]

x_test = sequences[num_train_samples:]
y_test = actions[num_train_samples:]
amrs_test = amrs[num_train_samples:]
dependencies_test = dependencies[num_train_samples:]

In [57]:
print x_train.shape
print y_train.shape
print amrs_train.shape

(1341,)
(1341,)
(1341,)


In [63]:
print len(dependencies_train)

1341


#### Prepare the embedding layer

In [58]:
embeddings_index = {}
f = open('./resources/glove/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [59]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 2, EMBEDDING_DIM))
not_found = []
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        not_found.append(word)

print 'Not found: {}'.format(not_found)

Not found: ['toiba', 'volume-quantity', 'monetary-quantity', 'ransomware', 'temporal-quantity', 'railway-line', '@-@', 'mass-quantity', 'power-quantity', 'political-party', 'energy-quantity', 'country-region', 'esrp', 'date-entity', 'city-district', 'religious-group', 'criminal-organization', '0.00525', 'ethnic-group', 'worship-place', 'securitynet', 'resarch', 'pgpcoder', 'distance-quantity', 'aircraft-type', 'research-institute', 'world-region', 'government-organization']


#### Prepare the proper data set:
#### Input: Buffer top, First three elements on the stack, previous action index, stack[0] deps on stack[1], stack[1] deps on stack[0], stack[0] deps on buffer[0], buffer[0] deps on stack[1], stack[0] deps on stack[2], stack[2] deps on stack[0].
#### If the current action is shift, the next action will have the next token in the buffer and updated stack elements.
#### Else, the same element on the buffer is fed and the elements from the stack are updated
#### Do not consider instances with more than 30 actions for the moment.

In [67]:
import sklearn.preprocessing
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(5))

SH = 0
RL = 1
RR = 2
DN = 3
SW = 4
NONE = 5

max_len = 30
no_word_index = (len(word_index)) + 1

def generate_dataset(x, y, dependencies):
    x_full = np.zeros((len(x), max_len, 15), dtype=np.int32)
    y_full = np.full((len(y), max_len), dtype=np.int32, fill_value=NONE)

    lengths = []
    filtered_count = 0

    for action_sequence, tokens_sequence, deps, i in zip(y, x, dependencies, range(len(y))):
        next_action_token = tokens_sequence[0]
        next_action_stack = [no_word_index, no_word_index, no_word_index, no_word_index] 
        next_action_prev_action = NONE
        tokens_sequence_index = 0
        features_matrix = []
        lengths.append(len(action_sequence))
        if len(action_sequence) > 30:
            filtered_count += 1
            continue
        for action, j in zip(action_sequence, range(len(action_sequence))):
            if next_action_prev_action != NONE:
                next_action_prev_action_ohe = label_binarizer.transform([next_action_prev_action])[0, :]
            else:
                next_action_prev_action_ohe = [0, 0, 0, 0, 0]
            
            dep_0_on_1 = 0
            dep_1_on_0 = 0
            dep_0_on_2 = 0
            dep_2_on_0 = 0
            dep_0_on_b = 0
            dep_b_on_0 = 0
            if next_action_stack[0] in  deps.keys() and deps[next_action_stack[0]][0] == next_action_stack[1]:
                dep_0_on_1 = 1
            if next_action_stack[1] in  deps.keys() and deps[next_action_stack[1]][0] == next_action_stack[0]:
                dep_1_on_0 = 1
            if next_action_stack[0] in  deps.keys() and deps[next_action_stack[0]][0] == next_action_stack[2]:
                dep_0_on_2 = 1
            if next_action_stack[2] in  deps.keys() and deps[next_action_stack[2]][0] == next_action_stack[0]:
                dep_2_on_0 = 1
            if next_action_stack[0] in  deps.keys() and deps[next_action_stack[0]][0] == next_action_token:
                dep_0_on_b = 1
            if next_action_token in  deps.keys() and deps[next_action_token][0] == next_action_stack[0]:
                dep_b_on_0 = 1
            features = np.concatenate((np.asarray([next_action_token, next_action_stack[0], 
                                             next_action_stack[1], next_action_stack[2]]),
                        next_action_prev_action_ohe,
                                      np.asarray([dep_0_on_1, dep_1_on_0, dep_0_on_2, dep_2_on_0, dep_0_on_b, dep_b_on_0])))
            if action == SH:
                tokens_sequence_index += 1
                next_action_stack = [next_action_token] + next_action_stack
                if tokens_sequence_index < len(tokens_sequence):
                    next_action_token = tokens_sequence[tokens_sequence_index]
                else:
                    next_action_token = no_word_index 
            if action == RL:
                next_action_stack = [next_action_stack[0]] + next_action_stack[2:]
            if action == RR:
                next_action_stack = [next_action_stack[1]] + next_action_stack[2:]
            if action == DN:
                tokens_sequence_index += 1
                if tokens_sequence_index < len(tokens_sequence):
                    next_action_token = tokens_sequence[tokens_sequence_index]
                else:
                    next_action_token = no_word_index
            if action == SW:
                next_action_stack = [next_action_stack[0], next_action_stack[2], next_action_stack[1]] + next_action_stack[3:]
            next_action_prev_action = action
            features_matrix.append(features)
        if tokens_sequence_index != len(tokens_sequence):
            raise Exception("There was a problem at training instance " + str(i) + "\n")

        features_matrix = np.concatenate((np.asarray(features_matrix),
                                          np.zeros((max_len - len(features_matrix), 15), dtype=np.int32)))
        actions = np.concatenate((np.asarray(action_sequence),
                                np.full((max_len - len(action_sequence)), dtype=np.int32, fill_value=NONE)))
        x_full[i, :, :] = features_matrix
        y_full[i, :] = actions
    return x_full, y_full, lengths, filtered_count

(x_train_full, y_train_full, lengths_train, filtered_count_tr) = generate_dataset(x_train, y_train, dependencies_train)
(x_test_full, y_test_full, lengths_test, filtered_count_test) = generate_dataset(x_test, y_test, dependencies_test)

In [68]:
np.asarray(lengths_train).mean()

11.610738255033556

In [69]:
filtered_count_tr

67

In [70]:
x_train_full.shape

(1341, 30, 15)

In [71]:
y_train_full.shape

(1341, 30)

In [72]:
y_train_ohe = np.zeros((y_train.shape[0], max_len, 5), dtype='int32')
for row, i in zip(y_train_full[:, :], range(y_train_full.shape[0])):
    y_train_instance_matrix = label_binarizer.transform(row)
    y_train_ohe[i, :, :] = y_train_instance_matrix

In [76]:
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Flatten, TimeDistributed
from keras.models import Model

buffer_input = Input(shape=(max_len,), dtype='int32')
stack_input_0 = Input(shape=(max_len,), dtype='int32')
stack_input_1 = Input(shape=(max_len,), dtype='int32')
stack_input_2 = Input(shape=(max_len,), dtype='int32')
prev_action_input = Input(shape=(max_len, 5), dtype='float32')
dep_info_input = Input(shape=(max_len, 6), dtype='float32')

embedding = Embedding(len(word_index) + 2,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

buffer_emb = embedding(buffer_input)
stack_emb_0 = embedding(stack_input_0)
stack_emb_1 = embedding(stack_input_1)
stack_emb_2 = embedding(stack_input_2)

x = concatenate([buffer_emb, stack_emb_0, stack_emb_1, stack_emb_2, prev_action_input, dep_info_input])

lstm_output = LSTM(1024, return_sequences=True)(x)

dense = TimeDistributed(Dense(5, activation="softmax"))(lstm_output)

In [77]:
lstm_output

<tf.Tensor 'lstm_2/transpose_1:0' shape=(?, ?, 1024) dtype=float32>

In [78]:
dense

<tf.Tensor 'time_distributed_2/Reshape_1:0' shape=(?, 30, 5) dtype=float32>

In [79]:
model = Model([buffer_input, stack_input_0, stack_input_1, stack_input_2, prev_action_input, dep_info_input], dense)

In [80]:
from keras.optimizers import RMSprop
rms = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08, decay=0.0)
model.compile(optimizer=rms,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [81]:
from keras.utils import plot_model

plot_model(model, to_file='model.png')

In [82]:
print model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_12 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
input_13 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
input_14 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
input_15 (InputLayer)            (None, 30)            0                                            
___________________________________________________________________________________________

In [84]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

model.fit([x_train_full[:, :, 0], x_train_full[:, :, 1], x_train_full[:, :, 2], x_train_full[:, :, 3], x_train_full[:, :, 4:9], x_train_full[:, :, 9:]], 
         y_train_ohe,
         epochs=70, batch_size=16,
         validation_split=0.2,
         callbacks=[ModelCheckpoint('./proxy_model_with_deps', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)])

Train on 1072 samples, validate on 269 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x166896b90>

In [149]:
model.load_weights('proxy_model', by_name=False)
index_to_word_map = {v: k for k, v in tokenizer.word_index.iteritems()}

def get_predictions_from_distr(predictions_distr):
    predictions = [np.argmax(p) for p in predictions_distr]
    return predictions

def pretty_print_predictions(predictions):
    actions = ['SH', 'RL', 'RR', 'DN', 'SW']
    for i in range(len(predictions)):
        print actions[predictions[i]],;
    print '\n'
    
def pretty_print_sentence(tokens, index_to_word_map):
    for i in range(len(tokens)):
        print index_to_word_map[tokens[i]],;
    print '\n'

def make_prediction(model, x_test, y_test):
    tokens_buffer = x_test
    tokens_stack = []
    current_step = 0
    buffer_token = np.zeros((1, max_len))
    stack_token0 = np.zeros((1, max_len))
    stack_token1 = np.zeros((1, max_len))
    stack_token2 = np.zeros((1, max_len))
    prev_action = np.zeros((1, max_len, 5))
    
    buffer_token[0][current_step] = tokens_buffer[0]
    stack_token0[0][current_step] = no_word_index
    stack_token1[0][current_step] = no_word_index
    stack_token2[0][current_step] = no_word_index
    prev_action[0][current_step] = [0, 0, 0, 0, 0]

    final_prediction = []
    while (len(tokens_buffer) != 0 or len(tokens_stack) != 1) and current_step < max_len - 1:
        prediction = model.predict([buffer_token, stack_token0, stack_token1, stack_token2, prev_action])
        current_actions_distr_ordered = np.argsort(prediction[0][current_step])[::-1]
        current_inspected_action_index = 0
        current_action = current_actions_distr_ordered[current_inspected_action_index]
        invalid = True
        while invalid:
            invalid = False
            current_action = current_actions_distr_ordered[current_inspected_action_index]
            current_inspected_action_index += 1
            if current_action == SH:
                if len(tokens_buffer) == 0:
                    invalid = True
                    continue
                tokens_stack = [tokens_buffer[0]] + tokens_stack
                tokens_buffer = tokens_buffer[1:]
            if  current_action == RL:
                if len(tokens_stack) < 2:
                    invalid = True
                    continue
                tokens_stack = [tokens_stack[0]] + tokens_stack[2:]
            if current_action == RR:
                if len(tokens_stack) < 2:
                    invalid = True
                    continue
                tokens_stack = [tokens_stack[1]] + tokens_stack[2:]
            if current_action == DN:
                if len(tokens_buffer) == 0:
                    invalid = True
                    continue
                tokens_buffer = tokens_buffer[1:]
            if current_action == SW:
                if len(tokens_stack) < 3:
                    invalid = True
                    continue
                tokens_stack = [tokens_stack[0], tokens_stack[2], tokens_stack[1]] + tokens_stack[3:]
        final_prediction.append(current_action)
        current_step += 1
        if len(tokens_buffer) > 0:
            buffer_token[0][current_step] = tokens_buffer[0]
        else:
            buffer_token[0][current_step] = no_word_index
            
        if len(tokens_stack) > 3:
            stack_token0[0][current_step] = tokens_stack[0]
            stack_token1[0][current_step] = tokens_stack[1]
            stack_token2[0][current_step] = tokens_stack[2]
        else:
            if len(tokens_stack) > 2:
                stack_token0[0][current_step] = tokens_stack[0]
                stack_token1[0][current_step] = tokens_stack[1]
                stack_token2[0][current_step] = no_word_index
            else:
                if len(tokens_stack) > 1:
                    stack_token0[0][current_step] = tokens_stack[0]
                    stack_token1[0][current_step] = no_word_index
                    stack_token2[0][current_step] = no_word_index
                else:
                    stack_token0[0][current_step] = no_word_index
                    stack_token1[0][current_step] = no_word_index
                    stack_token2[0][current_step] = no_word_index
        prev_action[0][current_step] = label_binarizer.transform([current_action])[0, :]
    print 'Buffer and stack at end of prediction'
    print tokens_buffer
    print tokens_stack
    return final_prediction

for i in range(10):
    prediction = make_prediction(model, x_test[i], y_test[i])  
    print 'Predicted'
    pretty_print_predictions(prediction)
    print 'Actual'
    pretty_print_predictions(y_test[i])
    print 'Sentence'
    pretty_print_sentence(x_test[i], index_to_word_map)    
    print 'Amr'
    print amrs_test[i]

Buffer and stack at end of prediction
[]
[498]
Predicted
SH SH DN DN SH SH RL RR DN RR 

Actual
SH SH DN DN DN SH RL RL DN 

Sentence
officially country is a pacifist nation . 

Amr
(n / nation~e.5 
      :domain~e.2 (c2 / country :wiki "Japan" 
            :name (n3 / name :op1 "Japan"~e.1)) 
      :mod (p / pacifism) 
      :manner~e.0 (o / official~e.0))

Buffer and stack at end of prediction
[]
[4]
Predicted
SH 

Actual
SH 

Sentence
date-entity 

Amr
(d / date-entity :year 2008~e.0 :month 2~e.2 :day 4~e.4)

Buffer and stack at end of prediction
[]
[90]
Predicted
SH DN SH SH RL DN DN DN DN DN RL 

Actual
SH DN DN DN DN DN DN DN DN 

Sentence
country ( islamic republic of ) ( ir ) 

Amr
(c / country :wiki "Iran" 
      :name (n / name :op1 "Iran"~e.0))

Buffer and stack at end of prediction
[]
[54]
Predicted
SH DN DN SH SH RL DN SH RR RR DN 

Actual
SH DN DN SH SH RL RL DN SH RR DN 

Sentence
opium is the raw ingredient for heroin . 

Amr
(i / ingredient~e.4 
      :purpose~e.5 (h /