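"""Character-level RNN training with truncated backpropagation through time.

The recurrent layers' initial states are overwritten after every batch with
the final states of the previous batch (see the SharedVariableModifier setup
below), so hidden state is carried across batch boundaries while gradients
stay truncated per batch.
"""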
from __future__ import print_function
import argparse
from collections import OrderedDict
from blocks.algorithms import GradientDescent, Adam, CompositeRule, StepClipping
from blocks.extensions import Timing, Printing, FinishAfter, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks.extensions.training import SharedVariableModifier
from blocks.main_loop import MainLoop
from blocks.monitoring import aggregation
from blocks.monitoring.evaluators import AggregationBuffer
from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import SequentialScheme, ShuffledScheme
from fuel.streams import DataStream
from numpy import load, all, zeros
from theano import shared
from custom_blocks import PadAndAddMasks, EarlyStopping
from network import Network, NetworkType
from prepare_lk import ALPHABET_FILE, OUT_FILE_NAME as DATA_FILE_LOC
from util import StateComputer
from martin_test_module import OverrideStateReset
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--clipping", default=1.0, type=float, help="Step clipping threshold (default: 1)")
parser.add_argument("-t", "--type", default="simple", help="Type of RNN ("+NetworkType.SIMPLE_RNN+", " +
NetworkType.GRU+" or "+NetworkType.LSTM+")")
parser.add_argument("-e", "--epochs", default=0, type=int, help="Stop after this many epochs (default: 0 -- means it will" +
" run forever until you cancel manually)")
parser.add_argument("-d", "--dimensions", default="512,512,512", type=str, help="Configure number of layers and dimensions by" +
" just enumerating dimensions (e.g. 100, 100 for" +
" a two-layered network with dims 100)")
args = parser.parse_args()
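# Example invocation (the valid --type values are whatever NetworkType in
# network.py defines, so "lstm" here is an assumption):
#   python train_tbptt.py --type lstm --dimensions 512,512,512 --clipping 1.0 --epochs 20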
char2ix = load(ALPHABET_FILE).item()
ix2char = {v: k for k, v in char2ix.items()}
dimensions = [int(d) for d in args.dimensions.split(',')]
print('Dimensions:', dimensions)
nkwargs = {'network_type': args.type, 'reset_states': False, 'input_dim': len(ix2char)}
if dimensions:
    nkwargs['hidden_dims'] = dimensions
network = Network(**nkwargs)
cross_ent = network.cost
generator = network.generator
cost_model = network.cost_model
# handles to the recurrent layers' initial states
initial_states = network.initial_states
#init_state_2 = initial_states[2]
# data
train_data = H5PYDataset(DATA_FILE_LOC, which_sets=("train",), load_in_memory=True)
valid_data = H5PYDataset(DATA_FILE_LOC, which_sets=("valid",), load_in_memory=True)
# see custom_blocks for the transformer
data_stream = PadAndAddMasks(DataStream.default_stream(dataset=train_data,
                                                       iteration_scheme=ShuffledScheme(train_data.num_examples,
                                                                                       batch_size=1)),
                             produces_examples=False)
data_stream_valid = PadAndAddMasks(DataStream.default_stream(dataset=valid_data,
                                                             iteration_scheme=SequentialScheme(valid_data.num_examples,
                                                                                               batch_size=1)),
                                   produces_examples=False)
# monitor:
# - training cost after every batch (computed along the way, so cheap to do), as well as gradient and step
#   norms, to detect exploding-gradient problems
# - validation cost once every epoch
sc = StateComputer(network.cost_model, ix2char)
# FIXME still somewhat hard-coded: blocks auto-generates one state variable per recurrent layer, named
# "sequencegenerator_cost_matrix_states", "...#1", "...#2" and so on, so the names below fix the layer count at three.
state_names = ['sequencegenerator_cost_matrix_states',
               'sequencegenerator_cost_matrix_states#1',
               'sequencegenerator_cost_matrix_states#2']
states_to_compare = [next(v for v in sc.state_variables if v.name == name)
                     for name in state_names]
# The parameters argument follows the blocks-examples convention; building a computation graph and using
# cg.parameters would work just as well.
# On the step rule: gradient clipping (threshold passed in as a command-line argument) followed by Adam
# performed on the clipped gradient.
# Gradient clipping, in short: given a threshold T, if the length of the gradient exceeds T, rescale it so
# that its length equals T. This makes exploding gradients less severe.
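# Illustration only, not used by the training graph below: the clipping rule
# written out in plain numpy for a single gradient vector. blocks' StepClipping
# applies the same idea to the combined norm of all steps.
def _clip_example(gradient, threshold):
    """Rescale `gradient` to length `threshold` if it is longer than that."""
    norm = (gradient ** 2).sum() ** 0.5  # Euclidean length of the gradient
    if norm > threshold:
        gradient = gradient * (threshold / norm)
    return gradient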
algorithm = GradientDescent(cost=cross_ent, parameters=cost_model.parameters,
                            step_rule=CompositeRule(components=[StepClipping(threshold=args.clipping), Adam()]),
                            on_unused_sources="ignore")
def make_initial_state_modifier(state_var, check_value, aggr):
    """Build the callback for a SharedVariableModifier that carries one
    layer's last hidden state over into that layer's initial state.

    Note: the accumulator seems to deliver the two candidate state vectors
    in random order, so the value chosen last time is remembered in
    check_value and the candidate that differs from it is picked.
    """
    def modifier(iterations_done, old_value):
        values = aggr.get_aggregated_values()
        new_value = values[state_var.name]
        # Re-initialize the accumulators after every read; the monitoring
        # extensions do the same after each request.
        aggr.initialize_aggregators()
        value_a = new_value[-1][0]
        value_b = new_value[0][0]
        # On the first call check_value is still all zeros; seed it with the
        # current initial state.
        if all(check_value.get_value() == zeros((1, old_value.size), dtype='float32')):
            check_value.set_value(old_value)
        # Pick the candidate that differs from the previously stored state.
        chosen = value_a if all(value_a != check_value.get_value()) else value_b
        check_value.set_value(chosen)
        return chosen
    return modifier

# One aggregation buffer, check value and modifier function per recurrent
# layer; this replaces three hand-written copies that were hard-coded for a
# three-layered LSTM, and sizes each check value with its own layer's dim.
# TODO: is it necessary or recommendable to add the check values to the
# computation graph as constants or something like that?
aggregators = []
modifier_functions = {}
for trans, state_var in zip(network.transitions, states_to_compare):
    aggr = AggregationBuffer(variables=[state_var], use_take_last=True)
    aggr.initialize_aggregators()
    check_value = shared(zeros(trans.dim, dtype='float32'))
    aggregators.append(aggr)
    modifier_functions[trans.name] = make_initial_state_modifier(state_var, check_value, aggr)
init_state_modifiers = [SharedVariableModifier(trans.initial_state_, function=modifier_functions[trans.name],
                                               after_batch=True)
                        for trans in network.transitions]
monitor_grad = TrainingDataMonitoring(variables=[cross_ent, aggregation.mean(algorithm.total_gradient_norm),
                                                 aggregation.mean(algorithm.total_step_norm)],
                                      prefix="training", after_batch=True)
early_stopping = EarlyStopping(variables=[cross_ent], data_stream=data_stream_valid,
                               path="seqgen_" + args.type + "_" + "_".join([str(d) for d in network.hidden_dims]) + ".pkl",
                               tolerance=4, prefix="validation")
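# With the defaults above the checkpoint path comes out as e.g.
# "seqgen_simple_512_512_512.pkl" (network type and layer dims in the name).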
prkwargs = {
    # 'after_batch': True  # use this for prints after every batch
}
main_loop = MainLoop(algorithm=algorithm, data_stream=data_stream, model=cost_model,
                     extensions=[monitor_grad, early_stopping, FinishAfter(after_n_epochs=args.epochs), ProgressBar(),
                                 Timing(), Printing(**prkwargs)] + init_state_modifiers)
for aggr in aggregators:
    main_loop.algorithm.add_updates(aggr.accumulation_updates)
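# Taken together this implements truncated backprop through time: the
# accumulation updates capture each layer's hidden state at the end of every
# batch, and the SharedVariableModifier callbacks copy that state into the
# layer's initial_state_ shared variable before the next batch, so state is
# carried across batch boundaries while gradients stay truncated per batch.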
# remove update
# updates = main_loop.algorithm.updates
# init_state_update = list(filter(lambda u: u[0] == init_state_2, updates))[0]
# updates.remove(init_state_update)
# print('REMOVED UPDATE:', init_state_update)
# print('\nUPDATES:', main_loop.algorithm.updates,'\n')
main_loop.run()