-
Notifications
You must be signed in to change notification settings - Fork 100
/
model.py
514 lines (391 loc) · 18.1 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
"""
This file is part of the PSL software.
Copyright 2011-2015 University of Maryland
Copyright 2013-2019 The Regents of the University of California
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import csv
import logging
import os
import re
import shutil
import tempfile
import uuid
import yaml
import pandas
import pslpython.util
from pslpython.partition import Partition
from pslpython.predicate import Predicate
from pslpython.predicate import PredicateError
from pslpython.rule import Rule
class Model(object):
    """
    A PSL model.

    This is the primary class for running PSL.
    The python interface to PSL utilizes PSL's CLI.
    For information on default values / behavior, see the CLI: https://github.com/linqs/psl/wiki/Using-the-CLI
    """

    CLI_INFERRED_OUTPUT_DIR = 'inferred-predicates'
    CLI_DELIM = "\t"
    TEMP_DIR_SUBDIR = 'psl-python'
    DATA_STORAGE_DIR = 'data'
    TRUTH_COLUMN_NAME = 'truth'

    CLI_JAR_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'cli', 'psl-cli.jar'))

    PSL_LOGGING_OPTION = 'log4j.threshold'
    PSL_LOGGING_LEVEL_REGEX = r'\] (TRACE|DEBUG|INFO|WARN|ERROR|FATAL) '
    PYTHON_LOGGING_FORMAT_STRING = '%(relativeCreated)d [%(name)s PSL] %(levelname)s --- %(message)s'

    # Translation from python logging levels to the log4j levels used by the PSL CLI.
    PYTHON_TO_PSL_LOGGING_LEVELS = {
        logging.CRITICAL: 'FATAL',
        logging.ERROR: 'ERROR',
        logging.WARNING: 'WARN',
        logging.INFO: 'INFO',
        logging.DEBUG: 'DEBUG',
        logging.NOTSET: 'INFO',
    }

    def __init__(self, name = None):
        """
        Create a PSL model.
        All models have some name.

        Args:
            name: The name of this model. If not supplied, then a random one is chosen.

        Raises:
            ModelError: If no java runtime can be located on the path.
        """

        self._java_path = shutil.which('java')
        if (self._java_path is None):
            raise ModelError("Could not locate a java runtime (via https://docs.python.org/dev/library/shutil.html#shutil.which). Make sure that java exists within your path.")

        self._name = name
        if (self._name is None):
            # The name is used to build filesystem paths (e.g. '<name>.data'),
            # so it must be a string; uuid.uuid4() alone returns a UUID object.
            self._name = str(uuid.uuid4())

        self._rules = []
        # {normalized_name: predicate, ...}
        self._predicates = {}

    def add_predicate(self, predicate: Predicate):
        """
        Add a predicate to the model.
        Two predicates with the same name should never be added to the same model.

        Args:
            predicate: The predicate to add.

        Returns:
            This model.

        Raises:
            ModelError: If the predicate is None.
            PredicateError: If a different predicate with the same name was already added.
        """

        if (predicate is None):
            raise ModelError('Cannot add a None predicate.')

        name = predicate.name()
        if (name in self._predicates and predicate != self._predicates[name]):
            raise PredicateError("Within a model, predicates must have unique names. Got a duplicate: %s." % (name))

        self._predicates[name] = predicate
        return self

    def add_rule(self, rule: Rule):
        """
        Add a rule to the model.

        Rules are ordered and will maintain the order they were inserted in.
        The rule ordering does not affect inference.

        Args:
            rule: The rule to add.

        Returns:
            This model.
        """

        self._rules.append(rule)
        return self

    def infer(self, method = '', additional_cli_optons = None, psl_config = None, jvm_options = None, logger = None, temp_dir = None, cleanup_temp = True):
        """
        Run inference on this model.

        Args:
            method: The inference method to use.
            additional_cli_optons: Additional options to pass directly to the CLI.
                Here you would do things like select a database backend.
            psl_config: Configuration passed directly to the PSL core code.
                https://github.com/eriq-augustine/psl/wiki/Configuration-Options
            jvm_options: Options passed to the JVM.
                Most commonly '-Xmx' and '-Xms'.
            logger: An optional logger to send the output of PSL to.
                If not specified (None), then a default INFO logger is used.
                If False, only fatal PSL output will be passed on.
                If no logging levels are sent via psl_config, PSL's logging level will be set
                to match this logger's level.
            temp_dir: Where to write PSL files to for calling the CLI.
                Defaults to Model.TEMP_DIR_SUBDIR inside the system's temp directory (tempfile.gettempdir()).
            cleanup_temp: Remove the files in temp_dir after running.

        Returns:
            The inferred values as a map to dataframe.
            {predicate: frame, ...}
            The frame will have columns names that match the index of the argument and 'truth'.
        """

        # Default the collection arguments here rather than in the signature:
        # mutable defaults are shared across calls, and psl_config is written to
        # in _run_psl(), which would pollute a shared default dict.
        additional_cli_optons = [] if (additional_cli_optons is None) else additional_cli_optons
        psl_config = {} if (psl_config is None) else psl_config
        jvm_options = [] if (jvm_options is None) else jvm_options

        logger, temp_dir, data_file_path, rules_file_path = self._prep_run(logger, temp_dir)

        cli_options = []

        cli_options.append('--infer')
        if (method != ''):
            cli_options.append(method)

        inferred_dir = os.path.join(temp_dir, Model.CLI_INFERRED_OUTPUT_DIR)
        cli_options.append('--output')
        cli_options.append(inferred_dir)

        cli_options += additional_cli_optons

        self._run_psl(data_file_path, rules_file_path, cli_options, psl_config, jvm_options, logger)
        results = self._collect_inference_results(inferred_dir)

        if (cleanup_temp):
            self._cleanup_temp(temp_dir)

        return results

    def learn(self, method = '', additional_cli_optons = None, psl_config = None, jvm_options = None, logger = None, temp_dir = None, cleanup_temp = True):
        """
        Run weight learning on this model.
        The new weights will be applied to this model.

        Args:
            method: The weight learning method to use.
            additional_cli_optons: Additional options to pass directly to the CLI.
                Here you would do things like select a database backend.
            psl_config: Configuration passed directly to the PSL core code.
                https://github.com/eriq-augustine/psl/wiki/Configuration-Options
            jvm_options: Options passed to the JVM.
                Most commonly '-Xmx' and '-Xms'.
            logger: An optional logger to send the output of PSL to.
                If not specified (None), then a default INFO logger is used.
                If False, only fatal PSL output will be passed on.
                If no logging levels are sent via psl_config, PSL's logging level will be set
                to match this logger's level.
            temp_dir: Where to write PSL files to for calling the CLI.
                Defaults to Model.TEMP_DIR_SUBDIR inside the system's temp directory (tempfile.gettempdir()).
            cleanup_temp: Remove the files in temp_dir after running.

        Returns:
            This model.
        """

        # See infer() for why these are defaulted here instead of in the signature.
        additional_cli_optons = [] if (additional_cli_optons is None) else additional_cli_optons
        psl_config = {} if (psl_config is None) else psl_config
        jvm_options = [] if (jvm_options is None) else jvm_options

        logger, temp_dir, data_file_path, rules_file_path = self._prep_run(logger, temp_dir)

        cli_options = []

        cli_options.append('--learn')
        if (method != ''):
            cli_options.append(method)

        cli_options += additional_cli_optons

        self._run_psl(data_file_path, rules_file_path, cli_options, psl_config, jvm_options, logger)
        self._fetch_new_weights(rules_file_path)

        if (cleanup_temp):
            self._cleanup_temp(temp_dir)

        return self

    def get_rules(self):
        """
        Get the rules of this model, in insertion order.

        Returns:
            A list of rules.
        """

        return self._rules

    def get_predicates(self):
        """
        Get all the predicates keyed by their normalized name.
        If you are trying to get a specific predicate by name you should use Predicate.normalize_name(),
        or just use get_predicate() instead.

        Returns:
            A dict of predicates keyed by their normalized name.
        """

        return self._predicates

    def get_predicate(self, name):
        """
        Get a specific predicate or None if one does not exist.
        Name normalization will be handled internally.

        Returns:
            A predicate matching the name, or None.
        """

        # Use .get() so a missing predicate yields None (as documented)
        # instead of raising a KeyError.
        return self._predicates.get(Predicate.normalize_name(name))

    def get_name(self):
        """
        Get the name of this model.

        Returns:
            The model's name (a string).
        """

        return self._name

    def _collect_inference_results(self, inferred_dir):
        """
        Get the inferred data written by PSL.

        Returns:
            A dict with the keys being the predicate and the value being a dataframe with the data.
            {predicate: frame, ...}
            The frame will have columns names that match the index of the argument and 'truth'.
        """

        results = {}

        for dirent in os.listdir(inferred_dir):
            path = os.path.join(inferred_dir, dirent)

            if (not os.path.isfile(path)):
                continue

            # The CLI names each output file after its predicate.
            predicate_name = os.path.splitext(dirent)[0]
            predicate = None
            for possible_predicate in self._predicates.values():
                if (possible_predicate.name() == predicate_name):
                    predicate = possible_predicate
                    break

            if (predicate is None):
                raise ModelError("Unable to find predicate that matches name of inferred data file. Predicate name: '%s'. Inferred file path: '%s'." % (predicate_name, path))

            columns = list(range(len(predicate))) + [Model.TRUTH_COLUMN_NAME]
            data = pandas.read_csv(path, delimiter = Model.CLI_DELIM, names = columns, header = None, skiprows = None, quoting = csv.QUOTE_NONE)

            # Clean up and convert types (the CLI writes everything as text).
            for i in range(len(data.columns) - 1):
                if (predicate.types()[i] in Predicate.INT_TYPES):
                    data[data.columns[i]] = data[data.columns[i]].apply(int)
                elif (predicate.types()[i] in Predicate.FLOAT_TYPES):
                    data[data.columns[i]] = data[data.columns[i]].apply(float)

            data[Model.TRUTH_COLUMN_NAME] = pandas.to_numeric(data[Model.TRUTH_COLUMN_NAME])

            results[predicate] = data

        return results

    def _fetch_new_weights(self, base_rules_file_path):
        """
        Read the learned rules file written by the CLI (next to the base rules file)
        and apply the new weights to this model's rules, in order.
        """

        new_weights = []
        learned_rules_path = re.sub(r'\.psl$', '-learned.psl', base_rules_file_path)

        with open(learned_rules_path, 'r') as file:
            for line in file:
                line = line.strip()
                if (line == ''):
                    continue

                # Unweighted (hard) rules end with a period and have no weight.
                if (line.endswith('.')):
                    new_weights.append(None)
                    continue

                # Weighted rules are formatted as "<weight>: <rule>".
                parts = line.split(':')
                new_weights.append(float(parts[0]))

        if (len(new_weights) != len(self._rules)):
            raise ModelError("Mismatch between the number of base rules and the number of weighted rules. Base rules: '%s', learned rules: '%s'." % (base_rules_file_path, learned_rules_path))

        for i in range(len(self._rules)):
            if (self._rules[i].weighted()):
                self._rules[i].set_weight(new_weights[i])

    def _cleanup_temp(self, temp_dir):
        """Remove the temporary directory used for a CLI run."""

        shutil.rmtree(temp_dir)

    def _write_data(self, temp_dir):
        """
        Write out all the data for the predicates found in the rules of this model.
        Will clobber any existing data.
        Also writes out the CLI data file.

        Returns:
            The path to the data file.
        """

        data_file_path = os.path.join(temp_dir, self._name + '.data')
        data_storage_path = os.path.join(temp_dir, Model.DATA_STORAGE_DIR)
        os.makedirs(data_storage_path, exist_ok = True)

        self._write_cli_datafile(data_file_path, data_storage_path)
        self._write_cli_data(data_storage_path)

        return data_file_path

    def _write_cli_data(self, data_storage_path):
        """Write one tab-separated data file per (predicate, partition) that has data."""

        for partition in Partition:
            for predicate in self._predicates.values():
                if (partition not in predicate.data()):
                    continue

                data = predicate.data()[partition]
                if (data is None or len(data) == 0):
                    continue

                filename = "%s_%s.txt" % (predicate.name(), partition.value)
                path = os.path.join(data_storage_path, filename)

                data.to_csv(path, sep = Model.CLI_DELIM, header = False, index = False)

    def _write_cli_datafile(self, data_file_path, data_storage_path):
        """Write the YAML data file that tells the CLI about predicates and their data files."""

        data_file_contents = {}

        predicates = {}
        for predicate in self._predicates.values():
            predicate_id = predicate.name() + '/' + str(len(predicate))

            types = []
            for predicate_type in predicate.types():
                types.append(predicate_type.value)

            open_closed = 'open'
            if (predicate.closed()):
                open_closed = 'closed'

            predicates[predicate_id] = [
                open_closed,
                {'types': types}
            ]

        data_file_contents['predicates'] = predicates

        for partition in Partition:
            partition_data = {}

            for predicate in self._predicates.values():
                if (partition not in predicate.data()):
                    continue

                data = predicate.data()[partition]
                if (data is None or len(data) == 0):
                    continue

                filename = "%s_%s.txt" % (predicate.name(), partition.value)
                # Make paths relative to the CLI data file for portability.
                partition_data[predicate.name()] = os.path.join(Model.DATA_STORAGE_DIR, filename)

            if (len(partition_data) > 0):
                data_file_contents[partition.value] = partition_data

        with open(data_file_path, 'w') as file:
            yaml.dump(data_file_contents, file, default_flow_style = False)

    def _write_rules(self, temp_dir):
        """
        Write out all the rules for this model.
        Will clobber any existing rules.

        Returns:
            The path to the rules file.
        """

        rules_file_path = os.path.join(temp_dir, self._name + '.psl')

        with open(rules_file_path, 'w') as file:
            for rule in self._rules:
                file.write(str(rule) + "\n")

        return rules_file_path

    def _prep_run(self, logger = None, temp_dir = None):
        """
        Prepare for a CLI run (inference or weight learning):
        resolve the logger and temp directory, then write the data and rules files.

        Args:
            logger: An optional logger to send the output of PSL to.
                If not specified (None), then a default INFO logger is used.
                If False, only fatal PSL output will be passed on.
            temp_dir: Where to write PSL files to for calling the CLI.
                Defaults to Model.TEMP_DIR_SUBDIR inside the system's temp directory (tempfile.gettempdir()).

        Returns:
            A prepped logger, a usable temp_dir, the path to the CLI data file, and the path to the CLI rules file.

        Raises:
            ModelError: If the model has no rules.
        """

        if (len(self._rules) == 0):
            raise ModelError("No rules specified to the model.")

        # Identity checks: False (not merely falsy values) selects the quiet logger.
        if (logger is None or logger is False):
            level = logging.INFO
            if (logger is False):
                level = logging.CRITICAL

            logging.basicConfig(format = Model.PYTHON_LOGGING_FORMAT_STRING)
            logger = logging.getLogger(__name__)
            logger.setLevel(level)

        if (temp_dir is None):
            temp_dir = os.path.join(tempfile.gettempdir(), Model.TEMP_DIR_SUBDIR)
        temp_dir = os.path.join(temp_dir, self._name)
        os.makedirs(temp_dir, exist_ok = True)

        data_file_path = self._write_data(temp_dir)
        rules_file_path = self._write_rules(temp_dir)

        return logger, temp_dir, data_file_path, rules_file_path

    def _run_psl(self, data_file_path, rules_file_path, cli_options, psl_config, jvm_options, logger):
        """
        Invoke the PSL CLI jar with the given files and options,
        streaming its output into the logger.

        Raises:
            ModelError: If the CLI exits with a non-zero status.
        """

        command = [
            self._java_path
        ]

        for option in jvm_options:
            command.append(str(option))

        command += [
            '-jar',
            Model.CLI_JAR_PATH,
            '--model',
            rules_file_path,
            '--data',
            data_file_path,
        ]

        # Work on a copy so the caller's dict is not mutated.
        psl_config = dict(psl_config)

        # Set the PSL logging level to match the logger (if not explicitly set in the additional options).
        if (Model.PSL_LOGGING_OPTION not in psl_config):
            psl_config[Model.PSL_LOGGING_OPTION] = Model.PYTHON_TO_PSL_LOGGING_LEVELS[logger.level]

        for option in cli_options:
            command.append(str(option))

        for (key, value) in psl_config.items():
            command.append('-D')
            command.append("%s=%s" % (key, value))

        log_callback = lambda line: Model._log_stdout(logger, line)

        logger.debug("Running: `%s`." % (pslpython.util.shell_join(command)))
        exit_status = pslpython.util.execute(command, log_callback)

        if (exit_status != 0):
            raise ModelError("PSL returned a non-zero exit status: %d." % (exit_status))

    @staticmethod
    def _log_stdout(logger, line):
        """Route a line of PSL CLI stdout to the matching python logging level."""

        match = re.search(Model.PSL_LOGGING_LEVEL_REGEX, line)
        if (match is None):
            # On a failed lookup, log to error.
            logger.error('(Unknown PSL logging level) -- ' + line)
            return

        level = match.group(1)
        if (level == 'TRACE' or level == 'DEBUG'):
            logger.debug(line)
        elif (level == 'INFO'):
            logger.info(line)
        elif (level == 'WARN'):
            logger.warning(line)
        elif (level == 'ERROR'):
            logger.error(line)
        elif (level == 'FATAL'):
            logger.critical(line)
        else:
            logger.error('(Unknown PSL logging level) -- ' + line)

    @staticmethod
    def _log_stderr(logger, line):
        """Route a line of PSL CLI stderr to the logger as an error."""

        logger.error('(PSL stderr) -- ' + line)
class ModelError(Exception):
    """Raised for errors specific to constructing or running a PSL Model."""