#9287 Fix most of the file-handle resource leaks. (#9309)

Open resources (ordinary files, HDF5 files, tar archives, urlopen responses, etc.) were often not
closed properly. In particular, when an exception is raised, the resource might never be closed,
which can e.g. exhaust the process's limit on open file handles.

Where possible, use context managers (`with open(...) as f`), which close
the resource even when an exception is raised.
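
For illustration, a minimal sketch of the pattern applied throughout the commit; the
`config.json` file name is just a placeholder, not from the codebase:

```python
import json

# Leak-prone: if json.load() raises, nothing ever closes the handle
# returned by open(), so it lingers until garbage collection (if ever).
# data = json.load(open('config.json'))

# Exception-safe: the `with` block closes the file on normal exit and
# also during stack unwinding when json.load() raises.
with open('config.json') as f:
    data = json.load(f)
```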

Still, HDF5Matrix, CSVLogger and keras.preprocessing.image.load_img()
are difficult to fix without changing their API: in those places, resource
ownership is not clearly defined.
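
A rough sketch of why ownership is unclear there, using a simplified stand-in for an
HDF5Matrix-like wrapper (not the real class; the file path and dataset name are hypothetical):

```python
import h5py


class LazyH5Rows(object):
    """Simplified HDF5Matrix-like wrapper: it must keep the file open so
    that later slicing works, so it cannot close the handle internally,
    and the caller has no obvious, documented point at which to close it.
    """

    def __init__(self, path, dataset_name):
        self._f = h5py.File(path, 'r')      # stays open for the object's lifetime
        self._data = self._f[dataset_name]  # lazy view into the open file

    def __getitem__(self, key):
        return self._data[key]              # each read needs the open file


rows = LazyH5Rows('my_data.h5', 'my_data')
first_batch = rows[:32]  # works only while the underlying file is still open
```

Closing eagerly would break this lazy access, so a clean fix needs either an explicit
`close()`/context-manager API or reopening the file on every access, both of which change
the public interface.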
bzamecnik authored and fchollet committed Feb 4, 2018
1 parent ad00676 commit 4cde148de0c37981c50f3a8e4a59fa4e5f653e17
@@ -489,13 +489,18 @@ def process_docstring(docstring):
new_fpath = fpath.replace('templates', 'sources')
shutil.copy(fpath, new_fpath)


# Take care of index page.
readme = open('../README.md').read()
index = open('templates/index.md').read()
def read_file(path):
with open(path) as f:
return f.read()


readme = read_file('../README.md')
index = read_file('templates/index.md')
index = index.replace('{{autogenerated}}', readme[readme.find('##'):])
f = open('sources/index.md', 'w')
f.write(index)
f.close()
with open('sources/index.md', 'w') as f:
f.write(index)

print('Starting autogeneration.')
for page_data in PAGES:
@@ -564,7 +569,7 @@ def process_docstring(docstring):
page_name = page_data['page']
path = os.path.join('sources', page_name)
if os.path.exists(path):
template = open(path).read()
template = read_file(path)
assert '{{autogenerated}}' in template, ('Template found for ' + path +
' but missing {{autogenerated}} tag.')
mkdown = template.replace('{{autogenerated}}', mkdown)
@@ -574,6 +579,7 @@ def process_docstring(docstring):
subdir = os.path.dirname(path)
if not os.path.exists(subdir):
os.makedirs(subdir)
open(path, 'w').write(mkdown)
with open(path, 'w') as f:
f.write(mkdown)

shutil.copyfile('../CONTRIBUTING.md', 'sources/contributing.md')
@@ -100,7 +100,7 @@ def vectorize_stories(data):
'$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
'$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
raise
tar = tarfile.open(path)


challenges = {
# QA1 with 10,000 samples
@@ -112,8 +112,9 @@ def vectorize_stories(data):
challenge = challenges[challenge_type]

print('Extracting stories for the challenge:', challenge_type)
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))
with tarfile.open(path) as tar:
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))

vocab = set()
for story, q, answer in train_stories + test_stories:
@@ -160,7 +160,7 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
'$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
'$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
raise
tar = tarfile.open(path)

# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
@@ -169,8 +169,9 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))
with tarfile.open(path) as tar:
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))

vocab = set()
for story, q, answer in train + test:
@@ -66,7 +66,8 @@
target_texts = []
input_characters = set()
target_characters = set()
lines = open(data_path, 'r', encoding='utf-8').read().split('\n')
with open(data_path, 'r', encoding='utf-8') as f:
lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
input_text, target_text = line.split('\t')
# We use "tab" as the "start sequence" character
@@ -29,7 +29,8 @@
target_texts = []
input_characters = set()
target_characters = set()
lines = open(data_path, 'r', encoding='utf-8').read().split('\n')
with open(data_path, 'r', encoding='utf-8') as f:
lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
input_text, target_text = line.split('\t')
# We use "tab" as the "start sequence" character
@@ -23,7 +23,8 @@
import io

path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = io.open(path, encoding='utf-8').read().lower()
with io.open(path, encoding='utf-8') as f:
text = f.read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
@@ -341,5 +341,5 @@ def build_discriminator():
Image.fromarray(img).save(
'plot_epoch_{0:03d}_generated.png'.format(epoch))

pickle.dump({'train': train_history, 'test': test_history},
open('acgan-history.pkl', 'wb'))
with open('acgan-history.pkl', 'wb') as f:
pickle.dump({'train': train_history, 'test': test_history}, f)
@@ -38,13 +38,12 @@
print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

@@ -62,16 +61,13 @@
for fname in sorted(os.listdir(path)):
if fname.isdigit():
fpath = os.path.join(path, fname)
if sys.version_info < (3,):
f = open(fpath)
else:
f = open(fpath, encoding='latin-1')
t = f.read()
i = t.find('\n\n') # skip header
if 0 < i:
t = t[i:]
texts.append(t)
f.close()
args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
with open(fpath, **args) as f:
t = f.read()
i = t.find('\n\n') # skip header
if 0 < i:
t = t[i:]
texts.append(t)
labels.append(label_id)

print('Found %s texts.' % len(texts))
@@ -205,7 +205,8 @@ def decode_predictions(preds, top=5):
CLASS_INDEX_PATH,
cache_subdir='models',
file_hash='c2c37ea517e94d9795004a39431a14cb')
CLASS_INDEX = json.load(open(fpath))
with open(fpath) as f:
CLASS_INDEX = json.load(f)
results = []
for pred in preds:
top_indices = pred.argsort()[-top:][::-1]
@@ -24,7 +24,8 @@
_config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json'))
if os.path.exists(_config_path):
try:
_config = json.load(open(_config_path))
with open(_config_path) as f:
_config = json.load(f)
except ValueError:
_config = {}
_floatx = _config.get('floatx', floatx())
@@ -20,17 +20,16 @@ def load_batch(fpath, label_key='labels'):
# Returns
A tuple `(data, labels)`.
"""
f = open(fpath, 'rb')
if sys.version_info < (3,):
d = cPickle.load(f)
else:
d = cPickle.load(f, encoding='bytes')
# decode utf8
d_decoded = {}
for k, v in d.items():
d_decoded[k.decode('utf8')] = v
d = d_decoded
f.close()
with open(fpath, 'rb') as f:
if sys.version_info < (3,):
d = cPickle.load(f)
else:
d = cPickle.load(f, encoding='bytes')
# decode utf8
d_decoded = {}
for k, v in d.items():
d_decoded[k.decode('utf8')] = v
d = d_decoded
data = d['data']
labels = d[label_key]

@@ -114,7 +114,5 @@ def get_word_index(path='imdb_word_index.json'):
path = get_file(path,
origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json',
file_hash='bfafd718b763782e994055a2d397834f')
f = open(path)
data = json.load(f)
f.close()
return data
with open(path) as f:
return json.load(f)
@@ -2603,10 +2603,9 @@ def save_weights(self, filepath, overwrite=True):
proceed = ask_to_proceed_with_overwrite(filepath)
if not proceed:
return
f = h5py.File(filepath, 'w')
save_weights_to_hdf5_group(f, self.layers)
f.flush()
f.close()
with h5py.File(filepath, 'w') as f:
save_weights_to_hdf5_group(f, self.layers)
f.flush()

def load_weights(self, filepath, by_name=False,
skip_mismatch=False, reshape=False):
@@ -2641,19 +2640,16 @@ def load_weights(self, filepath, by_name=False,
"""
if h5py is None:
raise ImportError('`load_weights` requires h5py.')
f = h5py.File(filepath, mode='r')
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']
if by_name:
load_weights_from_hdf5_group_by_name(
f, self.layers, skip_mismatch=skip_mismatch,
reshape=reshape)
else:
load_weights_from_hdf5_group(
f, self.layers, reshape=reshape)

if hasattr(f, 'close'):
f.close()
with h5py.File(filepath, mode='r') as f:
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']
if by_name:
load_weights_from_hdf5_group_by_name(
f, self.layers, skip_mismatch=skip_mismatch,
reshape=reshape)
else:
load_weights_from_hdf5_group(
f, self.layers, reshape=reshape)

def _updated_config(self):
"""Util hared between different serialization methods.
@@ -2027,13 +2027,12 @@ def fit_generator(self,
```python
def generate_arrays_from_file(path):
while 1:
f = open(path)
for line in f:
# create numpy arrays of input data
# and labels, from each line in the file
x1, x2, y = process_line(line)
yield ({'input_1': x1, 'input_2': x2}, {'output': y})
f.close()
with open(path) as f:
for line in f:
# create numpy arrays of input data
# and labels, from each line in the file
x1, x2, y = process_line(line)
yield ({'input_1': x1, 'input_2': x2}, {'output': y})
model.fit_generator(generate_arrays_from_file('/my_file.txt'),
steps_per_epoch=10000, epochs=10)
@@ -721,23 +721,21 @@ def set_weights(self, weights):
def load_weights(self, filepath, by_name=False, skip_mismatch=False, reshape=False):
if h5py is None:
raise ImportError('`load_weights` requires h5py.')
f = h5py.File(filepath, mode='r')
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']
with h5py.File(filepath, mode='r') as f:
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']

# Legacy support
if legacy_models.needs_legacy_support(self):
layers = legacy_models.legacy_sequential_layers(self)
else:
layers = self.layers
if by_name:
topology.load_weights_from_hdf5_group_by_name(f, layers,
skip_mismatch=skip_mismatch,
reshape=reshape)
else:
topology.load_weights_from_hdf5_group(f, layers, reshape=reshape)
if hasattr(f, 'close'):
f.close()
# Legacy support
if legacy_models.needs_legacy_support(self):
layers = legacy_models.legacy_sequential_layers(self)
else:
layers = self.layers
if by_name:
topology.load_weights_from_hdf5_group_by_name(f, layers,
skip_mismatch=skip_mismatch,
reshape=reshape)
else:
topology.load_weights_from_hdf5_group(f, layers, reshape=reshape)

def save_weights(self, filepath, overwrite=True):
if h5py is None:
@@ -753,10 +751,9 @@ def save_weights(self, filepath, overwrite=True):
else:
layers = self.layers

f = h5py.File(filepath, 'w')
topology.save_weights_to_hdf5_group(f, layers)
f.flush()
f.close()
with h5py.File(filepath, 'w') as f:
topology.save_weights_to_hdf5_group(f, layers)
f.flush()

def compile(self, optimizer, loss,
metrics=None,
@@ -1227,13 +1224,12 @@ def fit_generator(self, generator,
```python
def generate_arrays_from_file(path):
while 1:
f = open(path)
for line in f:
# create Numpy arrays of input data
# and labels, from each line in the file
x, y = process_line(line)
yield (x, y)
f.close()
with open(path) as f:
for line in f:
# create Numpy arrays of input data
# and labels, from each line in the file
x, y = process_line(line)
yield (x, y)
model.fit_generator(generate_arrays_from_file('/my_file.txt'),
steps_per_epoch=1000, epochs=10)
@@ -66,10 +66,9 @@ def chunk_read(response, chunk_size=8192, reporthook=None):
else:
break

response = urlopen(url, data)
with open(filename, 'wb') as fd:
for chunk in chunk_read(response, reporthook=reporthook):
fd.write(chunk)
with closing(urlopen(url, data)) as response, open(filename, 'wb') as fd:
for chunk in chunk_read(response, reporthook=reporthook):
fd.write(chunk)
else:
from six.moves.urllib.request import urlretrieve

@@ -31,14 +31,13 @@ def in_tmpdir(tmpdir):
def create_dataset(h5_path='test.h5'):
X = np.random.randn(200, 10).astype('float32')
y = np.random.randint(0, 2, size=(200, 1))
f = h5py.File(h5_path, 'w')
# Creating dataset to store features
X_dset = f.create_dataset('my_data', (200, 10), dtype='f')
X_dset[:] = X
# Creating dataset to store labels
y_dset = f.create_dataset('my_labels', (200, 1), dtype='i')
y_dset[:] = y
f.close()
with h5py.File(h5_path, 'w') as f:
# Creating dataset to store features
X_dset = f.create_dataset('my_data', (200, 10), dtype='f')
X_dset[:] = X
# Creating dataset to store labels
y_dset = f.create_dataset('my_labels', (200, 1), dtype='i')
y_dset[:] = y


def test_io_utils(in_tmpdir):
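
One pattern from the data_utils hunk above deserves a note: `urlopen()` responses (at least on
older Pythons) expose a `close()` method but do not implement the context-manager protocol, which
is why they are wrapped in `contextlib.closing`. A hedged, self-contained sketch of that pattern,
with a placeholder URL and file name:

```python
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen         # Python 2

# closing() turns any object with a .close() method into a context
# manager, so the response is closed even if reading or writing fails.
with closing(urlopen('https://example.com/data.bin')) as response, \
        open('data.bin', 'wb') as fd:
    fd.write(response.read())
```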
