Skip to content

Commit

Permalink
remove the extracted archive files after parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
Johannes Villmow committed Jul 17, 2019
1 parent 0cf4f4a commit f4f9aeb
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion extract_text.py
Expand Up @@ -47,7 +47,9 @@ def parse_archive(archive_fp, out_dir, n_procs, chunk_size=100):
tmp_data_dir = pl.Path(archive_fp).with_suffix(".tmp")

# extract tar first
if not tmp_data_dir.exists():
if tmp_data_dir.exists():
raise FileExistsError("Trying to extract archive to {}".format(tmp_data_dir))
else:
tar = tarfile.open(archive_fp)
tar.extractall(tmp_data_dir)
tar.close()
Expand Down Expand Up @@ -75,6 +77,12 @@ def file_gen():
save_parsed_file(filename, text, out_dir)
print("Could not parse {} files".format(unparsable))

# remove the extracted files
for filename in tmp_data_dir.iterdir():
if filename.is_file():
filename.unlink()
# and then the now (hopefully) empty directory
tmp_data_dir.rmdir()

if __name__ == "__main__":
month = extract_month(args.html_archive)
Expand Down

0 comments on commit f4f9aeb

Please sign in to comment.