# Forked from mideind/GreynirServer (sortfile.py).
# (GitHub page chrome and scraped line-number column removed.)
# -*- coding: utf-8 -*-
"""
Sort utility for large UTF-8 text files
Adapted from Recipe 466302: Sorting big files the Python 2.4 way
by Nicolas Lehuen
http://code.activestate.com/recipes/576755-sorting-big-files-the-python-26-way/
Example usage:
python sortfile.py resources/ordalisti.txt resources/ordalisti.sorted.txt -b 200000
"""
import os
import io
from tempfile import gettempdir
from itertools import islice, cycle
from collections import namedtuple
import heapq
# Pairs a sort key with its original object; tuple comparison in
# heapq.merge then orders elements by key first, object second.
Keyed = namedtuple("Keyed", ["key", "obj"])
def keyfunc(line):
    """Sort key for a single line: the identity, giving plain
    lexicographic (code-point) ordering of the lines."""
    return line
def merge(*iterables):
    """Merge several pre-sorted iterables into one sorted stream.

    Each element is wrapped in a Keyed tuple so that heapq.merge
    compares by keyfunc(element) first; the original elements are
    then yielded in overall sorted order.

    Based on code posted by Scott David Daniels in c.l.p.:
    http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d
    """
    wrapped = [
        (Keyed(keyfunc(item), item) for item in iterable)
        for iterable in iterables
    ]
    for keyed in heapq.merge(*wrapped):
        yield keyed.obj
def batch_sort(infile, output, buffer_size=32000, tempdirs=None):
    """Sort a large UTF-8 text file with an external merge sort.

    The input is read in chunks of ``buffer_size`` lines; each chunk is
    sorted in memory and written to a temporary file, and the sorted
    chunk files are then merged line-by-line into the output file.

    Args:
        infile: path of the UTF-8 text file to sort.
        output: path of the sorted output file (overwritten).
        buffer_size: number of lines per in-memory chunk.
        tempdirs: optional list of directories for the temporary chunk
            files; cycled over so consecutive chunks can be placed on
            different physical disks. Defaults to the system temp dir.
    """
    if tempdirs is None:
        tempdirs = []
    if not tempdirs:
        tempdirs.append(gettempdir())
    chunks = []
    try:
        with io.open(infile, mode='r', buffering=64*1024, encoding='utf8') as input_file:
            print(u"Opened input {0}".format(infile))
            input_iterator = iter(input_file)
            # Round-robin over the temp directories so chunk I/O can be
            # spread across several disks.
            for tempdir in cycle(tempdirs):
                current_chunk = list(islice(input_iterator, buffer_size))
                if not current_chunk:
                    break
                current_chunk.sort(key=keyfunc)
                fname = '%06i' % len(chunks)
                output_chunk = io.open(
                    os.path.join(tempdir, fname),
                    mode='w+', buffering=64*1024, encoding='utf8'
                )
                print(u"Writing tempfile {0}/{1}".format(tempdir, fname))
                chunks.append(output_chunk)
                output_chunk.writelines(current_chunk)
                output_chunk.flush()
                # Rewind so the merge phase can read the chunk back.
                output_chunk.seek(0)
        print(u"Writing outfile {0}".format(output))
        with io.open(output, mode='w', buffering=64*1024, encoding='utf8') as output_file:
            output_file.writelines(merge(*chunks))
    finally:
        # Best-effort cleanup: close and delete every temporary chunk file.
        for chunk in chunks:
            try:
                chunk.close()
                os.remove(chunk.name)
            except (OSError, ValueError) as e:
                # Narrowed from a bare except: report the problem instead
                # of silently swallowing it, and never catch
                # KeyboardInterrupt/SystemExit during cleanup.
                print(u"Exception when closing chunk: {0}".format(e))
if __name__ == '__main__':
    # argparse replaces the deprecated optparse module and validates the
    # two required positional arguments (optparse left args[0]/args[1]
    # to fail with a bare IndexError when missing).
    import argparse
    parser = argparse.ArgumentParser(
        description='Sort a large UTF-8 text file using an external merge sort.'
    )
    parser.add_argument('infile', help='path of the file to sort')
    parser.add_argument('outfile', help='path of the sorted output file')
    parser.add_argument(
        '-b', '--buffer',
        dest='buffer_size',
        type=int,
        default=32000,
        help='Size of the line buffer. The file to sort is divided into '
             'chunks of that many lines. Default: 32,000 lines.'
    )
    parser.add_argument(
        '-t', '--tempdir',
        dest='tempdirs',
        action='append',
        default=[],
        help='Temporary directory to use. You might get performance '
             'improvements if the temporary directory is not on the same '
             'physical disk as the input and output directories. You can '
             'even try providing multiple directories on different '
             'physical disks. Use multiple -t options to do that.'
    )
    args = parser.parse_args()
    batch_sort(args.infile, args.outfile, args.buffer_size, args.tempdirs)