Skip to content

Commit

Permalink
add -d flag for directory, and -c flag for collection
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Apr 16, 2015
1 parent 02dd1b4 commit 237ef28
Showing 1 changed file with 19 additions and 6 deletions.
25 changes: 19 additions & 6 deletions cdx-index-client.py
Expand Up @@ -10,6 +10,7 @@
import sys
import signal
import random
+import os

import logging

Expand Down Expand Up @@ -54,6 +55,7 @@ def fetch_result_page(job_params):
timeout = job_params['timeout']
gzipped = job_params['gzipped']
headers = job_params['headers']
+dir_ = job_params['dir']

query = {'url': url,
'page': page}
Expand Down Expand Up @@ -100,6 +102,12 @@ def fetch_result_page(job_params):
r.close()
return

+# use dir, if provided
+if dir_:
+if not os.path.isdir(dir_):
+os.makedirs(dir_)
+filename = os.path.join(dir_, filename)
+
if not gzipped:
with open(filename, 'w+b') as fh:
for chunk in r.iter_content(1024):
Expand Down Expand Up @@ -205,9 +213,6 @@ def main():

parser = ArgumentParser('CDX Index API Client')

-parser.add_argument('collection', default='CC-MAIN-2015-06', nargs='?',
-help='The index collection to use')
-
parser.add_argument('url',
help=url_help)

Expand All @@ -229,11 +234,18 @@ def main():
parser.add_argument('-o', '--output-prefix',
help='Custom output prefix, append with -NN for each page')

+parser.add_argument('-d', '--directory',
+help='Specify custom output directory')
+
parser.add_argument('--page-size', type=int,
help='size of each page in blocks, >=1')

-parser.add_argument('--cdx-server-url',
-help='Set endpoint for CDX Server API')
+group = parser.add_mutually_exclusive_group()
+group.add_argument('-c', '--coll', default='CC-MAIN-2015-06',
+help='The index collection to use')
+
+group.add_argument('--cdx-server-url',
+help='Set endpoint for CDX Server API')

parser.add_argument('--timeout', default=30, type=int,
help='HTTP read timeout before retry')
Expand Down Expand Up @@ -271,7 +283,7 @@ def main():
if r.cdx_server_url:
api_url = r.cdx_server_url
else:
-api_url = DEF_API_BASE + r.collection + '-index'
+api_url = DEF_API_BASE + r.coll + '-index'

logging.debug('Getting Num Pages...')
num_pages = get_num_pages(api_url, r.url, r.page_size)
Expand Down Expand Up @@ -315,6 +327,7 @@ def get_page_job(page):
job['max_retries'] = r.max_retries
job['gzipped'] = r.gzipped
job['headers'] = r.header
+job['dir'] = r.directory
return job

if r.pages:
Expand Down

0 comments on commit 237ef28

Please sign in to comment.