Skip to content

Commit

Permalink
minor fixes from latest CC index build:
Browse files (browse the repository at this point in the history)
- fix typos
- update reducer start for cluster job to be much later
- update reqs
  • Loading branch information
ikreymer committed Mar 29, 2015
1 parent 70339a0 commit 0005813
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 6 deletions.
6 changes: 3 additions & 3 deletions indexwarcsjob.py
Expand Up @@ -30,7 +30,7 @@ class IndexWARCJob(MRJob):
'mapreduce.reduce.speculative': 'false',
'mapreduce.job.jvm.numtasks': '-1',

'mapreduce.input.lineinputformat.linespermap': 1,
'mapreduce.input.lineinputformat.linespermap': 2,
}

def configure_options(self):
Expand Down Expand Up @@ -63,7 +63,7 @@ def mapper_init(self):
self.index_options = {
'surt_ordered': True,
'sort': True,
'cdx06': True,
'cdxj': True,
'minimal': True
}

Expand All @@ -77,7 +77,7 @@ def mapper(self, _, line):

def _conv_warc_to_cdx_path(self, warc_path):
# set cdx path
cdx_path = warc_path.replace('common-crawl/crawl-data', '/cdx')
cdx_path = warc_path.replace('common-crawl/crawl-data', 'cdx2')
cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz')
return cdx_path

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
mrjob
boto
-e git+https://github.com/ikreymer/pywb.git@indexing-work#egg=pywb
pywb
#-e git+https://github.com/ikreymer/pywb.git@0.9.0b#egg=pywb
-e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop
3 changes: 2 additions & 1 deletion runindexwarcs.sh
Expand Up @@ -4,7 +4,8 @@ source ./index_env.sh

python indexwarcsjob.py \
--conf-path ./mrjob.conf \
--cdx_bucket=$CDX_BUCKET \
--cdx_bucket=$WARC_CDX_BUCKET \
--no-output \
--cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-r emr $WARC_MANIFEST &> /tmp/emrrun.log &
Expand Down
2 changes: 1 addition & 1 deletion runzipcluster.sh
Expand Up @@ -10,5 +10,5 @@ python zipnumclusterjob.py \
--conf-path ./mrjob.conf \
--cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-r emr $WARC_CDX &> /tmp/emrrun.log &
-r emr $WARC_CDX &> /tmp/emrrun2.log &

3 changes: 3 additions & 0 deletions zipnumclusterjob.py
Expand Up @@ -27,6 +27,8 @@ class ZipNumClusterJob(MRJob):
'mapreduce.input.fileinputformat.split.maxsize': '50000000',
'mapreduce.map.speculative': 'false',
'mapreduce.reduce.speculative': 'false',
'mapreduce.output.fileoutputformat.compress': 'false',
'mapreduce.job.reduce.slowstart.completedmaps': '0.8',
'mapreduce.job.jvm.numtasks': '-1'
}

Expand All @@ -44,6 +46,7 @@ def configure_options(self):

self.add_passthrough_option('--convert', dest='convert',
action='store_true',
default=False,
help='Convert CDX through _convert_line() function')

self.add_passthrough_option('--shards', dest='shards',
Expand Down

0 comments on commit 0005813

Please sign in to comment.