diff --git a/indexwarcsjob.py b/indexwarcsjob.py index 7377176..660b185 100644 --- a/indexwarcsjob.py +++ b/indexwarcsjob.py @@ -30,7 +30,7 @@ class IndexWARCJob(MRJob): 'mapreduce.reduce.speculative': 'false', 'mapreduce.job.jvm.numtasks': '-1', - 'mapreduce.input.lineinputformat.linespermap': 1, + 'mapreduce.input.lineinputformat.linespermap': 2, } def configure_options(self): @@ -63,7 +63,7 @@ def mapper_init(self): self.index_options = { 'surt_ordered': True, 'sort': True, - 'cdx06': True, + 'cdxj': True, 'minimal': True } @@ -77,7 +77,7 @@ def mapper(self, _, line): def _conv_warc_to_cdx_path(self, warc_path): # set cdx path - cdx_path = warc_path.replace('common-crawl/crawl-data', '/cdx') + cdx_path = warc_path.replace('common-crawl/crawl-data', 'cdx2') cdx_path = cdx_path.replace('.warc.gz', '.cdx.gz') return cdx_path diff --git a/requirements.txt b/requirements.txt index 63c3f1f..8c8af65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ mrjob boto --e git+https://github.com/ikreymer/pywb.git@indexing-work#egg=pywb +pywb +#-e git+https://github.com/ikreymer/pywb.git@0.9.0b#egg=pywb -e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop diff --git a/runindexwarcs.sh b/runindexwarcs.sh index bb9f55d..b4a0ecb 100755 --- a/runindexwarcs.sh +++ b/runindexwarcs.sh @@ -4,7 +4,8 @@ source ./index_env.sh python indexwarcsjob.py \ --conf-path ./mrjob.conf \ ---cdx_bucket=$CDX_BUCKET \ +--cdx_bucket=$WARC_CDX_BUCKET \ +--no-output \ --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -r emr $WARC_MANIFEST &> /tmp/emrrun.log & diff --git a/runzipcluster.sh b/runzipcluster.sh index c754599..1e23e86 100755 --- a/runzipcluster.sh +++ b/runzipcluster.sh @@ -10,5 +10,5 @@ python zipnumclusterjob.py \ --conf-path ./mrjob.conf \ --cmdenv AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ --cmdenv AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ --r emr $WARC_CDX &> /tmp/emrrun.log & +-r emr $WARC_CDX &> /tmp/emrrun2.log & diff --git a/zipnumclusterjob.py b/zipnumclusterjob.py index 7233b24..6bf4481 100644 --- a/zipnumclusterjob.py +++ b/zipnumclusterjob.py @@ -27,6 +27,8 @@ class ZipNumClusterJob(MRJob): 'mapreduce.input.fileinputformat.split.maxsize': '50000000', 'mapreduce.map.speculative': 'false', 'mapreduce.reduce.speculative': 'false', + 'mapreduce.output.fileoutputformat.compress': 'false', + 'mapreduce.job.reduce.slowstart.completedmaps': '0.8', 'mapreduce.job.jvm.numtasks': '-1' } @@ -44,6 +46,7 @@ def configure_options(self): self.add_passthrough_option('--convert', dest='convert', action='store_true', + default=False, help='Convert CDX through _convert_line() function') self.add_passthrough_option('--shards', dest='shards',