Permalink
Checking mergeability…
Don’t worry, you can still create the pull request.
Comparing changes
Open a pull request
- 11 commits
- 7 files changed
- 0 commit comments
- 5 contributors
Commits on Dec 12, 2016
Commits on Jun 27, 2018
…Added filter to only accept records of 100s, 200s and 300s. Changed hadoop input type to lineinput to run faster in the new cluster
Commits on May 23, 2019
Commits on Jun 19, 2019
Unified
Split
Showing
with
203 additions
and 1 deletion.
- +2 −0 .gitignore
- +119 −0 IndexArcs.py
- +7 −0 RunhadoopJobs.sh
- +40 −0 cdx2cdxj.py
- +16 −0 generateArcErrors.sh
- +1 −1 requirements.txt
- +18 −0 teste.cdx
| @@ -40,3 +40,5 @@ mrjob.conf | ||
| index_env.sh | ||
|
|
||
| .vagrant | ||
| arcsList | ||
| src | ||
| @@ -0,0 +1,119 @@ | ||
| import boto | ||
| import shutil | ||
| from mrjob.job import MRJob | ||
| from os.path import basename | ||
| import os | ||
| import errno | ||
| import re | ||
| import sys | ||
| import io | ||
| import gzip | ||
| import codecs | ||
| import time | ||
| import json | ||
| from mrjob.job import MRJob | ||
| from mrjob.protocol import RawValueProtocol | ||
|
|
||
| from io import StringIO | ||
| from io import BytesIO | ||
| from tempfile import TemporaryFile | ||
| from pywb.warc.cdxindexer import write_cdx_index | ||
| from pywb.utils.canonicalize import UrlCanonicalizeException | ||
| from json.decoder import JSONDecodeError | ||
|
|
||
|
|
||
| from gzip import GzipFile | ||
| import urllib | ||
| from urllib.request import urlopen | ||
|
|
||
|
|
||
WORD_RE = re.compile(r"[\w']+")  # NOTE(review): not referenced anywhere in this file — looks like a leftover from the mrjob word-count template; confirm before removing.
|
|
||
class IndexArcs(MRJob):
    """Index (W)ARC files to CDXJ and emit only 1xx/2xx/3xx records.

    Each input line (delivered by Hadoop's NLineInputFormat) carries the
    URL/path of one archive file; the mapper downloads it, builds a gzipped
    CDXJ index with pywb and writes the filtered index lines to stdout.
    """

    INPUT_PROTOCOL = RawValueProtocol
    OUTPUT_PROTOCOL = RawValueProtocol

    # One archive per map task: NLineInputFormat hands each mapper
    # "<byte offset>\t<line>" pairs, 'linespermap' lines at a time.
    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'

    JOBCONF = {
        'mapreduce.task.timeout': '9600000',
        'mapreduce.input.fileinputformat.split.maxsize': '50000000',
        'mapreduce.map.speculative': 'false',
        'mapreduce.reduce.speculative': 'false',
        'mapreduce.job.jvm.numtasks': '-1',
        'mapreduce.input.lineinputformat.linespermap': 1,
        'mapred.job.priority': 'VERY_HIGH',
    }

    def configure_options(self):
        """Custom command line options for indexing."""
        super(IndexArcs, self).configure_options()

    def mapper_init(self):
        # Options forwarded verbatim to pywb's write_cdx_index().
        self.index_options = {
            'surt_ordered': True,
            'sort': True,
            'cdxj': True,
        }

    def mapper(self, _, line):
        """Download and index one archive, retrying on broken pipes.

        NLineInputFormat lines look like "<offset>\t<archive-url>", so the
        archive location is the second tab-separated field.
        """
        # TODO(review): only valid in hadoop mode — check input type if this
        # should also work in local mode (original comment said the same).
        warc_path = line.split('\t')[1]
        attempts = 5  # retry budget for transient EPIPE errors (busy servers)
        while True:
            try:
                self._load_and_index(warc_path)
                return
            except (UrlCanonicalizeException, JSONDecodeError) as exc:
                # Known, non-retryable indexing failures: log and give up.
                self._report_failure(warc_path, exc)
                return
            except Exception as exc:
                if getattr(exc, 'errno', None) == errno.EPIPE:
                    if attempts > 0:
                        # Bugfix: the original decremented 'attempts' but
                        # never looped, so only one retry ever happened.
                        attempts -= 1
                        time.sleep(5)  # wait 5 seconds - servers must be busy
                        continue
                    # Bugfix: old message said "3 times" but the budget is 5.
                    self.stderr.write(
                        "Broken Pipe tried more than 5 times.\n".encode('utf-8'))
                self._report_failure(warc_path, exc)
                return

    def _report_failure(self, warc_path, exc):
        # Record the failing archive on stderr so generateArcErrors.sh can
        # collect it later (it greps for the 'Arcname:' marker).
        self.stderr.write(
            ("Arcname:\t" + warc_path + "\n" + str(exc)).encode('utf-8'))

    def _load_and_index(self, warc_path):
        """Fetch one archive, index it to CDXJ, and emit filtered lines.

        Raises whatever urlopen/pywb/json raise; the mapper handles retries
        and error reporting.
        """
        warctemp = BytesIO(urlopen(warc_path).read())

        with TemporaryFile(mode='w+b') as cdxtemp:
            # pywb writes a gzip-compressed index into the temp file;
            # decompress it back into a string afterwards.
            cdxfile = GzipFile(fileobj=cdxtemp, mode='w+b')
            write_cdx_index(cdxfile, warctemp, basename(warc_path),
                            **self.index_options)
            cdxfile.close()
            cdxtemp.seek(0)
            cdx_records = gzip.GzipFile(
                fileobj=BytesIO(cdxtemp.read())).read().decode('utf-8')

            for cdx_line in cdx_records.split("\n"):
                # Bugfix: split() leaves a trailing '' which used to reach
                # json.loads('') and abort the file with JSONDecodeError.
                if not cdx_line:
                    continue
                if cdx_line.startswith(("Error", "Invalid ARC record",
                                        "Unknown archive format")):
                    continue
                # CDXJ line = "<surt key> <timestamp> <json blob>".
                cdx_record = json.loads(" ".join(cdx_line.split(" ")[2:]))
                status = cdx_record.get('status', None)
                # Bugfix: re.search(r'2.*|3.*|1.*', status) matched a 1/2/3
                # ANYWHERE in the status (e.g. '412', '503' passed); the
                # commit intent is to keep only 1xx/2xx/3xx responses.
                if status and status[:1] in ('1', '2', '3'):
                    self.stdout.write((cdx_line + "\n").encode('utf-8'))


if __name__ == '__main__':
    IndexArcs.run()
| @@ -0,0 +1,7 @@ | ||
#!/bin/bash
# Launch one background hadoop indexing job per collection name, where the
# file given as $1 lists one collection name per line.
export HADOOP_HOME=/opt/hadoop-3.0.3
FILE=$1
# Bugfix: 'IFS= read -r' keeps backslashes literal and preserves leading/
# trailing whitespace; quoting "$FILE" survives paths containing spaces.
while IFS= read -r line; do
    echo "$line"
    python3.5 /opt/webarchive-indexing/IndexArcs.py -r hadoop /opt/webarchive-indexing/arcsList/"$line"_ARCS.txt --step-num=1 --output-dir hdfs:///user/root/"$line" --no-output --jobconf mapred.job.name="$line" --python-bin /usr/local/bin/python3.5 &
done < "$FILE"
| @@ -0,0 +1,40 @@ | ||
| import argparse | ||
| import json | ||
| from collections import defaultdict | ||
|
|
||
| from pywb.utils.canonicalize import canonicalize | ||
|
|
||
|
|
||
# TODO generate an ordered dictionary (python 3.6 feature)
# TODO adapt field extraction to the number of fields available (different CDX formats)
def transform_cdx(cdx_path):
    """Print each plain-CDX line of *cdx_path* as a CDXJ line on stdout.

    CDXJ output line = "<canonicalized surt key> <timestamp> <json blob>".
    Lines that cannot be canonicalized (e.g. the CDX header) print "Header".
    Assumes the fixed 11-field CDX layout — TODO confirm against input files.
    """
    with open(cdx_path, mode="r") as cdxfile:
        for line in cdxfile:
            fields = line.split(' ')
            # Bugfix: defaultdict() with no factory gained nothing; a plain
            # dict has identical behavior here.
            record = {}
            record['url'] = fields[2]
            record['mime'] = fields[3]
            record['status'] = fields[4]
            record['digest'] = fields[5]
            # NOTE(review): length is hard-coded to '0' in the original —
            # the real record length is not carried over; confirm intent.
            record['length'] = '0'
            record['offset'] = fields[7]
            record['filename'] = fields[8].replace('\n', '')
            try:
                # Bugfix: this file used Python 2 'print' statements although
                # the repo runs under python3.5 (see RunhadoopJobs.sh); the
                # print() call form works on both interpreters.
                print("{} {} {}".format(
                    canonicalize(fields[0], surt_ordered=True),
                    fields[1], json.dumps(record)))
            except ValueError:
                print("Header")
|
|
||
|
|
||
def main():
    """Parse the command line and run the CDX-to-CDXJ transformation."""
    arg_parser = argparse.ArgumentParser(
        description='Transform CDX to CDXJ format')
    arg_parser.add_argument('cdx', help='the location of the cdx file')
    parsed = arg_parser.parse_args()
    transform_cdx(parsed.cdx)


if __name__ == "__main__":
    main()
| @@ -0,0 +1,16 @@ | ||
# This script creates for each Job (collection)
# File named AllErrors.txt with all generated errors from stderr
# File named FailedArcsUnique.txt with the list of Arcs that failed indexing to CDX
for dir in /opt/hadoop-1.2.1/logs/joblogs/userlogs/*/
do
    dir=${dir%*/}
    # Bugfix: 'cd ${dir##*/}' cd'd into a RELATIVE basename, which only
    # worked when the script was already run from inside .../userlogs.
    # cd to the absolute path inside a subshell so the caller's working
    # directory is never touched and a failed cd skips the job dir.
    (
        cd "$dir" || exit
        rm -f AllErrors.txt FailedArcs.txt FailedArcsUnique.txt
        echo "${dir##*/}"
        find . -name "stderr" -exec cat {} \; > AllErrors.txt
        grep 'Arcname:' AllErrors.txt > FailedArcs.txt
        sort FailedArcs.txt | uniq > FailedArcsUnique.txt
    )
done
| @@ -1,5 +1,5 @@ | ||
| mrjob | ||
| boto | ||
| pywb | ||
| pywb==0.30.1 | ||
| #-e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop | ||
| -e git+https://github.com/commoncrawl/python-hadoop.git#egg=master |
| @@ -0,0 +1,18 @@ | ||
| mocho.pt/cab/5/rei_justino.php 20061230223546 mocho.pt text/html 200 B3W3NZII6TU5L7PELLOFAN64B5QPDSOE - 95173778 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| mocho.pt/cab/5/sopa_letras.php 20061230223556 mocho.pt text/html 200 RBVKS6E66ZQIB6WSRUJFPCWFICJKVJYS - 95176081 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| mocho.pt/cab/5/pra_mesa.php 20061230223526 mocho.pt text/html 200 C4FXGVSQUFRMZIIR43FP5UI6MIOX3Z62 - 95178421 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multiassistencia.mediaminds.pt/backoffice/output_efile.aspx?sid=4ab0d388-983a-4bc8-8767-6fbbc3f9fbdb&cntx=m5p1xpo/uifkxap3cm7pvb2mezezodzurrnzlyukyjtqhhlphkdlfe56+zdajjomoagvfzf2ovoghqo1abpo5g==&idf=11 20070104001051 multiassistencia.mediaminds.pt image/pjpeg 200 6RCRPT4UUC2F6JT6Z64VRQ4M6KA7EVGJ - 95180307 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multiassistencia.mediaminds.pt/backoffice/output_efile.aspx?sid=4ab0d388-983a-4bc8-8767-6fbbc3f9fbdb&cntx=bsnsmnzevpruhquxhtdbl9pozweaebrozievuv+j4y006nxupekd7i8rixpl3tmvmhtv5xvidldccxyyslzgsq==&idf=9 20070104001100 multiassistencia.mediaminds.pt image/pjpeg 200 NNX3C2BL5SOMWVB7AYOCKB4HG2IC6AQB - 95184969 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| morangosacucar.com.sapo.pt/historia.htm 20070104150909 morangosacucar.com.sapo.pt text/html 200 APSLBN5L7VW2K7NCYEZYVOXABMWVNK4G - 95192793 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| morangosacucar.com.sapo.pt/media.htm 20070104150901 morangosacucar.com.sapo.pt text/html 200 OTOTWQ2H7ZVFV3C3AXTXXBMZMUNJLSOC - 95195137 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/269812/25 20070105144535 multimedia.iol.pt image/gif 200 ZTQYUUXBORLQONOSC3US4RGZFSWYB5F4 - 95197547 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/271472/31 20070105144543 multimedia.iol.pt image/gif 200 345SDMXKREJABG4SUEZJYM6O2CBRKV7N - 95198730 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/271622/55 20070105144551 multimedia.iol.pt image/gif 200 V6YSV2JSAJ3QQ3MW5HV5EJS47RVJ6UEG - 95201317 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/280570/31 20070105144600 multimedia.iol.pt image/gif 200 7EE7VT5HYLIEMSJAMHJKVAYNQB5TQZ6A - 95204854 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/280587/31 20070105144609 multimedia.iol.pt image/gif 200 B6H45RNIOSHEHLVIB6UYFC44PGWV3EBB - 95206115 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/280787/31 20070105144617 multimedia.iol.pt image/gif 200 VT6FY2XKZ4AAD75FDG47HLLVOZBLV32I - 95207414 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/281803/31 20070105144625 multimedia.iol.pt image/gif 200 7RWNK6TU6ZECSIWMR64IXCHV2OXN4FDS - 95208947 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/47415/146 20070105144633 multimedia.iol.pt image/gif 200 UZ4NNPL7RV6B7VJ3USPFN7MOGQAFNYTV - 95210414 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/265683/31 20070105144527 multimedia.iol.pt image/gif 200 RKCDJCULPJFQ2N77YLF3FF2KGH56PYZW - 95212565 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.iol.pt/oratvi/multimedia/imagem/id/53052/460 20070105144644 multimedia.iol.pt image/gif 200 RFU5P46TOZXDC6W6W5434XWIRGZ56P5G - 95213847 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz | ||
| multimedia.rtp.pt/envia_file.php?file=/at3/106787-0610021330.mp3&name=h%EF%BF%BD%20vida%20em%20markl 20070106130144 multimedia.rtp.pt audio/mp3 200 A6WUXU2EPQOG4W4DBUWBCKFVEQZDQDVV - 95222331 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz |