Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
  • 11 commits
  • 7 files changed
  • 0 commit comments
  • 5 contributors
Commits on Jun 27, 2018
…f 100s, 200s and 300s. Changed hadoop input type to lineinput to run faster in the new cluster
Commits on May 20, 2019
Commits on May 21, 2019
Commits on May 23, 2019
Showing with 203 additions and 1 deletion.
  1. +2 −0 .gitignore
  2. +119 −0 IndexArcs.py
  3. +7 −0 RunhadoopJobs.sh
  4. +40 −0 cdx2cdxj.py
  5. +16 −0 generateArcErrors.sh
  6. +1 −1 requirements.txt
  7. +18 −0 teste.cdx
@@ -40,3 +40,5 @@ mrjob.conf
index_env.sh

.vagrant
arcsList
src
@@ -0,0 +1,119 @@
import boto
import shutil
from mrjob.job import MRJob
from os.path import basename
import os
import errno
import re
import sys
import io
import gzip
import codecs
import time
import json
from mrjob.job import MRJob
from mrjob.protocol import RawValueProtocol

from io import StringIO
from io import BytesIO
from tempfile import TemporaryFile
from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.canonicalize import UrlCanonicalizeException
from json.decoder import JSONDecodeError


from gzip import GzipFile
import urllib
from urllib.request import urlopen


# NOTE(review): compiled but never used anywhere in this module — presumably
# left over from an mrjob word-count template; candidate for removal.
WORD_RE = re.compile(r"[\w']+")

class IndexArcs(MRJob):
    """MRJob that downloads the (W)ARC file named on each input line and
    emits CDXJ index lines for records with 1xx/2xx/3xx HTTP statuses."""

    INPUT_PROTOCOL = RawValueProtocol
    OUTPUT_PROTOCOL = RawValueProtocol

    # One input line per map task, so each mapper indexes a single archive.
    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'

    JOBCONF = {'mapreduce.task.timeout': '9600000',
               'mapreduce.input.fileinputformat.split.maxsize': '50000000',
               'mapreduce.map.speculative': 'false',
               'mapreduce.reduce.speculative': 'false',
               'mapreduce.job.jvm.numtasks': '-1',
               # was the int 1; made a string for consistency with the
               # other jobconf values
               'mapreduce.input.lineinputformat.linespermap': '1',
               'mapred.job.priority': 'VERY_HIGH'
               }

    def configure_options(self):
        """Custom command line options for indexing (currently none)."""
        super(IndexArcs, self).configure_options()

    def mapper_init(self):
        # Options forwarded to pywb's write_cdx_index: SURT-ordered keys,
        # sorted output, CDXJ (JSON payload) record format.
        self.index_options = {
            'surt_ordered': True,
            'sort': True,
            'cdxj': True,
        }

    def mapper(self, _, line):
        """Index one archive.

        NLineInputFormat prepends a byte offset and a tab to every value,
        so the archive URL is the second tab-separated field.
        """
        warc_path = line.split('\t')[1]
        attempts = 5  # retry budget for transient Broken Pipe errors
        while True:
            try:
                self._load_and_index(warc_path)
                return
            except (UrlCanonicalizeException, JSONDecodeError) as exc:
                # Bad record in this archive: report it and move on.
                self._report_failure(warc_path, exc)
                return
            except Exception as exc:
                # getattr() avoids the AttributeError dance: most exceptions
                # have no .errno at all.
                if getattr(exc, 'errno', None) == errno.EPIPE:
                    if attempts > 0:
                        # wait 5 seconds - servers must be busy - then retry
                        attempts -= 1
                        time.sleep(5)
                        continue
                    self.stderr.write(
                        "Broken Pipe retried 5 times without success.\n"
                        .encode('utf-8'))
                self._report_failure(warc_path, exc)
                return

    def _report_failure(self, warc_path, exc):
        """Write the failed archive to stderr in the 'Arcname:' format
        that generateArcErrors.sh greps for."""
        self.stderr.write(
            ("Arcname:\t" + warc_path + "\n" + str(exc)).encode('utf-8'))

    def _load_and_index(self, warc_path):
        """Download *warc_path*, build a gzipped CDX index with pywb and
        write the 1xx/2xx/3xx records to stdout as CDXJ lines."""
        # Close the HTTP response as soon as the body is buffered.
        with urlopen(warc_path) as response:
            warctemp = BytesIO(response.read())

        with TemporaryFile(mode='w+b') as cdxtemp:
            cdxfile = GzipFile(fileobj=cdxtemp, mode='w+b')
            write_cdx_index(cdxfile, warctemp, basename(warc_path),
                            **self.index_options)
            cdxfile.close()
            cdxtemp.seek(0)
            with gzip.GzipFile(fileobj=BytesIO(cdxtemp.read())) as gz:
                cdx_records = gz.read().decode('utf-8')

        for record_line in cdx_records.split("\n"):
            # Skip the trailing blank line (would raise JSONDecodeError and
            # abort the rest of the archive) and indexer error messages.
            if not record_line or record_line.startswith(
                    ("Error", "Invalid ARC record", "Unknown archive format")):
                continue
            # CDXJ layout: <surt key> <timestamp> <json blob>.
            json_blob = " ".join(record_line.split(" ")[2:])
            status = json.loads(json_blob).get('status', None)
            # Keep only 1xx/2xx/3xx; the old regex search matched those
            # digits anywhere in the status (e.g. '412' slipped through).
            if status and status.startswith(('1', '2', '3')):
                self.stdout.write((record_line + "\n").encode('utf-8'))
if __name__ == '__main__':
    # Hand control to mrjob's command-line runner.
    IndexArcs.run()
@@ -0,0 +1,7 @@
#!/bin/bash
export HADOOP_HOME=/opt/hadoop-3.0.3
FILE=$1
while read line; do
echo "$line"
python3.5 /opt/webarchive-indexing/IndexArcs.py -r hadoop /opt/webarchive-indexing/arcsList/"$line"_ARCS.txt --step-num=1 --output-dir hdfs:///user/root/"$line" --no-output --jobconf mapred.job.name="$line" --python-bin /usr/local/bin/python3.5 &
done < $FILE
@@ -0,0 +1,40 @@
import argparse
import json
from collections import defaultdict

from pywb.utils.canonicalize import canonicalize


# TODO generate a ordered dictionary (python 3.6 feature)
# TODO adapt field extracting with the number of fields available (diferente formats of cdx)
# TODO generate a ordered dictionary (python 3.6 feature)
# TODO adapt field extracting with the number of fields available
# (different formats of cdx)
def transform_cdx(cdx_path):
    """Print each record of a space-separated CDX file as a CDXJ line.

    Output format: "<urlkey> <timestamp> <json payload>".  Lines whose
    first field cannot be canonicalized (e.g. the CDX header line) are
    printed as "Header".  Uses print() so the script runs under the
    python3.5 interpreter the rest of the project targets.
    """
    with open(cdx_path, mode="r") as cdxfile:
        for line in cdxfile:
            record_list = line.split(' ')

            # Assumed field layout (TODO confirm against the cdx producer):
            # 0 urlkey, 1 timestamp, 2 url, 3 mime, 4 status,
            # 5 digest, 6 (unused), 7 offset, 8 filename
            record_dict = {
                'url': record_list[2],
                'mime': record_list[3],
                'status': record_list[4],
                'digest': record_list[5],
                'length': '0',  # length is absent from this CDX variant
                'offset': record_list[7],
                'filename': record_list[8].replace('\n', ''),
            }
            try:
                print("{} {} {}".format(
                    canonicalize(record_list[0], surt_ordered=True),
                    record_list[1], json.dumps(record_dict)))
            except ValueError:
                # canonicalize rejects non-URL input such as header rows
                print("Header")


def main():
    """CLI entry point: parse the cdx file location and transform it."""
    arg_parser = argparse.ArgumentParser(description='Transform CDX to CDXJ format')
    arg_parser.add_argument('cdx', help='the location of the cdx file')
    parsed = arg_parser.parse_args()
    transform_cdx(parsed.cdx)


if __name__ == "__main__":
    main()
@@ -0,0 +1,16 @@
# This script creates for each Job (collection)
# File named AllErrors.txt with all generated errors from systderr
# File named FailedArcsUnique.txt with the list of Arcs that failed indexing to CDX
for dir in /opt/hadoop-1.2.1/logs/joblogs/userlogs/*/
do
dir=${dir%*/}
cd ${dir##*/}
rm -f AllErrors.txt
rm -f FailedArcs.txt
rm -f FailedArcsUnique.txt
echo ${dir##*/}
find . -name "stderr" -exec cat {} \; > AllErrors.txt
grep 'Arcname:' AllErrors.txt > FailedArcs.txt
sort FailedArcs.txt | uniq > FailedArcsUnique.txt
cd ..
done
@@ -1,5 +1,5 @@
mrjob
boto
pywb
pywb==0.30.1
#-e git+https://github.com/matteobertozzi/Hadoop.git#egg=hadoop&subdirectory=python-hadoop
-e git+https://github.com/commoncrawl/python-hadoop.git#egg=master
@@ -0,0 +1,18 @@
mocho.pt/cab/5/rei_justino.php 20061230223546 mocho.pt text/html 200 B3W3NZII6TU5L7PELLOFAN64B5QPDSOE - 95173778 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
mocho.pt/cab/5/sopa_letras.php 20061230223556 mocho.pt text/html 200 RBVKS6E66ZQIB6WSRUJFPCWFICJKVJYS - 95176081 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
mocho.pt/cab/5/pra_mesa.php 20061230223526 mocho.pt text/html 200 C4FXGVSQUFRMZIIR43FP5UI6MIOX3Z62 - 95178421 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multiassistencia.mediaminds.pt/backoffice/output_efile.aspx?sid=4ab0d388-983a-4bc8-8767-6fbbc3f9fbdb&cntx=m5p1xpo/uifkxap3cm7pvb2mezezodzurrnzlyukyjtqhhlphkdlfe56+zdajjomoagvfzf2ovoghqo1abpo5g==&idf=11 20070104001051 multiassistencia.mediaminds.pt image/pjpeg 200 6RCRPT4UUC2F6JT6Z64VRQ4M6KA7EVGJ - 95180307 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multiassistencia.mediaminds.pt/backoffice/output_efile.aspx?sid=4ab0d388-983a-4bc8-8767-6fbbc3f9fbdb&cntx=bsnsmnzevpruhquxhtdbl9pozweaebrozievuv+j4y006nxupekd7i8rixpl3tmvmhtv5xvidldccxyyslzgsq==&idf=9 20070104001100 multiassistencia.mediaminds.pt image/pjpeg 200 NNX3C2BL5SOMWVB7AYOCKB4HG2IC6AQB - 95184969 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
morangosacucar.com.sapo.pt/historia.htm 20070104150909 morangosacucar.com.sapo.pt text/html 200 APSLBN5L7VW2K7NCYEZYVOXABMWVNK4G - 95192793 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
morangosacucar.com.sapo.pt/media.htm 20070104150901 morangosacucar.com.sapo.pt text/html 200 OTOTWQ2H7ZVFV3C3AXTXXBMZMUNJLSOC - 95195137 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/269812/25 20070105144535 multimedia.iol.pt image/gif 200 ZTQYUUXBORLQONOSC3US4RGZFSWYB5F4 - 95197547 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/271472/31 20070105144543 multimedia.iol.pt image/gif 200 345SDMXKREJABG4SUEZJYM6O2CBRKV7N - 95198730 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/271622/55 20070105144551 multimedia.iol.pt image/gif 200 V6YSV2JSAJ3QQ3MW5HV5EJS47RVJ6UEG - 95201317 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/280570/31 20070105144600 multimedia.iol.pt image/gif 200 7EE7VT5HYLIEMSJAMHJKVAYNQB5TQZ6A - 95204854 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/280587/31 20070105144609 multimedia.iol.pt image/gif 200 B6H45RNIOSHEHLVIB6UYFC44PGWV3EBB - 95206115 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/280787/31 20070105144617 multimedia.iol.pt image/gif 200 VT6FY2XKZ4AAD75FDG47HLLVOZBLV32I - 95207414 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/281803/31 20070105144625 multimedia.iol.pt image/gif 200 7RWNK6TU6ZECSIWMR64IXCHV2OXN4FDS - 95208947 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/47415/146 20070105144633 multimedia.iol.pt image/gif 200 UZ4NNPL7RV6B7VJ3USPFN7MOGQAFNYTV - 95210414 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/265683/31 20070105144527 multimedia.iol.pt image/gif 200 RKCDJCULPJFQ2N77YLF3FF2KGH56PYZW - 95212565 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.iol.pt/oratvi/multimedia/imagem/id/53052/460 20070105144644 multimedia.iol.pt image/gif 200 RFU5P46TOZXDC6W6W5434XWIRGZ56P5G - 95213847 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz
multimedia.rtp.pt/envia_file.php?file=/at3/106787-0610021330.mp3&amp;name=h%EF%BF%BD%20vida%20em%20markl 20070106130144 multimedia.rtp.pt audio/mp3 200 A6WUXU2EPQOG4W4DBUWBCKFVEQZDQDVV - 95222331 FCCN-PT-HISTORICAL-ia400104.20090107105332.arc.gz

No commit comments for this range

You can’t perform that action at this time.