In [10]:
import os

In [12]:
os.environ.PYTHONPATH

AttributeError: '_Environ' object has no attribute 'PYTHONPATH'

In [6]:
from ...dataset_downloader import store_captures_to_dataset

ValueError: attempted relative import beyond top-level package

In [83]:
datasets_path = os.environ.get('DATA_SET', '/var/datasets')

In [94]:
# TODO : may be good idea to use it because we would have similar to url filename
# from slugify import slugify 
import json
import hashlib
import urllib3

import os
import errno

def ensure_dir(directory):
    try:
        os.makedirs(directory)
        print(f'create directory {directory}')
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise e
        print(f'directory {directory} already exist')
        
headers = None
http = urllib3.HTTPConnectionPool('web.archive.org', maxsize=50,
                                   retries=urllib3.Retry(3, redirect=2),
                                   headers=headers)

def get_captures_of_url_and_year(url, year):
    # Collapse captures by timestamp to get 1 capture per hour.
    # Its necessary to reduce the huge number of captures some websites
    # (e.g. twitter.com has 167k captures for 2018. Get only 2xx captures.
    cdx_url = f'/cdx/search/cdx?url={url}&from={year}&to={year}&' \
               'fl=timestamp,digest&collapse=timestamp:10&statuscode=200'
    response = http.request('GET', cdx_url)
    assert response.status == 200
    assert response.data
    captures_txt = response.data.decode('utf-8')
    captures = [l.split(' ') for l in captures_txt.strip().split("\n")]
    return captures

def download_capture(url, timestamp):
    resp = http.request('GET', f'/web/{timestamp}id_/{url}')
    return resp.data.decode('utf-8', 'ignore')    

stored = {}


# TODO: can make it in parallel
def for_each_capture(url, year):
    captures = get_captures_of_url_and_year(url, year)
    # TODO: can make it in parallel
    for c in captures:
        [timestamp, digest] = c
        duplicated = False
        if digest not in stored:
            response_data = download_capture(url, timestamp)
            # TODO: store to file
            stored[digest] = response_data
        else:
            duplicated = True
            response_data = stored[digest]
            
        yield (timestamp, digest, response_data, duplicated)

def encode_url_to_path(url):
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()

def store_captures_to_dataset(url, year):
    url_to_path = encode_url_to_path(url)
    dataset_path = f'{datasets_path}/ia/wbm'
    dataset_path_captures = f'{dataset_path}/captures'
    dataset_path_urls = f'{dataset_path}/urls'

    ensure_dir(dataset_path_captures)
    captures = []
    for timestamp, digest, data, duplicated in for_each_capture(url, year):
        # TODO: store to file
        print('get data of ', timestamp, 'digest', digest, 'size', len(data), 'duplicate', duplicated)

        # store captures of year
        file_name = f'{dataset_path_captures}/{digest}'
        if not duplicated:
            if not os.path.isfile(file_name):
                with open(file_name, 'w+') as capture_data_file:
                    capture_data_file.write(data)
                print(f'created file {file_name}')
            else:
                print(f'already have {file_name}')
        captures.append([timestamp, digest])

    file_name = f'{dataset_path_urls}/{url_to_path}/{year}'
    if not os.path.isfile(file_name):
        ensure_dir(f'{dataset_path_urls}/{url_to_path}')
        with open(file_name, 'w+') as url_captures_file:
            url_captures_file.write(json.dumps(captures))
        print(f'created file {file_name}')
    else:
        print(f'already have {file_name}')

    print('we got all')

In [95]:
store_captures_to_dataset('https://reactjs.org/blog/all.html', 2019)
store_captures_to_dataset('https://reactjs.org/blog/all.html', 2018)

directory /var/datasets/ia/wbm/captures already exist
get data of  20190103131300 digest L5BSSVD5RZJSH7ROECXBDWITK4UFWUXG size 85249 duplicate False
already have /var/datasets/ia/wbm/captures/L5BSSVD5RZJSH7ROECXBDWITK4UFWUXG
get data of  20190110122215 digest LK6UWHFG4DBLQCNF4MW3RS7ECJIJ6O5Q size 85249 duplicate False
already have /var/datasets/ia/wbm/captures/LK6UWHFG4DBLQCNF4MW3RS7ECJIJ6O5Q
get data of  20190117185403 digest ZA7BK666OTUPAXGSCOU3WT6CXXMCIUV4 size 85249 duplicate False
already have /var/datasets/ia/wbm/captures/ZA7BK666OTUPAXGSCOU3WT6CXXMCIUV4
get data of  20190125232432 digest DEF74AUAWCAIQBSM727UVAYKEDNWH4O5 size 85245 duplicate False
already have /var/datasets/ia/wbm/captures/DEF74AUAWCAIQBSM727UVAYKEDNWH4O5
get data of  20190207131826 digest UYEWLPMCLR6CGWMH5IIYW2BSEUD4FI6D size 85511 duplicate False
already have /var/datasets/ia/wbm/captures/UYEWLPMCLR6CGWMH5IIYW2BSEUD4FI6D
already have /var/datasets/ia/wbm/urls/a5d099af8ea22a17edc3bc46b6535aeb/2019
we got all
dir

In [110]:
%ls -la /var/datasets/ia/wbm/captures

total 4384
drwxr-xr-x 2 root root   4096 Feb 10 10:02 [0m[01;34m.[0m/
drwxr-xr-x 4 root root   4096 Feb 10 09:54 [01;34m..[0m/
-rw-r--r-- 1 root root 114494 Feb 10 10:02 2SQFALQDMZSNJGJBCPLBHUEIPEBES7RH
-rw-r--r-- 1 root root 105202 Feb 10 10:01 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
-rw-r--r-- 1 root root 112981 Feb 10 10:02 3VH4KC64AQSSUU64BSM6KNPYGPZEGL6Y
-rw-r--r-- 1 root root  84304 Feb 10 10:02 4ABQWDHMOEA7UPKY6TDXIYUDHKFEL7BT
-rw-r--r-- 1 root root 112013 Feb 10 10:02 4Y3HB5PIWL4B7NJXQRSD4QXZT7QA56GQ
-rw-r--r-- 1 root root  84607 Feb 10 10:02 5LM3VHY4QGFNJH5QAKBCCNNU2P2IRHYK
-rw-r--r-- 1 root root  83940 Feb 10 10:02 5V24MQETKRTUWQWG3W552J5L4IQFQEQR
-rw-r--r-- 1 root root 113443 Feb 10 10:02 6G6ALNC7PFZARXDRZLSPRQPPRNDHCUXW
-rw-r--r-- 1 root root  83593 Feb 10 10:02 6TY6NMZ3MFLR4VQ6INZRWSQDN5VSK3KW
-rw-r--r-- 1 root root 112981 Feb 10 10:02 6ZMC7XC62LV3A7NP3GLUJ3YW3VTNLZFE
-rw-r--r-- 1 root root  83593 Feb 10 10:02 7WTPGILK7Y6TNAWMVBI6ZE65BP3P2GOG
-rw-r--r-- 1 root 