In [8]:
import pandas as pd
import re
import cdx_toolkit
from tqdm import tqdm
from pyprojroot import here

# Extract archive

Extract archived captures of the [Bundesverband Carsharing](https://carsharing.de/) locations page (`carsharing.de/cs-standorte`) from a web archive.

In [73]:
# Page whose archived captures are looked up in the web archives
url = 'carsharing.de/cs-standorte'

In [74]:
# Archive to query ('ia': Internet Archive, 'cc': Common Crawl)
source = 'ia'

# Maximum number of captures that is supposed to be retrieved per year and url
limit = 1000

# CDX client used for fetching data from the chosen source
client = cdx_toolkit.CDXFetcher(source=source)

In [75]:
# Metadata for the 'warcinfo' record. Such a record describes all records that
# follow it (up to the end of the file/input or the next 'warcinfo' record) and
# typically appears once, at the very beginning of a WARC file. For a web
# archive it often carries information about the crawl that produced the records.
warcinfo = dict(
    software='pypi_cdx_toolkit iter-and-warc',
    isPartOf='CARSHARING_BCS',
    description='warc extraction',
    format='WARC file version 1.0',
)

In [66]:
# Years whose web-archive captures of the url are searched: 2006 through 2021
# inclusive, i.e. the years for which information on newly registered cars exists.
years = range(2006, 2021 + 1)

In [13]:
# Set the working directory in which the captures are saved.
import os
# Forward slashes keep the project-relative path valid on every operating
# system; backslash separators are only understood on Windows.
os.chdir(here('01_Data/03_Carsharing/02_BCS_Archived'))

'Q:\\Meine Bibliotheken\\Research\\01_Promotion\\05_Ideas\\08_Carsharing\\carsharing\\01_Data\\03_Carsharing\\02_BCS_Archived'

In [76]:
%%time
# For every year, look up the successful (status 200) captures of `url` in the
# archive, fetch each capture as a WARC record and write it to a per-year file.
for year in years:

    # Create object for writing archive captures into .warc files
    writer = cdx_toolkit.warc.CDXToolkitWARCWriter(
        prefix='BCS_' + source,  # first part of .warc file where warc records will be stored
        subprefix=str(year),     # second part of .warc file where warc records will be stored
        info=warcinfo,
        size=1000000000,         # once the .warc file exceeds 1 GB of size a new .warc file will be created for succeeding records
        gzip=True)

    capture = client.iter(url, from_ts=str(year), to=str(year), limit=limit, collapse='urlkey', verbose='v', filter=['status:200'])
    for obj in tqdm(capture):
        # Keep the capture's URL in its own name: rebinding `url` here would
        # silently change the query passed to client.iter() for later years.
        capture_url = obj['url']
        timestamp = obj['timestamp']

        try:
            record = obj.fetch_warc_record()
            writer.write_record(record)

        # Single captures can run into errors; skip them and carry on.
        except RuntimeError:
            print('Skipping capture for RuntimeError 404: %s %s' % (capture_url, timestamp))
            continue

        # Encoding error that typically arises if no content is found on the webpage
        except UnicodeEncodeError:
            print('Skipping capture for UnicodeEncodeError: %s %s' % (capture_url, timestamp))
            continue

    # Progress marker: this year has been processed
    print(year)

0it [00:00, ?it/s]


2006


0it [00:00, ?it/s]


2007


0it [00:00, ?it/s]


2008


0it [00:00, ?it/s]


2009


0it [00:00, ?it/s]


2010


0it [00:00, ?it/s]


2011


0it [00:00, ?it/s]


2012


1it [00:07,  7.57s/it]


2013


1it [00:04,  4.37s/it]


2014


1it [00:04,  4.71s/it]


2015


1it [00:05,  5.07s/it]


2016


1it [00:04,  4.98s/it]


2017


0it [00:00, ?it/s]


2018


0it [00:00, ?it/s]


2019


0it [00:00, ?it/s]


2020


0it [00:00, ?it/s]

2021
Wall time: 44.6 s





# Analyze archive 

Analyze whether carsharing locations can be extracted from the archived websites.

In [31]:
from warcio.archiveiterator import ArchiveIterator # iterate over .warc files
from bs4 import BeautifulSoup                      # html parsing

In [77]:
# File identifiers ('<year>-000000') of the WARC files to analyze. Only the
# years 2013-2017 are listed; use range(2006, 2022) to cover every queried year.
files = [str(year) + '-000000' for year in range(2013, 2018)]

In [78]:
# Archive identifier that forms part of the WARC file names read below
source = "ia"

In [79]:
# Collect selected WARC header fields of every stored capture into `df_header`
# (one row per non-'warcinfo' record, one column per header field).
header_fields = ['WARC-Date', 'Content-Type', 'Content-Length', 'WARC-Source-URI']
header_rows = []

for file in files:
    warc_path = here(r'./01_Data/03_Carsharing/02_BCS_Archived' + '/BCS_' + source + '-' + str(file) + '.extracted.warc.gz')
    # The `with` block closes the stream on exit; no explicit close() is needed.
    with open(warc_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # 'warcinfo' records only describe the file itself, not a capture
            if record.rec_headers['WARC-Type'] == 'warcinfo':
                continue
            # Keep the wanted header fields, ordered by name in descending order
            selected = sorted(
                ((name, value) for name, value in record.rec_headers.headers if name in header_fields),
                key=lambda pair: pair[0],
                reverse=True)
            header_rows.append(dict(selected))

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame row by row is quadratic. The result already has a fresh 0..n-1 index.
df_header = pd.DataFrame(header_rows)

In [80]:
df_header.shape

(5, 4)

In [81]:
df_header['WARC-Source-URI'].values

array(['https://web.archive.org/web/20130827214227id_/http%3A//www.carsharing.de%3A80/cs-standorte',
       'https://web.archive.org/web/20140108061440id_/http%3A//www.carsharing.de%3A80/cs-standorte',
       'https://web.archive.org/web/20150113164251id_/http%3A//carsharing.de/cs-standorte',
       'https://web.archive.org/web/20160116011315id_/http%3A//www.carsharing.de/cs-standorte',
       'https://web.archive.org/web/20170101213506id_/http%3A//www.carsharing.de%3A80/cs-standorte'],
      dtype=object)