# warc_reader.ipynb

Takes a WARC file/collection of WARC files and a list of URLs supplied via .txt file.

The notebook reads the WARC files and determines which URLs are present/missing as specified by the URL list.
The notebook also reads the HTML content of the URLs specified to find problematic elements in the WARC files.

It is currently configured to find the following elements for a recurring web crawl -
 - &lt;title>Error response&lt;/title>
 - &lt;section class="secure-warning"> and &lt;div id="restricted">
 - &lt;div class="more-link">
 - &lt;div class="tab-placeholder">
 - &lt;div class="c-filter--dynamic">

## Imports

In [None]:
import os
import sys
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
from tqdm import tqdm
from warcio.archiveiterator import ArchiveIterator
!{sys.executable} -m pip install warcio # Install warcio into conda environment

## Function definitions

In [None]:
def get_warc_paths(warc_path):
    """Collect WARC file paths from a single file or a directory.

    Args:
        warc_path: Path to one WARC file, or to a directory holding WARC files.

    Returns:
        A list of path strings whose file name matches ``*warc*``.
        For a file path the list holds that path (or is empty when the name
        does not match). For a directory the listing is non-recursive, only
        regular files are kept (a sub-directory named e.g. ``warc_old`` would
        otherwise be returned and fail when opened), and the result is sorted
        so runs are reproducible. Any other path yields an empty list.
    """
    if os.path.isfile(warc_path):
        return [warc_path] if Path(warc_path).match('*warc*') else []
    if not os.path.isdir(warc_path):
        return []

    warc_paths = []
    # os.listdir returns entries in arbitrary order; sort for determinism
    for filename in sorted(os.listdir(warc_path)):
        candidate = os.path.join(warc_path, filename)
        if os.path.isfile(candidate) and Path(candidate).match('*warc*'):
            warc_paths.append(candidate)
    return warc_paths


def element_test(soup, tag, attr_type, attr_val):
    """Report whether the parsed HTML contains a matching element.

    Args:
        soup: Parsed document (a BeautifulSoup object) to search.
        tag: Tag name to look for, e.g. ``"div"``.
        attr_type: Attribute name to filter on, e.g. ``"class"`` or ``"id"``.
        attr_val: Required value of that attribute.

    Returns:
        True if at least one ``tag`` element with ``attr_type=attr_val``
        exists in ``soup``, otherwise False.
    """
    # find_all is the current name of the legacy findAll alias; limit=1 stops
    # the search at the first hit because only existence matters here.
    out = soup.find_all(tag, attrs={attr_type: attr_val}, limit=1)
    return bool(out)


def read_file(url_list):
    """Load the URL .txt file and return its non-empty lines as a list.

    Args:
        url_list: Path to a text file with one URL per line.

    Returns:
        A list of the lines with surrounding whitespace removed. Blank lines
        are dropped: an empty string is a substring of every URI, so a single
        blank line would otherwise match every record when URLs are compared
        with ``in``, and would inflate the URL count.
    """
    with open(url_list, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

## Get WARC filepaths

In [None]:
# Ask for the WARC location. Pasted paths often carry stray whitespace or
# surrounding quotes, which would make the path look non-existent and silently
# yield zero WARC files, so both are trimmed before the lookup.
WARC_PATHS = get_warc_paths(input('Path to WARC file/directory: ').strip().strip('"\''))
print(f'Number of WARC files: {len(WARC_PATHS)}')

## Get URL list from .txt file

In [None]:
# Ask for the URL list. Stray whitespace or surrounding quotes from a pasted
# path are trimmed so the file can actually be opened.
URLS = read_file(input('Path to URL .txt list: ').strip().strip('"\''))
print(f'Number of URLs in list: {len(URLS)}')

## Set parameters

In [None]:
# Date window (YYYYMMDD) for the records to read
READ_RECORD_FROM_DATE = '20200101'
READ_RECORD_TO_DATE = '20221120'

# Turn both bounds into datetime objects. Each parses to midnight of its day,
# and the read loops compare with strict < / >, so records dated on
# READ_RECORD_TO_DATE itself fall outside the window.
read_from, read_to = (
    datetime.strptime(bound, "%Y%m%d")
    for bound in (READ_RECORD_FROM_DATE, READ_RECORD_TO_DATE)
)

## Read WARC files

### Main loop

In [None]:
# Active members
from collections import defaultdict

# Maps each WARC file path to the set of 'membership/active-members' URIs seen in it
dates_dict = defaultdict(set)

WARC_Target_URI_list = []  # List contains ALL URIs found in WARC files

# HTML elements testing - each list collects the WARC-Target-URI of matching pages
error_response = []
restricted_page = []
more_link = []
tab_placeholder = []
c_filter_dynamic = []

# WARC-Date is parsed with and without fractional seconds
warc_date_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ")

for warc in tqdm(WARC_PATHS):
    with open(warc, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # Create date_object for each record
            warc_date = record.rec_headers.get_header('WARC-Date')
            date_object = None
            for date_format in warc_date_formats:
                try:
                    date_object = datetime.strptime(warc_date, date_format)
                    break
                except (TypeError, ValueError):
                    # TypeError: header missing (None); ValueError: format mismatch
                    continue
            if date_object is None:
                # Skip the record rather than silently reusing the previous record's date
                print(f'Skipping record with unparseable WARC-Date {warc_date!r} in {warc}')
                continue

            # Compare date_object with READ_FROM_DATE and READ_TO_DATE parameters
            if not read_from < date_object < read_to:
                continue

            # Look the target URI up once; str() keeps records without the header comparable
            target_uri = str(record.rec_headers.get_header('WARC-Target-URI'))

            # Append URI to WARC_Target_URI_list - this creates a complete list of URIs in the WARC files
            WARC_Target_URI_list.append(target_uri)

            # Active members
            if 'membership/active-members' in target_uri:
                print(f"{target_uri} found in {warc}")
                dates_dict[warc].add(target_uri)

            # Only HTTP responses carry HTML; http_headers is None for non-HTTP records
            if record.rec_type != 'response' or record.http_headers is None:
                continue

            try:
                if 'text/html' not in str(record.http_headers.get_header('Content-Type')):
                    continue

                # Read HTML only when the record matches a URL from the list. The
                # content stream can be consumed only once, so the match is decided
                # up front and the body is parsed a single time. Empty URL entries
                # are ignored because '' is a substring of every URI.
                if not any(url and url in target_uri for url in URLS):
                    continue

                # Decode bytes to a string and strip whitespace; undecodable bytes
                # are replaced so one bad byte does not discard the whole page
                html = record.content_stream().read().decode('utf-8', errors='replace').strip()
                # Create soup object
                soup = BeautifulSoup(html, "html.parser")

                # Build list of pages which have 'Error response' in title
                title = soup.find("title")
                if title is not None and title.string == 'Error response':
                    error_response.append(target_uri)

                # Restricted content
                if (element_test(soup, tag="section", attr_type="class", attr_val="secure-warning")
                        or element_test(soup, tag="div", attr_type="id", attr_val="restricted")):
                    restricted_page.append(target_uri)

                # More-link, Tab-placeholder and C-filter--dynamic are all <div class="..."> tests
                for css_class, hits in (
                    ("more-link", more_link),
                    ("tab-placeholder", tab_placeholder),
                    ("c-filter--dynamic", c_filter_dynamic),
                ):
                    if element_test(soup, tag="div", attr_type="class", attr_val=css_class):
                        hits.append(target_uri)

            except Exception as e:
                # Best effort: report the failing record and carry on with the next one
                print(f'Error while reading {target_uri} in {warc}: {e}')

In [None]:
# Print every WARC file followed by the active-members URIs found in it.
# (For a JSON dump the sets would first need converting to lists, since sets
# are not JSON-serialisable.)
for warc_file in dates_dict:
    print(warc_file)
    for uri in dates_dict[warc_file]:
        print('\t' + uri)

### Found pages

In [None]:
# URLs from the list that occur (exact match) among the URIs read from the WARC files.
# A set intersection avoids scanning the URL list once per archived URI, and
# sorting makes the result order reproducible.
found_in_URLS = sorted(set(URLS) & set(WARC_Target_URI_list))
print(f'Found (from URL list provided): {len(found_in_URLS)}/{len(URLS)}')

### Missing pages

In [None]:
# URLs from the list that were not found among the WARC URIs, in sorted order
missing_in_URLS = sorted(set(URLS).difference(found_in_URLS))
print(f'Missing (from URL list provided): {len(missing_in_URLS)}/{len(URLS)}')
for missing_url in missing_in_URLS:
    print(missing_url)

### Error response pages

In [None]:
# De-duplicate and sort the 'Error response' hits, then keep only the pages
# that are exact entries of the URL list being tested
error_response_in_URLS = [
    uri
    for uri in sorted(set(error_response))
    if uri in URLS
]
print(f'Found \'Error response\' pages: {len(error_response_in_URLS)}')
for error_url in error_response_in_URLS:
    print(error_url)

### Restricted pages

In [None]:
# De-duplicate and sort the restricted-page hits, then keep only the pages
# that are exact entries of the URL list being tested
unique_restricted = sorted(set(restricted_page))
restricted_in_URLS = [uri for uri in unique_restricted if uri in URLS]
print(f'Found restricted pages (pages with <section class="secure-warning"> or <div id="restricted">): {len(restricted_in_URLS)}')

#### Read WARC files again - checking for logged-in versions of found restricted pages

In [None]:
# Loops through WARC files again, testing against the restricted_in_URLS list; looking for a version of the page that does
# NOT contain <section class="secure-warning"> or <div id="restricted">.
restricted_page_logged_in = []

# WARC-Date is parsed with and without fractional seconds
warc_date_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ")

for warc in tqdm(WARC_PATHS):
    with open(warc, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # Create date_object for each record
            warc_date = record.rec_headers.get_header('WARC-Date')
            date_object = None
            for date_format in warc_date_formats:
                try:
                    date_object = datetime.strptime(warc_date, date_format)
                    break
                except (TypeError, ValueError):
                    # TypeError: header missing (None); ValueError: format mismatch
                    continue
            if date_object is None:
                # Skip the record rather than silently reusing the previous record's date
                print(f'Skipping record with unparseable WARC-Date {warc_date!r} in {warc}')
                continue

            # Compare date_object with READ_FROM_DATE and READ_TO_DATE parameters
            if not read_from < date_object < read_to:
                continue

            # Only HTTP responses carry HTML; http_headers is None for non-HTTP records
            if record.rec_type != 'response' or record.http_headers is None:
                continue

            target_uri = str(record.rec_headers.get_header('WARC-Target-URI'))
            try:
                if 'text/html' not in str(record.http_headers.get_header('Content-Type')):
                    continue

                # Decide the match before touching the body: the content stream can be
                # read only once, and re-reading it for a second matching page yields an
                # empty document that would wrongly count as a logged-in version.
                if not any(page in target_uri for page in restricted_in_URLS):
                    continue

                # Decode bytes to a string and strip whitespace; undecodable bytes
                # are replaced so one bad byte does not discard the whole page
                html = record.content_stream().read().decode('utf-8', errors='replace').strip()
                # Create soup object
                soup = BeautifulSoup(html, "html.parser")

                # A logged-in version carries neither of the two restricted markers
                is_restricted = (
                    element_test(soup, tag="section", attr_type="class", attr_val="secure-warning")
                    or element_test(soup, tag="div", attr_type="id", attr_val="restricted")
                )
                # NOTE(review): a response with an empty body (e.g. a redirect) also has
                # no markers and is counted here - consider checking the HTTP status.
                if not is_restricted:
                    restricted_page_logged_in.append(target_uri)

            except Exception as e:
                # Best effort: report the failing record and carry on with the next one
                print(f'Error while reading {target_uri} in {warc}: {e}')

In [None]:
# A list of URLs where a restricted page and logged-in page exist in the WARC files
# for i in list(set(restricted_page_logged_in)):
#     print(i)

In [None]:
# Restricted pages for which no version without the restricted banner was found
logged_in_versions = set(restricted_page_logged_in)
missing_logged_in = sorted(set(restricted_in_URLS).difference(logged_in_versions))
print(f'Restricted pages without a logged-in counterpart: {len(missing_logged_in)}')
for restricted_url in missing_logged_in:
    print(restricted_url)

### more_link

In [None]:
# De-duplicate and sort the more-link hits, then keep only the pages that are
# exact entries of the URL list being tested
more_link_in_URLS = [
    uri
    for uri in sorted(set(more_link))
    if uri in URLS
]
print(f'Found more-link pages (pages with <div class="more-link">): {len(more_link_in_URLS)}')
for more_link_url in more_link_in_URLS:
    print(more_link_url)

### tab_placeholder

In [None]:
# De-duplicate and sort the tab-placeholder hits, then keep only the pages that
# are exact entries of the URL list being tested
tab_placeholder_in_URLS = [
    uri
    for uri in sorted(set(tab_placeholder))
    if uri in URLS
]
# The message names the class the pages were actually tested for ("tab-placeholder")
print(f'Found tab_placeholder pages (pages with <div class="tab-placeholder">): {len(tab_placeholder_in_URLS)}')
for tab_placeholder_url in tab_placeholder_in_URLS:
    print(tab_placeholder_url)

### c_filter_dynamic

In [None]:
# De-duplicate and sort the c-filter--dynamic hits, then keep only the pages
# that are exact entries of the URL list being tested
unique_c_filter_dynamic = sorted(set(c_filter_dynamic))
c_filter_dynamic_in_URLS = [uri for uri in unique_c_filter_dynamic if uri in URLS]
print(f'Found c_filter_dynamic pages (pages with <div class="c-filter--dynamic">): {len(c_filter_dynamic_in_URLS)}')
for c_filter_url in c_filter_dynamic_in_URLS:
    print(c_filter_url)