Test the Wayback (Internet Archive) API and the URL-gathering utility functions

In [46]:
import time
import requests
from urllib.parse import urlparse
import sys
import json
import lxml.html
import csv
from wayback import WaybackClient, memento_url_data, WaybackSession
import itertools
import datetime

# Seconds that make_request sleeps before every request (simple rate limit).
REQUEST_DELAY = 0.2

In [66]:
def make_request(url, session=None):
    """
    Fetch `url` and hand back the raw response object.

    Every call first sleeps for REQUEST_DELAY seconds, so consecutive
    requests are rate limited, and prints the URL that is being fetched.

    Parameters:
        * url:      the URL to request
        * session:  optional session object; when given (and truthy) its
                    `get` method is used instead of `requests.get`

    Returns:
        Whatever the underlying `get` call returns.
    """
    time.sleep(REQUEST_DELAY)
    print(f"Fetching {url}")
    fetch = session.get if session else requests.get
    return fetch(url)


def make_link_absolute(rel_url, current_url):
    """
    Resolve a link found on a page against the URL of that page.

    Given a relative URL like "/abc/def" or "?page=2"
    and a complete URL like "https://example.com/1/2/3" this function will
    combine the two yielding a URL like "https://example.com/abc/def".

    Resolution is delegated to `urllib.parse.urljoin`, so besides
    root-relative ("/abc") and query-only ("?page=2") links it also handles
    protocol-relative ("//host/x"), path-relative ("abc/def", "../x") and
    fragment-only ("#top") links.  Links that are already absolute, or that
    use a non-hierarchical scheme such as "mailto:", are returned unchanged.

    Parameters:
        * rel_url:      a URL or fragment
        * current_url:  a complete URL used to make the request that contained a link to rel_url

    Returns:
        A full URL with protocol & domain that refers to rel_url.
        An empty `rel_url` refers to the page itself and yields `current_url`.
    """
    # Imported locally because the file's import block only provides urlparse.
    from urllib.parse import urljoin

    return urljoin(current_url, rel_url)


def parse_html(html):
    """
    Build an lxml tree from HTML markup.

    Parameters:
        * html:  the HTML markup to parse

    Returns:
        The root node produced by `lxml.html.fromstring`.
    """
    root = lxml.html.fromstring(html)
    return root


def page_grab(url, session=None):
    """
    Download `url` and parse the body of the response as HTML.

    Parameters:
        * url:      the URL of the page to fetch
        * session:  optional session object passed through to make_request

    Returns:
        The root node returned by parse_html for the response text.
    """
    return parse_html(make_request(url, session).text)


def create_csv(set1, title1, filename, set2=None, title2=""):
    """
    Write one or two collections of scraped items to a two-column CSV file.

    The first row holds the column headers.  Each following row pairs the
    i-th item of `set1` with the i-th item of `set2`; the shorter collection
    is padded with empty strings.  The file is written as UTF-8 and is
    overwritten if it already exists.

    args:
    set1- items scraped (ex. article urls); any iterable
    title1- column header for first type
    filename- path of the CSV file to write
    set2- optional second type of items scraped (ex. videos); any iterable
    title2- optional header for second type
    """
    # None (rather than a shared mutable default) means "no second column".
    second = () if set2 is None else set2
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([title1, title2])
        # zip_longest walks both collections exactly once and pads the
        # shorter one, instead of rebuilding list(set1)/list(set2) per row.
        writer.writerows(itertools.zip_longest(set1, second, fillvalue=""))


def get_urls(url, selectors, session=None):
    """
    Collect one link from every element matched by the given CSS selectors.

    The page at `url` is fetched and parsed with page_grab.  For each
    selector, every matching element contributes the `href` of the first
    `<a>` that `cssselect("a")` finds on it, provided that href is present
    and non-empty.  Links are made absolute against
    "https://web.archive.org/".

    Parameters:
        * url:  a URL to a page of articles
        * selectors: a list of css selectors
        * session: optional session object parameter

    Returns:
        A list of article URLs on that page (duplicates are kept).
    """
    root = page_grab(url, session)
    urls = []
    for selector in selectors:
        for container in root.cssselect(selector):
            anchors = container.cssselect("a")
            if not anchors:
                continue
            # .get() returns None when the <a> has no href attribute, so test
            # truthiness rather than calling len() on a possible None.
            href = anchors[0].get("href")
            if href:
                urls.append(make_link_absolute(href, "https://web.archive.org/"))
    return urls


def crawl_wayback(homepage, break_point, scraper_func, startdate, selectors=False):
    """
    Take a politics homepage, or any source with a list of articles, find
    copies of it in the archive, and scrape the article links on each copy.

    args:
        homepage- the homepage or politics page we are looking for across time
        break_point- the maximum number of archive records to examine
            (approx. number of copies in the archive)
        scraper_func- the individual function built for scraping that page;
            called as scraper_func(view_url, selectors, session) when
            `selectors` is truthy, otherwise as scraper_func(view_url, session)
        startdate- the date you would like to begin scraping ('YYYYMMDD')
        selectors- optional css selector parameter (to be used with scraper_func)
    returns:
        set of article links found on records dated on or after startdate
    raises:
        ValueError if startdate is not in 'YYYYMMDD' format
    """
    # Parse the cutoff once, up front: it is identical for every record and
    # a malformed date should fail before any request is made.
    start = datetime.datetime.strptime(startdate, "%Y%m%d").date()

    session = WaybackSession()
    client = WaybackClient(session)
    results = client.search(homepage, match_type="exact", from_date=startdate)
    crosstime_urls = list(itertools.islice(results, break_point))

    post_date_articles = set()
    for record in crosstime_urls:
        if record.timestamp.date() < start:
            continue
        if selectors:
            articles = scraper_func(record.view_url, selectors, session)
        else:
            articles = scraper_func(record.view_url, session)
        # converts archive links back to current article links
        articles = [memento_url_data(item)[0] for item in articles]
        post_date_articles.update(articles)
    return post_date_articles

def crawl_wayback_2(homepage, startdate, enddate, scraper_func, selectors=False, delta_hrs=6):
    """
    Crawl archived copies of `homepage` at a fixed time step and collect
    the article links found on them.

    Starting at `startdate` and advancing `delta_hrs` hours at a time until
    `enddate` is reached, the archive is searched with `from_date` set to the
    current point in time and the first record returned is scraped.  A record
    whose view URL equals the one from the previous step is not scraped again.

    args:
        homepage- the page to look up in the archive
        startdate- [year, month, day] at which to start (inclusive)
        enddate- [year, month, day] at which to stop (exclusive)
        scraper_func- called as scraper_func(view_url, selectors, session);
            note that `selectors` is always forwarded, even when falsy
        selectors- css selectors handed to scraper_func
        delta_hrs- step between two archive look-ups, in hours (must be > 0)
    returns:
        set of article links, converted from archive links back to the
        original article links
    raises:
        ValueError if delta_hrs is not positive (the crawl could never advance)
    """
    if delta_hrs <= 0:
        raise ValueError("delta_hrs must be a positive number of hours")
    step = datetime.timedelta(hours=delta_hrs)

    # Create datetime objects to crawl using wayback
    year, month, day = startdate
    current_date = datetime.datetime(year, month, day)
    year, month, day = enddate
    end_date = datetime.datetime(year, month, day)

    post_date_articles = set()
    if current_date >= end_date:
        # Nothing to crawl; avoid opening a session at all.
        return post_date_articles

    session = WaybackSession()
    client = WaybackClient(session)
    last_url_visited = None

    # Crawl internet archive once every delta_hrs from startdate until enddate.
    # "<" (not "!=") so the loop also ends when a step jumps past end_date.
    while current_date < end_date:
        results = client.search(homepage, match_type="exact", from_date=current_date)
        record = next(results, None)
        if record is None:
            # The search returned nothing; presumably later from_date values
            # cannot return anything either, so stop instead of letting
            # StopIteration escape.
            break
        url = record.view_url
        # To avoid fetching urls multiple times, check if there are no updates
        # in the delta_hrs period
        if last_url_visited != url:
            articles = scraper_func(url, selectors, session)
            articles = [memento_url_data(item)[0] for item in articles]
            post_date_articles.update(articles)

        last_url_visited = url
        current_date += step
    return post_date_articles

In [67]:
# Crawl archived Washington Times politics pages from 2022-01-01 to 2022-01-10
# (default 6-hour step), taking links from elements matching the "article" selector.
articles=crawl_wayback_2("https://www.washingtontimes.com/news/politics/?page=1", [2022,1,1], [2022,1,10], get_urls, ['article'])

Fetching https://web.archive.org/web/20220101012450/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20220103060359/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20220104062753/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20220105064436/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20220107000917/https://www.washingtontimes.com/news/politics/?page=1


KeyboardInterrupt: 

In [34]:
# Search the archive for the NYT politics section from 2023-01-01 on and
# display the view URL of the first record returned.
session = WaybackSession()
client = WaybackClient(session)
results = client.search("https://www.nytimes.com/section/politics", match_type="exact", from_date="20230101")
record = next(results)
record.view_url



'https://web.archive.org/web/20230101010133/https://www.nytimes.com/section/politics'

In [None]:
# Test get_urls and the wayback crawler with scrapers written by JP

# NYT: scrape a single archived copy using a page-specific CSS class selector
nyt_test = get_urls("https://web.archive.org/web/20230716222629/https://www.nytimes.com/section/politics",["article.css-1l4spti"])


# Test how far back we can go with "article" as the CSS selector:
# crawl Jan 1 to Jan 3 of each year from 2023 down to 2016 and print the links found
nyt_crawler_test_23 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2023,1,1], [2023,1,3], get_urls, ['article'])
print(nyt_crawler_test_23)
nyt_crawler_test_22 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2022,1,1], [2022,1,3], get_urls, ['article'])
print(nyt_crawler_test_22)
nyt_crawler_test_21 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2021,1,1], [2021,1,3], get_urls, ['article'])
print(nyt_crawler_test_21)
nyt_crawler_test_20 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2020,1,1], [2020,1,3], get_urls, ['article'])
print(nyt_crawler_test_20)
nyt_crawler_test_19 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2019,1,1], [2019,1,3], get_urls, ['article'])
print(nyt_crawler_test_19)
nyt_crawler_test_18 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2018,1,1], [2018,1,3], get_urls, ['article'])
print(nyt_crawler_test_18)
nyt_crawler_test_17 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2017,1,1], [2017,1,3], get_urls, ['article'])
print(nyt_crawler_test_17)
nyt_crawler_test_16 = crawl_wayback_2("https://www.nytimes.com/section/politics", [2016,1,1], [2016,1,3], get_urls, ['article'])
print(nyt_crawler_test_16)


In [48]:
def test_depth_wayback_crawler(homepage, scraper_func, selectors=False, delta=False, min_year=2015, max_year=2023):
    """
    Check how many years back a scraper keeps working on archived copies.

    For each year from `max_year` down to `min_year`, the archive is crawled
    for Jan 1 to Jan 3 of that year with crawl_wayback_2.  The check stops at
    the first year for which no links are found.  The outcome is printed.

    args:
        homepage- the page to look up in the archive
        scraper_func- scraper handed to crawl_wayback_2
        selectors- css selectors handed to crawl_wayback_2
        delta- step between archive look-ups, in hours; when falsy (the
            default) crawl_wayback_2's own default step is used
        min_year- earliest year to test (inclusive)
        max_year- latest year to test (inclusive); tested first
    returns:
        None
    """
    # Only forward an explicit step: a falsy delta would be passed on as a
    # zero-hour step, which crawl_wayback_2 cannot use to advance its crawl.
    step_kwargs = {"delta_hrs": delta} if delta else {}
    for year in range(max_year, min_year - 1, -1):
        print("Testing year", year)
        startdate = [year, 1, 1]
        enddate = [year, 1, 3]
        year_test = crawl_wayback_2(
            homepage, startdate, enddate, scraper_func, selectors, **step_kwargs
        )
        if not year_test:
            print("No results using this CSS selector in year", year)
            return None

    print("Selectors work for period between", min_year, "and", max_year)
    return None

In [49]:
# Check the "article" selector on the NYT politics page for each year from
# 2023 down to 2015, using a 1-hour step between archive look-ups.
test_depth_wayback_crawler("https://www.nytimes.com/section/politics", get_urls, ['article'],1,2015,2023)

Testing year 2023
Fetching https://web.archive.org/web/20230101010133/https://www.nytimes.com/section/politics
Fetching https://web.archive.org/web/20230102001011/http://nytimes.com/section/politics
Testing year 2022
Fetching https://web.archive.org/web/20220101040044/https://www.nytimes.com/section/politics
Fetching https://web.archive.org/web/20220102060757/http://nytimes.com/section/politics
Testing year 2021
Fetching https://web.archive.org/web/20210101010533/https://www.nytimes.com/section/politics
Fetching https://web.archive.org/web/20210102070144/https://www.nytimes.com/section/politics
Testing year 2020
Fetching https://web.archive.org/web/20200101005908/https://www.nytimes.com/section/politics
Fetching https://web.archive.org/web/20200102025944/https://www.nytimes.com/section/politics
Testing year 2019
Fetching https://web.archive.org/web/20190101041136/https://www.nytimes.com/section/politics
Fetching https://web.archive.org/web/20190102013343/https://www.nytimes.com/section