In [6]:
from utils import crawl_wayback, create_csv, get_urls

In [8]:
articles=crawl_wayback("https://www.washingtontimes.com/news/politics/?page=1", 30, get_urls, '20221228', ['article'])
create_csv(articles, 'testing', "test.csv",)

Fetching https://web.archive.org/web/20221228201228/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20221229205628/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20221230212206/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20221231213758/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20230101214721/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20230102215951/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20230103230629/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20230104214208/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/20230105215956/https://www.washingtontimes.com/news/politics/?page=1
Fetching https://web.archive.org/web/202301062

Fetching https://www.foxnews.com/politics
['https://web.archive.org/politics/watch-mccarthys-closed-door-video-bombshell-hearing-hunter-biden-irs-investigation', 'https://web.archive.org/politics/gop-senators-demand-doj-fbi-promise-not-retaliate-against-biden-whistleblowers', 'https://web.archive.org/politics/joe-manchins-campaign-fired-sole-employee-who-called-hang-trump-filings-suggest-otherwise', 'https://web.archive.org/politics/us-send-ukraine-another-1-3-billion-reuters', 'https://www.foxnews.com/video/6331414819112', 'https://web.archive.org/politics/top-gop-senator-slams-climate-czar-john-kerrys-trip-beijing-fossil-fuels-not-enemy-china-is', 'https://www.foxnews.com/video/6331414490112', 'https://web.archive.org/politics/desantis-becomes-first-enter-south-carolina-primary-defends-trump-fake-criminal-charges', 'https://www.foxnews.com/video/6331413767112', 'https://web.archive.org/media/nc-community-residents-pledge-uphold-constitution-fly-american-flag-year-round', 'https://web

ValueError: "https://web.archive.org/politics/watch-mccarthys-closed-door-video-bombshell-hearing-hunter-biden-irs-investigation" is not a memento URL

In [63]:
# Util Functions
import time
import requests
from urllib.parse import urlparse
import lxml.html
from wayback import WaybackClient, memento_url_data, WaybackSession
import itertools
import datetime
from utils import make_request, parse_html, make_link_absolute, page_grab 

DEFAULT_DELAY = 0.5
url=""
selectors=[]

class Crawler():
    """
    Need to define at least two properties:
    * start_url: the URL to start crawling from
    * selectors: a list of css selectors
    """

    def __init__(self):
        self.session = requests.Session()
        self.delay = DEFAULT_DELAY

    def make_request(self, url):
        """
        Make a request to `url` and return the raw response.

        This function ensure that the domain matches what is expected and that the rate limit
        is obeyed.
        """
        # check if URL starts with an allowed domain name
        time.sleep(self.delay)
        print(f"Fetching {url}")
        resp = self.session.get(url)
        return lxml.html.fromstring(resp.text)

    def crawl(self) -> list[str]:
        """
        Crawl the site and return a list of URLs to be scraped.
        """
        return self.get_urls(self.start_url, self.selectors)

    def get_urls(self, url, selectors):
        """
        This function takes a URLs and returns lists of URLs
        for containing each article on that page.

        Parameters:
            * url:  a URL to a page of articles
            * selectors: a list of css selectors

        Returns:
            A list of article URLs on that page.
        """
        response = self.make_request(url)
        urls = []
        for selector in selectors:
            container = response.cssselect(selector)
            for j in container:
                atr = j.cssselect("a")
                if atr and len(atr) > 0:
                    href = atr[0].get("href")
                    if len(href) > 0:
                        urls.append(
                            make_link_absolute(href, "https://web.archive.org/")
                        )
        return urls


class WaybackCrawler(Crawler):
    def __init__(self):
        super().__init__()
        self.session = WaybackSession()
        self.client = WaybackClient(self.session)

    # def crawl(self, startdate, break_point):
    #     results = self.client.search(self.url, match_type="exact", from_date=startdate)
    #     crosstime_urls = list(itertools.islice(results, break_point))
    #     post_date_articles = set()
    #     for i in range(len(crosstime_urls)):
    #         date = datetime.datetime.strptime(startdate, "%Y%m%d")
    #         if crosstime_urls[i].timestamp.date() >= date.date():
    #             articles = self.get_archive_urls(crosstime_urls[i].view_url, Crawler.selectors)
    #             # converts archive links back to current article links
    #             articles = [memento_url_data(item)[0] for item in articles]
    #             post_date_articles.update(articles)
    #     return post_date_articles

    def crawl(self,startdate,enddate,delta_hrs):
        #Create datetime - objects to crawl using wayback
        year, month, day = startdate
        current_date = datetime.datetime(year,month,day)
        year, month, day = enddate
        end_date = datetime.datetime(year,month,day)
        post_date_articles = set()

        last_url_visited = None

        #Crawl internet archive once every delta_hrs from startdate until enddate
        while current_date != end_date:
            results = self.client.search(self.url, match_type="exact", from_date=current_date)
            record = next(results)
            waybackurl = record.view_url
            #To avoid fetching urls multiple times, check if there are no updates in
            #the delta_hrs period
            if last_url_visited != url:
                articles = self.get_archive_urls(waybackurl,self.selector)
                articles = [memento_url_data(item)[0] for item in articles]
                post_date_articles.update(articles)

            last_url_visited = waybackurl
            current_date += datetime.timedelta(hours = delta_hrs)
        return post_date_articles
    
    def get_archive_urls(self, url, selectors):
        """
        might be overriden in child class
        """
        return self.get_urls(url, selectors)


class s(Crawler):
    def crawl(self):
        """
        Implement crawl here to override behavior
        """


class WashingtonPost(WaybackCrawler):
    def get_archive_urls(self, url, selectors):
        """
        Implement get_archive_urls here to override behavior
        """

class Fox(WaybackCrawler):
    def __init__(self):
        super().__init__()
        self.url="https://www.foxnews.com/politics"
        self.selector=['article']

    
a=Fox()
a.crawl([2022,1,1],[2022,1,10],6)

Fetching https://web.archive.org/web/20220101040818/http://foxnews.com/politics
Fetching https://web.archive.org/web/20220102024844/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220102024844/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220102024844/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220102024844/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220103013231/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220103013231/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220103013231/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220103013231/https://www.foxnews.com/politics
Fetching https://web.archive.org/web/20220104001025/http://foxnews.com/politics
Fetching https://web.archive.org/web/20220104001025/http://foxnews.com/politics
Fetching https://web.archive.org/web/20220104001025/http://foxnews.com/politics


{'http://foxnews.com/health/fauci-milder-covid-not-signal-less-restrictions',
 'http://foxnews.com/health/states-scramble-covid-tests-biden-2022-rapid-test',
 'http://foxnews.com/lifestyle/christmas-code-of-vets-gives-back-veterans-in-need',
 'http://foxnews.com/lifestyle/surfing-santas-florida-space-coast-12th-annual',
 'http://foxnews.com/media/biden-bow-out-democrats-alternative-kurtz-media-buzz',
 'http://foxnews.com/media/cnn-jeffrey-toobin-merrick-garland-jan-6',
 'http://foxnews.com/media/elizabeth-holmes-banning-conservatives-silicon-valley-kurtz-media-buzz',
 'http://foxnews.com/media/jeff-van-drew-former-democrat-urges-manchin-democratic-affiliation',
 'http://foxnews.com/media/kayleigh-mcenany-aoc-response-critics-florida-bar-maskless',
 'http://foxnews.com/media/newt-gingrich-predicts-replay-of-2021-with-disastrous-results-for-democrats',
 'http://foxnews.com/media/one-year-later-why-january-6th-still-erodes-our-democracy',
 'http://foxnews.com/media/pandemic-fatigue-virus-