# The Wayback Machine Scraper
* Abhi Kumbar
* 18 Jan 2020

* https://medium.com/analytics-vidhya/the-wayback-machine-scraper-63238f6abb66

### Updated:

* Jon Chun
* 20 Sep 2022



In [1]:
import sys
import requests as rq
from bs4 import BeautifulSoup as bs
from time import sleep
from time import time
from random import randint
from warnings import warn
import json
import pandas as pd

# Tutorial: Sections

## Get most recent Wayback URL for Topic

In [6]:
# MSNBC Wayback machine archive urls

url = 'http://web.archive.org/cdx/search/cdx?url=nbcnews.com/politics&collapse=digest&from=20190401&to=20190431&output=json'
urls = rq.get(url).text
parse_url = json.loads(urls) #parses the JSON from urls.

parse_url

[['urlkey',
  'timestamp',
  'original',
  'mimetype',
  'statuscode',
  'digest',
  'length'],
 ['com,nbcnews)/politics',
  '20190401012911',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',
  'FVZYAKIUIFOQY5NCP7AI4LJB4JNLYQOF',
  '38471'],
 ['com,nbcnews)/politics',
  '20190401014736',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',
  'HK6O3VELADSKC3TC6TVKXIZ4SFIWUQCK',
  '38557'],
 ['com,nbcnews)/politics',
  '20190401045810',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',
  'FKBB6H2CO2IDV7YDPUL4VEVXXE5OQFHN',
  '38549'],
 ['com,nbcnews)/politics',
  '20190401065204',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',
  'TGBH3GXDR7WFKAUUMPKWVT2E6TIPD7QK',
  '38473'],
 ['com,nbcnews)/politics',
  '20190401092652',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',
  'PUG672JFIZNX6ORIX5TOXXHGKXJTS5AP',
  '38554'],
 ['com,nbcnews)/politics',
  '20190401101307',
  'https://www.nbcnews.com/politics',
  'text/html',
  '200',


In [7]:
## Extracts timestamp and original columns from urls and compiles a url list.

url_list = []
for i in range(1,len(parse_url)):
  orig_url = parse_url[i][2]
  tstamp = parse_url[i][1]
  waylink = tstamp+'/'+orig_url
  url_list.append(waylink)

url_list

['20190401012911/https://www.nbcnews.com/politics',
 '20190401014736/https://www.nbcnews.com/politics',
 '20190401045810/https://www.nbcnews.com/politics',
 '20190401065204/https://www.nbcnews.com/politics',
 '20190401092652/https://www.nbcnews.com/politics',
 '20190401101307/https://www.nbcnews.com/politics',
 '20190401122721/https://www.nbcnews.com/politics',
 '20190401123556/https://www.nbcnews.com/politics',
 '20190401144056/https://www.nbcnews.com/politics',
 '20190401144245/https://www.nbcnews.com/politics',
 '20190401152555/http://www.nbcnews.com/politics',
 '20190401181102/https://www.nbcnews.com/politics',
 '20190401185553/https://www.nbcnews.com/politics',
 '20190401221603/https://www.nbcnews.com/politics',
 '20190402043541/https://www.nbcnews.com/politics',
 '20190402053204/https://www.nbcnews.com/politics',
 '20190402062410/https://www.nbcnews.com/politics',
 '20190402071203/https://www.nbcnews.com/politics',
 '20190402082522/https://www.nbcnews.com/politics',
 '20190402085

In [8]:
## Compiles final url pattern.

for url in url_list:
  final_url = 'https://web.archive.org/web/'+url

final_url

'https://web.archive.org/web/20190430233702/https://www.nbcnews.com/politics'

## Parse Beautiful Soup

In [11]:
# Open page

req = rq.get(final_url).text

# parse html using beautifulsoup and store in soup
soup = bs(req,'html.parser')
soup

<!DOCTYPE html>
<html data-reactroot="" lang="en"><head><script src="//archive.org/includes/analytics.js?v=cf34f82" type="text/javascript"></script>
<script type="text/javascript">window.addEventListener('DOMContentLoaded',function(){var v=archive_analytics.values;v.service='wb';v.server_name='wwwb-app218.us.archive.org';v.server_ms=1076;archive_analytics.send_pageview({});});</script>
<script charset="utf-8" src="/_static/js/bundle-playback.js?v=KTqwAcYd" type="text/javascript"></script>
<script charset="utf-8" src="/_static/js/wombat.js?v=UHAOicsW" type="text/javascript"></script>
<script type="text/javascript">
  __wm.init("https://web.archive.org/web");
  __wm.wombat("https://www.nbcnews.com/politics","20190430233702","https://web.archive.org/","web","/_static/",
	      "1556667422");
</script>
<link href="/_static/css/banner-styles.css?v=fantwOh2" rel="stylesheet" type="text/css"/>
<link href="/_static/css/iconochive.css?v=qtvMKcIJ" rel="stylesheet" type="text/css"/>
<!-- End Wayb

In [12]:
# Get list of article tags that contain news titles
articles= soup.find_all('article')
for article in articles:
  try:
    if article != None:
      #title and link
      if article.find_all('h2') != None:
        #get news title
        title = article.find_all('h2')[1].a.text 
        #get individual news article link
        link = article.find_all('h2')[1].a['href'] 
      else:
        title = 'N/A'
        link = 'N/A'
  except:
    pass

In [13]:
req = rq.get(link).text
soup=bs(req,'html.parser') # Parse each individual news article
article = soup.find('div',attrs={'class':'article container___2EGEI'})
article.div.text # news summary

"FBI chief Wray: Russia works '365 days a year' to undermine American democracy One of the nation's top law enforcement officials said Moscow's meddling in the past has been a 'dress rehearsal' for the 2020 presidential contest."

## Save Article

In [None]:
# These details (e.g. summary) are need to be extracted, see 'Full Working Code' below

"""

import pandas as pd
nbc_df = pd.DataFrame({'title':news_title
                       ,'summary':news_summary
                       ,'source':news_source
                       ,'article_link':news_link})
nbc_df.to_csv('nbc_articles.csv',index=False)

"""

# Full Working Code

* Depends on 'url_list' created in Tutorial sections above


In [16]:
len(url_list)

280

In [17]:
url_list = url_list[:20]
len(url_list)

20

In [18]:
url_list[:3]

['20190401012911/https://www.nbcnews.com/politics',
 '20190401014736/https://www.nbcnews.com/politics',
 '20190401045810/https://www.nbcnews.com/politics']

In [30]:
len(url_list)

20

In [19]:
## Creating a loop to scrape from all pages

news_title = []
news_source = []
news_link = []

# Use len(url_list) instead of pages = [str(i) for i in range(1,371)]

reqs = 0

print('\n\nStarting to scrape all pages from Links...\n\n')

start_time = time()

for url in url_list:
    
    full_url = 'https://web.archive.org/web/'+url
    
    #open page
    try:
        pg = rq.get(full_url).text
    except urllib.error.HTTPError as e:
        print('Error: {}'.format(e))
    except urllib.error.URLError as e:
        print('Error: {}'.format(e.reason))
        
    sleep(randint(10,20))
    reqs +=1
    
    # Calculate elapsed time between requests
    elapsed_time = time() - start_time
    print('Request (title/source/link): {}; Frequency: {} requests/s'.format(reqs,reqs/elapsed_time))
    

    
    #Break once the max pages is reached
    if reqs > len(url_list):
        warn('No. of requests was greater than expected')
        break
        
    # parse html using beautifulsoup and store in soup
    soup = bs(pg,'html.parser')
    
    #find all news containers
    articles = soup.find_all('article')
    
    # parse through news containers to get info
    for article in articles:
        try:
            
            if article != None:
                #title and link
                if article.find_all('h2') != None:
                    #get news title
                    title = article.find_all('h2')[1].a.text 
                    #get individual news article link
                    link = article.find_all('h2')[1].a['href'] 
                else:
                    title = 'N/A'
                    link = 'N/A'
                # source
                source = 'NBC News'

            # Append data elements to lists
            news_title.append(title)
            news_source.append(source)
            news_link.append(link)
        except:
            e = sys.exc_info()[0]
            print(e)
 

## Creating a loop to scrape summary from links


news_summary = []
summ_link = []

reqs = 0

print('\n\nStarting to collect Summaries from Links...\n\n')

start_time = time()

for url in news_link:
    
    #open page
    try:
        pg = rq.get(url).text
    except urllib.error.HTTPError as e:
        print('Error: {}'.format(e))
    except urllib.error.URLError as e:
        print('Error: {}'.format(e.reason))
        
    sleep(randint(10,20))
    reqs +=1
    
    # Calculate elapsed time between requests
    elapsed_time = time() - start_time
    print('Request (Summary/Link): {}; Frequency: {} requests/s'.format(reqs,reqs/elapsed_time))
    

    
    #Break once the max pages is reached
    if reqs > len(news_link):
        warn('No. of requests was greater than expected')
        break
        
    # parse html using beautifulsoup and store in soup
    soup = bs(pg,'html.parser')
    
    #find all news containers
    article = soup.find('div',attrs={'class':'article container___2EGEI'})
    try:
        if article.div != None:
            summ = article.div.text
        else:
            summ = 'N/A'
        
        news_summary.append(summ)
        summ_link.append(url)
    except:
        e = sys.exc_info()[0]
        print(e)

Request: 1; Frequency: 0.0480302866150198 requests/s
Request: 2; Frequency: 0.062244459490794414 requests/s
Request: 3; Frequency: 0.06743260090394937 requests/s
Request: 4; Frequency: 0.06620477660228226 requests/s
Request: 5; Frequency: 0.06086954777723498 requests/s
Request: 6; Frequency: 0.05716454569749637 requests/s
Request: 7; Frequency: 0.05704590761253082 requests/s
Request: 8; Frequency: 0.05684834652535762 requests/s
Request: 9; Frequency: 0.05712598835998205 requests/s
Request: 10; Frequency: 0.0556185131794978 requests/s
Request: 11; Frequency: 0.054615863971803166 requests/s
Request: 12; Frequency: 0.0548846847991084 requests/s
Request: 13; Frequency: 0.05211359821713112 requests/s
Request: 14; Frequency: 0.0518194329409173 requests/s
Request: 15; Frequency: 0.052704649724257716 requests/s
Request: 16; Frequency: 0.05283104546800106 requests/s
Request: 17; Frequency: 0.05312164348803636 requests/s
Request: 18; Frequency: 0.05273860255611554 requests/s
Request: 19; Frequen

KeyboardInterrupt: ignored

In [29]:
# Print Results

summary_ct = len(news_summary)

print('\n\nArticles: Title, Source and Link')
print('================================\n')

for i, a_title in enumerate(news_title):
  print(f'\nArticle #{i} ------------------------------')
  print(f'  Title: {news_title[i]}')
  print(f' Source: {news_source[i]}')
  print(f'   Link: {news_link[i]}')
  if i < summary_ct:
    print(f'Summary: {news_summary[i]}')
  # print(f'   Link: {summ_link[i]}')


print('\n\nArticles: Summary and Link')
print('================================\n')

for i, a_summary in enumerate(news_summary):
  print(f'\nArticle #{i}')
  print(f'Summary: {news_summary[i]}')
  print(f'   Link: {summ_link[i]}')






Articles: Title, Source and Link


Article #0 ------------------------------
  Title: Most Americans don't think Trump is in the clear yet on Russia, new poll finds
 Source: NBC News
   Link: https://web.archive.org/web/20190401012911/https://www.nbcnews.com/politics/meet-the-press/poll-after-mueller-summary-americans-are-still-wait-see-mode-n989061
Summary: Most Americans don't think Trump is in the clear yet on Russia, new poll findsPresident Trump's approval remains stable and a third of voters say they don't know whether the summary of Mueller's findings clears him of wrongdoing in a new NBC News/WSJ poll.

Article #1 ------------------------------
  Title: Beto O'Rourke kicks off campaign in Texas with a focus on border roots
 Source: NBC News
   Link: https://web.archive.org/web/20190401012911/https://www.nbcnews.com/politics/politics-news/beto-o-rourke-kicks-three-texas-rallies-focus-border-n989206
Summary: Beto O'Rourke kicks off three Texas rallies with a focus on the border