In [1]:
import json
import pandas as pd
import numpy as np
from random import random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, WebDriverException
from time import sleep
from os.path import exists

from pprint import pprint
from tqdm.notebook import tqdm

from retrying import retry

##########################################################################################################################################
##########################################################################################################################################
##########################################################################################################################################

def wait(min, max):
    '''Pause for a random duration drawn uniformly between `min` and `max` seconds.'''
    span = max - min
    delay = min + span * random()
    sleep(delay)

def reinitialise():
    '''Start a fresh, signed-in Chrome session on newspapers.com.

    Shuts down the previous browser (if there is one) so that session cookies
    are wiped, launches a new one, dismisses the cookie banner when it shows
    up, and signs in.

    The account credentials are read from the environment variables
    NEWSPAPERS_USERNAME and NEWSPAPERS_PASSWORD so that they do not have to be
    stored in the notebook.

    Returns:
        The new webdriver instance. It is also stored in the global `bro`,
        which resetView() relies on.

    Raises:
        RuntimeError: if either credential variable is missing or empty.
    '''
    import os  # local import so this block does not depend on the import cell

    global bro

    username = os.environ.get('NEWSPAPERS_USERNAME')
    password = os.environ.get('NEWSPAPERS_PASSWORD')
    if not username or not password:
        raise RuntimeError(
            'set the NEWSPAPERS_USERNAME and NEWSPAPERS_PASSWORD environment variables '
            'before starting the scraper'
        )

    # quit() ends the whole WebDriver session, whereas close() only closes the
    # current window.
    try:
        bro.quit()
    except Exception:
        # first call (no global `bro` yet) or the old session is already gone
        pass

    bro = webdriver.Chrome('chromedriver.exe')
    bro.maximize_window()
    bro.implicitly_wait(3)

    homepage = 'https://www.newspapers.com/'
    bro.get(homepage)

    try:
        # click on 'accept all' cookies (the banner lives in its own iframe)
        bro.switch_to.frame('sp_message_iframe_604967')
        bro.find_element_by_xpath('//button[@title="Accept All"]').click()
    except Exception:
        # best effort: the banner is not always shown
        pass
    finally:
        # always return to the main document, otherwise a half-failed attempt
        # leaves the driver inside the iframe and the sign-in link is not found
        bro.switch_to.default_content()

    # sign in
    bro.find_element_by_xpath('//*[@id="signinlink"]').click()  # sign in button

    loginFields = bro.find_element_by_id('username')
    loginFields.clear()
    loginFields.send_keys(username)
    # the password field did not accept any identifier or xpath -> tab into it from the username field
    loginFields.send_keys('\t' + password)

    signinButton = bro.find_element_by_xpath('//*[@id="SignInModal"]/div/div[2]/div/div[3]/div/button')
    signinButton.click()
    sleep(5)

    return bro

def retryOnTheseExceptions(exceptionToTest):
    '''Predicate for @retry: True when the exception is one of the selenium errors worth retrying.'''
    retryable = (WebDriverException, ElementClickInterceptedException)
    return isinstance(exceptionToTest, retryable)

def backup(dic, path):
    '''Write `dic` to `path` as indented JSON without ever leaving a half-written file.

    The data is first written to `<path>.tmp` and then moved over `path` with
    os.replace(). If the dump is interrupted or fails, the previous checkpoint
    is therefore still intact (a stale `.tmp` file may be left behind).

    Args:
        dic: JSON-serialisable object to store.
        path: destination file.

    Raises:
        TypeError / ValueError: if `dic` cannot be serialised by json.dump.
        OSError: if the file cannot be written or replaced.
    '''
    import os  # local import so this block does not depend on the import cell

    tmpPath = f'{path}.tmp'
    with open(tmpPath, 'w') as f:
        json.dump(obj=dic, fp=f, indent=4)

    # only reached when the dump completed
    os.replace(tmpPath, path)

def restore(path):
    '''Read the JSON file at `path` and return the decoded object.'''
    with open(path) as handle:
        contents = handle.read()
    return json.loads(contents)

def unravelCoverage(coverage, path='coverage_over_time.xlsx'):
    '''Flatten the nested coverage dict into one row per newspaper and write it to Excel.

    Args:
        coverage: dict shaped as
            {state: {city: {newspaper: {'data': {'YYYY-MM': count}, 'location': ...}}}}.
        path: destination workbook; defaults to the original hard-coded file name.

    The sheet has the columns state, city, newspaper, location followed by one
    column per month from 1690-01 to 2019-12. Months without data stay empty.
    Dates outside that grid are kept as extra columns at the end. An empty
    `coverage` produces a sheet that contains only the header.
    '''
    metaColumns = ['state', 'city', 'newspaper', 'location']
    # the month grid is the same for every newspaper, so build it once
    dates = [f'{year}-{month:02}' for year in range(1690, 2020) for month in range(1, 13)]
    columns = metaColumns + dates

    records = []
    for state, cities in coverage.items():
        for city, newspapers in cities.items():
            for newspaper, entry in newspapers.items():
                # start from an all-NaN row so that every record carries the full,
                # ordered column set; plain dicts avoid writing strings into a
                # float-typed Series
                record = dict.fromkeys(columns, np.nan)
                record['state'] = state
                record['city'] = city
                record['newspaper'] = newspaper

                # scrape() uses {} as placeholder and NaN when the lookup failed;
                # anything that is not text becomes an empty cell
                location = entry['location']
                record['location'] = location if isinstance(location, str) else np.nan

                record.update(entry['data'])
                records.append(record)

    # pd.concat([]) raised on an empty dict; an explicit empty frame keeps the header
    df = pd.DataFrame(records) if records else pd.DataFrame(columns=columns)
    df.to_excel(path)

def unravelSpectacleMentions(spectacleMentions, path='spectacleMentions_over_time.xlsx'):
    '''Flatten the nested search-hit dict into one row per newspaper and write it to Excel.

    Args:
        spectacleMentions: dict shaped as
            {state: {city: {newspaper: {'YYYY-MM': hits}}}}.
        path: destination workbook; defaults to the original hard-coded file name.

    The sheet has the columns state, city, newspaper followed by one column per
    month from 1690-01 to 2019-12. Months without data stay empty. Dates
    outside that grid are kept as extra columns at the end. An empty
    `spectacleMentions` produces a sheet that contains only the header.
    '''
    metaColumns = ['state', 'city', 'newspaper']
    # the month grid is the same for every newspaper, so build it once
    dates = [f'{year}-{month:02}' for year in range(1690, 2020) for month in range(1, 13)]
    columns = metaColumns + dates

    records = []
    for state, cities in spectacleMentions.items():
        for city, newspapers in cities.items():
            for newspaper, monthlyHits in newspapers.items():
                # start from an all-NaN row so that every record carries the full,
                # ordered column set; plain dicts avoid writing strings into a
                # float-typed Series
                record = dict.fromkeys(columns, np.nan)
                record['state'] = state
                record['city'] = city
                record['newspaper'] = newspaper
                record.update(monthlyHits)
                records.append(record)

    # pd.concat([]) raised on an empty dict; an explicit empty frame keeps the header
    df = pd.DataFrame(records) if records else pd.DataFrame(columns=columns)
    df.to_excel(path)

def resetView():
    '''Click the "previous" arrow twice to reset the view (so that column numbers don't get shifted).

    Best effort: if the arrow is missing or cannot be clicked, the error is
    ignored and the view is left as it is. Only ordinary exceptions are
    swallowed, so KeyboardInterrupt still stops the scraper.
    '''
    global bro

    try:
        for _ in range(2):
            bro.find_element_by_xpath('//*[@id="bc_previous"]').click()
    except Exception:
        pass
    

##########################################################################################################################################
##########################################################################################################################################
##########################################################################################################################################




In [2]:
# xpath of the n-th column of the browse view, as used by scrape():
# 2 = states, 3 = cities, 4 = newspapers, 5 = years, 6 = months, 7 = publications of the month
_BROWSE_COLUMN_XPATH = '/html/body/div[1]/div/div[2]/div/div[1]/div/div/div[{}]/div[2]/div/div/div'


def _browseLinks(bro, column):
    '''Return [(href, text), ...] for every link in the given browse column of the current page.

    Everything is read up front because the elements cannot be used any more
    once the browser has navigated to another page.
    '''
    elements = bro.find_elements_by_xpath(_BROWSE_COLUMN_XPATH.format(column) + '/a')
    return [(element.get_attribute('href'), element.text) for element in elements]


def _parseMatchCount(resultsDescription, ceiling=180_000):
    '''Turn the search summary text ("1,234 Matches ...", "No Matches ...") into a number.

    Returns 0 for "No", NaN for counts above `ceiling`, the integer otherwise.
    NOTE(review): the original code discards counts above 180,000 without
    saying why - presumably such totals mean the search was not restricted to
    the month being browsed; confirm.

    Raises:
        ValueError: if the text before ' Matches' is not a number.
    '''
    resultsString = resultsDescription.partition(' Matches')[0].replace(',', '')
    if resultsString == 'No':
        return 0

    numberOfResults = int(resultsString)
    return np.nan if numberOfResults > ceiling else numberOfResults


def _searchHits(bro, query='spectacles OR glasses'):
    '''Run `query` in the search box of the current page and return the number of matches (NaN if unreadable).'''
    keywordField = bro.find_element_by_id('searchText')
    keywordField.clear()
    keywordField.send_keys(query)
    keywordField.send_keys(Keys.ENTER)
    wait(2, 5)

    try:
        resultsDescription = bro.find_element_by_xpath('//*[@id="SearchResults"]/div[1]/p').text
        return _parseMatchCount(resultsDescription)
    except Exception:
        # summary missing or not parseable -> record the month as unknown
        return np.nan


# wait 2^x seconds between each retry, up to 1h, then 1h forever afterwards
@retry(wait_exponential_multiplier=1_000, wait_exponential_max=60*60*1_000, retry_on_exception=retryOnTheseExceptions)
# wait between 20 and 40min to restart
# @retry(wait_random_min=20*60*1000, wait_random_max=40*60*1000)
def scrape():
    '''Scrape the 'browsing' section of newspapers.com.

    Collects, by state, city and newspaper:
        - the number of publications per month,
        - the number of hits for 'spectacles OR glasses' in that month, and
        - the location of the newspaper.

    Flow:
        1) read existing data from the JSON files if they exist
           (the dicts are saved after each completed newspaper)

        2) reinitialise the browser (clears cookies and everything)

        3) loop over states, cities and newspapers, reusing existing data and
           creating empty dicts where there is none

        4) at newspaper level,
            if the newspaper already has an entry:
                skip to the next newspaper
                (and thus eventually to the next city / state once those are complete)
            otherwise:
                for every year and month, get the number of publications and
                the number of search hits
                once all years and months are collected:
                    get the location of the newspaper
                    back up both dicts to file

        5) once complete, close the browser, unravel the dicts into DataFrames
           and save them to file.

    The browser is restarted roughly every 400 requests. When a selenium
    WebDriverException escapes, the @retry decorator runs the whole function
    again; it then resumes from the JSON files. Other exceptions propagate.
    '''
    RESTART_EVERY = 400  # page requests between two browser restarts

    coverage = restore('coverage_dict.json') if exists('coverage_dict.json') else {}
    spectacleMentions = restore('spectacleMentions_dict.json') if exists('spectacleMentions_dict.json') else {}

    requestCounter = 0

    bro = reinitialise()
    bro.get('https://www.newspapers.com/browse/united-kingdom/')

    ################################################################### states ##############################################################################
    # states are processed in reverse page order
    for stateLink, stateName in tqdm(_browseLinks(bro, 2)[::-1], position=0, desc='states done: ', leave=False):

        # get existing values if they exist, otherwise put an empty dict
        stateCoverage = coverage.setdefault(stateName, {})
        stateMentions = spectacleMentions.setdefault(stateName, {})

        bro.get(stateLink)
        wait(2, 10)

        ################################################################### cities ##############################################################################
        for cityLink, cityName in tqdm(_browseLinks(bro, 3), position=1, desc='cities done: ', leave=False):

            # get existing values if they exist, otherwise put an empty dict
            cityCoverage = stateCoverage.setdefault(cityName, {})
            cityMentions = stateMentions.setdefault(cityName, {})

            bro.get(cityLink)
            wait(2, 10)

            ################################################################### newspapers ##############################################################################
            # the xpath hits both the newspaper title and the little info icon next to it
            # -> even positions are titles, odd positions are info links
            newspaperElements = bro.find_elements_by_xpath(_BROWSE_COLUMN_XPATH.format(4) + '/a')
            newspapers = [
                (title.get_attribute('href'), title.text, info.get_attribute('href'))
                for title, info in zip(newspaperElements[::2], newspaperElements[1::2])
            ]
            for newspaperLink, newspaperName, newspaperInfoLink in newspapers:

                # if the newspaper already has an entry, skip to the next newspaper;
                # otherwise create empty containers to fill below
                if cityCoverage.get(newspaperName) is not None:
                    continue
                cityCoverage[newspaperName] = {
                    'data': {},
                    'location': {},
                    }
                cityMentions[newspaperName] = {}

                # one request for the newspaper page; same restart rule as in the month
                # loop so the two checks can no longer trigger back-to-back restarts
                requestCounter += 1
                if requestCounter >= RESTART_EVERY:
                    requestCounter = 0
                    bro = reinitialise()

                bro.get(newspaperLink)
                wait(2, 10)
                resetView()

                ################################################################### years ##############################################################################
                for yearLink, yearName in tqdm(_browseLinks(bro, 5), position=2, desc='years done: ', leave=False):

                    bro.get(yearLink)
                    wait(2, 10)
                    resetView()

                    ################################################################### months ##############################################################################
                    for monthLink, _monthName in _browseLinks(bro, 6):

                        bro.get(monthLink)
                        wait(2, 5)
                        resetView()

                        # the two characters before the last one of the url are used as month number
                        month = bro.current_url[-3:-1]
                        date = f'{yearName}-{month}'

                        ######## get monthly number of publications ########
                        publications = bro.find_elements_by_xpath(_BROWSE_COLUMN_XPATH.format(7))
                        cityCoverage[newspaperName]['data'][date] = len(publications)

                        ######## get number of spectacle results ########
                        cityMentions[newspaperName][date] = _searchHits(bro)

                        # restart client every 400 requests (two each time: initial page request, then search)
                        requestCounter += 2
                        if requestCounter >= RESTART_EVERY:
                            requestCounter = 0
                            bro = reinitialise()

                ######## get location ########
                bro.get(newspaperInfoLink)

                try:
                    location = bro.find_element_by_xpath('//*[@id="titleheader"]/div/div[1]/ul/li[1]').text
                    cityCoverage[newspaperName]['location'] = location
                except Exception:
                    cityCoverage[newspaperName]['location'] = np.nan

                # checkpoint: everything collected so far, including this newspaper
                backup(coverage, path='coverage_dict.json')
                backup(spectacleMentions, path='spectacleMentions_dict.json')

    # scraping is finished: release the browser instead of leaving it running
    try:
        bro.quit()
    except Exception:
        pass

    unravelCoverage(coverage)
    unravelSpectacleMentions(spectacleMentions)

                

# run the scraper; it reloads the JSON checkpoint files first when they exist
scrape()

In [None]:
# Optional: rebuild the Excel files from the saved JSON checkpoints without scraping again (uncomment to run).
# coverage = restore('coverage_dict.json') if exists('coverage_dict.json') else {}
# spectacleMentions = restore('spectacleMentions_dict.json') if exists('spectacleMentions_dict.json') else {}

# unravelCoverage(coverage)
# unravelSpectacleMentions(spectacleMentions)