In [64]:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import sys
import os
from pathlib import Path
import re
import queue
import datetime, time
import traceback

from urllib.parse import urlparse
from urllib.parse import unquote

import requests
import lxml
from lxml import html

import multiprocessing as mp

import ipywidgets as widgets
#from ipywidgets import interact, interact_manual
from ipywidgets import *

In [65]:
# Report the name of the function where an exception occurred, and its details.
def print_exception_content(function_name, ex):
    """Print a one-line notice describing a caught exception.

    Args:
        function_name: Name of the function in which the exception was caught.
        ex: The exception instance; its class name and string form are printed.
    """
    exception_type = ex.__class__.__name__
    # The message text is Japanese for "An exception occurred".
    notice = '例外が発生しました ({0}) {1}, {2}'.format(function_name, exception_type, ex)
    print(notice)

def findAllHref(driver):
    """Return the elements matched by the XPath ``//a[@href]``.

    Args:
        driver: Object exposing ``find_elements(by, value)`` -- presumably a
            Selenium WebDriver.

    Returns:
        Whatever ``driver.find_elements`` returns for that XPath.

    Raises:
        Exception: Any error from the lookup is reported through
            ``print_exception_content`` and then re-raised.
    """
    anchors_with_href = "//a[@href]"
    try:
        return driver.find_elements(By.XPATH, anchors_with_href)
    except Exception as ex:
        print_exception_content(sys._getframe().f_code.co_name, ex)
        raise
def isInternaURL(baseURL, tgtURL):
    """Tell whether ``tgtURL`` lies under ``baseURL``, ignoring http vs. https.

    A leading ``http:`` or ``https:`` is removed from both URLs and the
    remainder of the target is tested for starting with the remainder of
    the base.

    Args:
        baseURL: URL prefix that defines the site being checked.
        tgtURL: URL to classify.

    Returns:
        True when the scheme-less target starts with the scheme-less base.

    Raises:
        Exception: Any error is reported through ``print_exception_content``
            and then re-raised.
    """
    try:
        scheme_prefix = re.compile('^https?:')
        base_without_scheme = scheme_prefix.sub('', baseURL)
        target_without_scheme = scheme_prefix.sub('', tgtURL)
        return target_without_scheme.startswith(base_without_scheme)
    except Exception as ex:
        print_exception_content(sys._getframe().f_code.co_name, ex)
        raise

# Make an absolute URL from a link found on the page being browsed.
def makeAbsoluteURL(currentBrowsingURL, strLinkPath):
    """Resolve a link against the URL of the page it was found on.

    Links that already carry a scheme (``http:``, ``mailto:`` ...) are
    returned unchanged. Scheme-less links are resolved with
    ``urllib.parse.urljoin``, so root-relative (``/a/b``), protocol-relative
    (``//host/a``), document-relative (``page.html``, ``../x``), query-only
    and fragment-only links are all handled.

    Args:
        currentBrowsingURL: Absolute URL of the page that contains the link.
        strLinkPath: The link as written in the page.

    Returns:
        The absolute URL as a string.

    Raises:
        Exception: Any error is reported through ``print_exception_content``
            and then re-raised.
    """
    from urllib.parse import urljoin  # only this helper needs it

    try:
        if urlparse(strLinkPath).scheme != '':
            # Already absolute: hand it back untouched.
            return strLinkPath
        # Previously every scheme-less link was glued onto the site root,
        # which produced wrong URLs for document-relative links.
        return urljoin(currentBrowsingURL, strLinkPath)
    except Exception as ex:
        print_exception_content(sys._getframe().f_code.co_name, ex)
        raise

# Perform a GET request.
def doRequestGET(target_url, argAuth=None, argVerify=None, timeout=None):
    """Issue a single GET request without following redirects.

    Args:
        target_url: URL to request.
        argAuth: Value for the ``auth`` keyword of ``requests.get``; omitted
            from the call when None.
        argVerify: Value for the ``verify`` keyword of ``requests.get``;
            omitted from the call when None.
        timeout: Optional value for the ``timeout`` keyword of
            ``requests.get``. Omitted when None, which keeps the library
            default of waiting without a time limit.

    Returns:
        The object returned by ``requests.get``.

    Raises:
        Exception: Any error is reported through ``print_exception_content``
            and then re-raised.
    """
    try:
        # Redirects are never followed; the caller gets the first response.
        request_options = {'allow_redirects': False}
        if argAuth is not None:
            request_options['auth'] = argAuth
        if argVerify is not None:
            request_options['verify'] = argVerify
        if timeout is not None:
            request_options['timeout'] = timeout
        return requests.get(target_url, **request_options)
    except Exception as ex:
        print_exception_content(sys._getframe().f_code.co_name, ex)
        raise

# Check the HTTP response of a GET request.
def getResponse(target_url, basic_auth_id=None, basic_auth_pass=None, boolVerifySsl=None):
    """Translate the caller's options into ``doRequestGET`` arguments and call it.

    Args:
        target_url: URL to request.
        basic_auth_id: User name; credentials are passed on only when both
            this and ``basic_auth_pass`` are not None.
        basic_auth_pass: Password paired with ``basic_auth_id``.
        boolVerifySsl: True or False to set certificate verification
            explicitly. It is honoured only for URLs starting with
            ``https:``; any other value leaves the setting unspecified.

    Returns:
        Whatever ``doRequestGET`` returns.

    Raises:
        Exception: Any error is reported through ``print_exception_content``
            and then re-raised.
    """
    try:
        has_credentials = basic_auth_id is not None and basic_auth_pass is not None
        argAuth = (basic_auth_id, basic_auth_pass) if has_credentials else None

        argVerify = None
        if target_url.startswith('https:'):
            # Only an explicit True/False is forwarded.
            for explicit_choice in (True, False):
                if boolVerifySsl == explicit_choice:
                    argVerify = explicit_choice
                    break

        return doRequestGET(target_url, argAuth, argVerify)
    except Exception as ex:
        print_exception_content(sys._getframe().f_code.co_name, ex)
        raise

In [66]:
# prepare for multiprocessing

from modules import settings, statics
from modules import workers

statics.init()

# --- Widgets used to choose the run-time options ---

# Number of worker processes: from 1 up to the CPU count, defaulting to all CPUs.
proc_num_slider = widgets.IntSlider(
    min=1,
    max=mp.cpu_count(),
    step=1,
    description='CPU num to use:',
    value=mp.cpu_count(),
    style={'description_width': 'initial'},
)

# Labels of the two input modes; the confirmation cell compares the selected
# radio button's value against these strings.
str_radio_title1 = 'Target URL'
str_radio_title2 = 'URL List file (under "url-list" dir)'

# Each mode gets its own single-option radio button and its own text box.
radio_url1 = widgets.RadioButtons(options=[str_radio_title1])
radio_url2 = widgets.RadioButtons(options=[str_radio_title2])
text01 = widgets.Text()
text02 = widgets.Text()

# Clear the selection so that neither mode is chosen initially.
radio_url1.index = None
radio_url2.index = None

# The radio widget picked most recently; assigned by the observer callbacks.
radio_selected = None

def radio_url1_observer(sender):
    """React to a value change of ``radio_url1``.

    Deselects ``radio_url2``, empties ``text02`` and records the widget that
    fired the change in the global ``radio_selected``.

    Args:
        sender: Change notification; its ``'owner'`` entry is stored as the
            selected widget.
    """
    global radio_selected

    # Detach the other button's callback while its selection is cleared, so
    # that the reset below does not trigger it.
    radio_url2.unobserve(radio_url2_observer, names=['value'])
    radio_url2.index = None

    text02.value = ''
    radio_selected = sender['owner']

    radio_url2.observe(radio_url2_observer, names=['value'])
    
def radio_url2_observer(sender):
    """React to a value change of ``radio_url2``.

    Deselects ``radio_url1``, empties ``text01`` and records the widget that
    fired the change in the global ``radio_selected``.

    Args:
        sender: Change notification; its ``'owner'`` entry is stored as the
            selected widget.
    """
    global radio_selected

    # Detach the other button's callback while its selection is cleared, so
    # that the reset below does not trigger it.
    radio_url1.unobserve(radio_url1_observer, names=['value'])
    radio_url1.index = None

    text01.value = ''
    radio_selected = sender['owner']

    radio_url1.observe(radio_url1_observer, names=['value'])

# Register the callbacks on each radio button's 'value' so that a change on
# one button runs the observer that clears the other button and its text box.
radio_url1.observe(radio_url1_observer, names=['value'])
radio_url2.observe(radio_url2_observer, names=['value'])

### set running options

In [67]:
#print('Set running options')
# Show the process-count slider first.
display(proc_num_slider)

# One row per input mode: the radio button followed by its text box.
# (The names say "vbox" but both rows are horizontal boxes.)
vbox_target_url = widgets.HBox(
    children=[radio_url1, text01],
    layout=Layout(flex='flex-shrink'),
)
vbox_url_list = widgets.HBox(
    children=[radio_url2, text02],
    layout=Layout(flex='flex-shrink'),
)

# Last expression of the cell, so the stacked rows are rendered as its output.
widgets.VBox(children=[vbox_target_url, vbox_url_list])

IntSlider(value=8, description='CPU num to use:', max=8, min=1, style=SliderStyle(description_width='initial')…

VBox(children=(HBox(children=(RadioButtons(options=('Target URL',), value=None), Text(value='')), layout=Layou…

### confirm options

In [71]:
# Read the options chosen in the widgets and echo them back for confirmation.
num_processors = proc_num_slider.value

# Exactly one of these is set below, depending on the selected input mode.
strBaseURL = None
strURLListFile = None

# radio_selected is assigned only by the radio-button observers, so it is
# still None when neither button has been clicked. Fail with a clear message
# instead of an AttributeError on `.value`.
if radio_selected is None:
    raise ValueError('No input mode selected: choose "{}" or "{}" first.'.format(
        str_radio_title1, str_radio_title2))

strConfirmMsg = 'Program will run with following parameters\n'
strConfirmMsg += '- Running process : {}\n'.format(num_processors)

if radio_selected.value == str_radio_title1:
    # Surrounding whitespace from typing or pasting is not part of the URL.
    strBaseURL = text01.value.strip()
    if not strBaseURL:
        raise ValueError('"{}" is selected but its text box is empty.'.format(str_radio_title1))
    strConfirmMsg += '- Target URL : {}\n'.format(strBaseURL)
elif radio_selected.value == str_radio_title2:
    strURLListFile = text02.value.strip()
    if not strURLListFile:
        raise ValueError('"{}" is selected but its text box is empty.'.format(str_radio_title2))
    strConfirmMsg += '- URL List file : url-list/{}\n'.format(strURLListFile)
else:
    # Neither label matched; without a target the run would have nothing to check.
    raise ValueError('Unexpected input mode: {!r}'.format(radio_selected.value))

print(strConfirmMsg)

Program will run with following parameters
- Running process : 8
- URL List file : url-list/url-list.txt



### subroutine to reformat the Console Log as TSV

In [72]:
def printOutHashTable(dictOutput, strResultDirname, strDestFile):
    """Append the grouped console-log entries to a tab-separated file.

    A header row is written first. For every (log level, message) pair the
    first URL is written on a full row; each further URL of the same pair is
    written on a row whose first two columns are empty.

    Args:
        dictOutput: Mapping ``{log level: {message: [url, ...]}}``.
        strResultDirname: Directory that holds the destination file.
        strDestFile: Name of the destination file inside that directory.

    Note:
        The file encoding is read from the module-level ``strFileEncoding``.
    """
    full_row = '{}\t{}\t{}\n'
    dest_path = os.path.join(strResultDirname, strDestFile)

    with open(dest_path, '+a', encoding=strFileEncoding) as fp:
        fp.write(full_row.format('LogLevel', 'Message', 'URL'))

        for loglevel, messages in dictOutput.items():
            for msg, urls in messages.items():
                for position, url in enumerate(urls):
                    if position == 0:
                        fp.write(full_row.format(loglevel, msg, url))
                    else:
                        # Continuation row: only the URL column is filled.
                        fp.write('\t\t{}\n'.format(url))
    
    
def ReformatConsoleLogOutputFileAsTSV(str_start_datetime, str_file_in):
    """Group the console-log file by level and message and write it out as TSV.

    Reads ``results-<str_start_datetime>/<str_file_in>`` line by line. Lines
    of the form ``level, "message", source, timestamp, URL`` are collected
    into ``{level: {message: [url, ...]}}`` with each URL kept once per
    message, in first-seen order. Other lines (for example the unquoted
    header row) are skipped. The result is handed to ``printOutHashTable``,
    which writes ``07-02.formatted_console_logs-<str_start_datetime>.csv``
    in the same directory.

    Args:
        str_start_datetime: Timestamp string used in the directory and file
            names.
        str_file_in: Name of the console-log file inside the result directory.

    Note:
        The file encoding is read from the module-level ``strFileEncoding``.
    """
    strResultDirname = 'results-' + str_start_datetime
    strDestFile = '07-02.formatted_console_logs-' + str_start_datetime + '.csv'

    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence, which Python warns about. The pattern itself is unchanged.
    log_line_pattern = re.compile(r'^([^,]+),\s*"(.+)",\s*([^,]+),\s*([^,]+),\s*([^,]+)$')

    dictOutput = {}
    with open(os.path.join(strResultDirname, str_file_in), 'r', encoding=strFileEncoding) as fp:
        for line in fp:
            m = log_line_pattern.match(line)
            if not m:
                continue

            # Columns 3 and 4 (source, timestamp) are not needed here.
            strLogLevel = m.group(1).strip()
            strLogMsg = m.group(2).strip()
            strPageUrl = m.group(5).strip()

            urls = dictOutput.setdefault(strLogLevel, {}).setdefault(strLogMsg, [])
            if strPageUrl not in urls:
                urls.append(strPageUrl)

    printOutHashTable(dictOutput, strResultDirname, strDestFile)

### main

In [73]:
if __name__ == '__main__':
    # Entry point of the link check. Reads num_processors, strBaseURL and
    # strURLListFile, which are set by the "confirm options" cell.

    datetime_start = datetime.datetime.now()
    str_start_datetime = datetime_start.strftime('%Y%m%d-%H%M%S')
    print('started at {}'.format(str_start_datetime))

    # strFileEncoding is also read as a global by printOutHashTable and
    # ReformatConsoleLogOutputFileAsTSV.
    strFileEncoding      = settings.FILE_ENCODING
    strResultDirname     = 'results-' + str_start_datetime
    strFileSummary       = '01.summary-' + str_start_datetime + '.txt'
    strFileBrowsedPages  = '02.browsed_pages-' + str_start_datetime + '.csv'
    strFileConsoleLogs   = '07.consolelogs-' + str_start_datetime + '.csv'

    # Create the directory that receives the output files.
    Path(strResultDirname).mkdir(exist_ok=True, parents=True)

    # Write the header line of the "Browsed Pages" file.
    strTitle = '{},{},"{}","{}","{}","{}","{}"'.format(
        'URL',
        'Title',
        'Response Code',
        'Response Message',
        'Redirect To',
        'Response Code (RedirectTo)',
        'Response Message (RedirectTo)')
    with open(os.path.join(strResultDirname, strFileBrowsedPages), 'a+', encoding=strFileEncoding) as fp:
        fp.write(strTitle + "\n")

    # Write the header line of the "Console Logs" file.
    strTitle = '{},{},{},{},{}'.format('level', 'message', 'source', 'timestamp', 'URL')
    with open(os.path.join(strResultDirname, strFileConsoleLogs), 'a+', encoding=strFileEncoding) as fp:
        fp.write(strTitle + "\n")

    # Integer counters handed to every worker through the pool initializer.
    numInvalidLink   = mp.Value('i', 0)
    numExceptions    = mp.Value('i', 0)
    numHealthyLink   = mp.Value('i', 0)
    numExternalLinks = mp.Value('i', 0)
    numBrowsedPages  = mp.Value('i', 0)
    numConsoleSevere = mp.Value('i', 0)
    numConsoleWarn   = mp.Value('i', 0)
    numCriticalExceptions = mp.Value('i', 0)

    # NOTE(review): the manager is never shut down; confirm whether the
    # proxies below are meant to stay usable after this cell has finished.
    manager = mp.Manager()
    lock = manager.Lock()
    # Manager lists/dict are used instead of Manager.Queue() because a queue
    # cannot be asked whether it already contains an item.
    q = manager.list()
    q_browsed_urls  = manager.list()
    q_checked_links = manager.dict()  # url -> status code

    flagRunningMode = ''
    if strBaseURL:
        # Append "/" when the URL ends with a letter or digit and is not an
        # .htm/.html/.pdf document.
        if re.match('.*[.](html?|pdf)$', strBaseURL) is None \
                and re.match('.*[A-Za-z0-9]$', strBaseURL):
            strBaseURL = strBaseURL + "/"

        q.append(strBaseURL)

        # Traverse the site starting from the base URL.
        flagRunningMode = statics.RUNNING_MODE_TRAVERSAL

    elif strURLListFile:
        # URLs are given by a file, one per line. Blank lines are skipped;
        # previously the first blank line ended the list and every URL after
        # it was silently dropped.
        with open(os.path.join('url-list', strURLListFile), 'r') as fp:
            for line in fp:
                url = line.strip()
                if url:
                    q.append(url)
        flagRunningMode = statics.RUNNING_MODE_URLLIST

    # One argument list per worker; all of them share the same proxies.
    arr_mgr_list = [[q, q_browsed_urls, q_checked_links, lock] for _ in range(num_processors)]

    print('Starting multiprocessing....')

    p = mp.Pool(processes=num_processors,
                initializer=workers.init_link_check_worker,
                initargs=[str_start_datetime,
                          flagRunningMode,
                          strBaseURL,
                          numInvalidLink,
                          numExceptions,
                          numHealthyLink,
                          numExternalLinks,
                          numBrowsedPages,
                          numConsoleSevere,
                          numConsoleWarn,
                          numCriticalExceptions])
    try:
        p.starmap(workers.link_check_worker, arr_mgr_list)
    finally:
        # Release the pool: stop accepting work and wait for the worker
        # processes to exit, so that re-running the cell does not pile up
        # idle processes.
        p.close()
        p.join()

    print('DOne multiprocessing....')

    datetime_end = datetime.datetime.now()
    duration = datetime_end - datetime_start
    print('end at {}.'.format(datetime_end.strftime('%Y%m%d-%H%M%S')))

    ReformatConsoleLogOutputFileAsTSV(str_start_datetime, strFileConsoleLogs)

    strMsg = "It took {} seconds.\n" \
             "Total healthy Links   = {}\n" \
             "Total broken Links    = {}\n" \
             "Total Exceptions      = {}\n" \
             "Total External Links  = {}\n" \
             "Total Browsed Pages   = {}\n" \
             "Total SEVERE in browser console = {}\n" \
             "Total WARNING in browser console = {}\n" \
             "Total Critical Exceptions   = {}\n".format(duration.total_seconds(),
                                                         numHealthyLink.value,
                                                         numInvalidLink.value,
                                                         numExceptions.value,
                                                         numExternalLinks.value,
                                                         numBrowsedPages.value,
                                                         numConsoleSevere.value,
                                                         numConsoleWarn.value,
                                                         numCriticalExceptions.value)

    print(strMsg)
    with open(os.path.join(strResultDirname, strFileSummary), 'a+', encoding=strFileEncoding) as fp:
        fp.write(strMsg)

started at 20210516-181748
Starting multiprocessing....
DOne multiprocessing....
end at 20210516-181941.
It took 112.534762 seconds.
Total healthy Links   = 740
Total broken Links    = 4
Total Exceptions      = 0
Total External Links  = 117
Total Browsed Pages   = 4
Total SEVERE in browser console = 5
Total Critical Exceptions   = 0

