In [1]:
import numpy as np
import time
import re

In [2]:
import datetime

In [3]:
from html.parser import HTMLParser  
from urllib.request import urlopen  
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the begining of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our colection of links:
                    self.links = self.links + [newUrl]

    # This is a new function that we are creating to get links
    # that our spider() function will call
    def getLinks(self, url):
        self.links = []
        # Remember the base URL which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or .PDFs for example)
        if 'text/html' in response.getheader('Content-Type'):
            htmlBytes = response.read()
            # Note that feed() handles Strings well, but not bytes
            # (A change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "",[]

# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxLvl):  
    pagesToVisit = list(url)
    pagesVisited = []
    numberVisited = 0
    numberLvl = 0
    lvl = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    while numberLvl < maxLvl and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited +1
#         print('Number Visited:', numberVisited, len(pagesToVisit))
        # Start from the beginning of our collection of pages to visit:
        if numberVisited == lvl+1:
            numberLvl += 1
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]

        fl = 0
#         for reg in check:
#             if re.match(reg, url) is not None:
#                 fl = 1
        for reg in check:
            if reg in url:
                fl = 1
                
        if fl == 0:
            continue
            
        if url in pagesVisited:
            continue
            
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            
            pagesVisited.append(url)
            
            if 'document' in url:
                dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                with open('files/file_' + dt + str(numberVisited), 'w') as f:
                    f.writelines(url)
                    f.writelines('')
                    f.write(data)
                continue
                
            pagesToVisit = pagesToVisit + links
            if numberVisited == lvl+1:
                print('NEW LVL BORD')
                lvl = len(pagesToVisit)
                print(lvl)
                print()
                
        except:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")
    return (pagesToVisit, pagesVisited)

In [4]:
# check = ['https?://(www\\.)?([a-z\\-_]+\\.)?ieeexplore\\.ieee\\.org/xpl/tocresult.jsp.*$', 
#          'https?://(www\\.)?([a-z\\-_]+\\.)?ieeexplore\\.ieee\\.org/document.*$', 
#          'https?://(www\\.)?([a-z\\-_]+\\.)?ieeexplore\\.ieee\\.org/xpl/RecentIssue.*$']

check = ['tocresult.jsp', 
         'document']

In [None]:
journals = ['https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83']
for journal in journals:
    parser = LinkParser()
    data, links = parser.getLinks(journal)
    print(journal, len(links))
    
    _time = time.time()
    pagesToVisit, pagesVisited = spider(links, '', 2)
    print(time.time() - _time)

https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83 960
43 Visiting: https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=4358840
 **Failed!**
78 Visiting: https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=6106034&punumber=83
 **Failed!**
79 Visiting: https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=6129825&punumber=83


In [7]:
pagesVisited

1

In [10]:
links[0]

'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83'

In [11]:
with open('files/file_' + links[0], 'w') as f:
    f.write(pagesToVisit)

FileNotFoundError: [Errno 2] No such file or directory: 'files/file_https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83'

In [8]:
pagesToVisit

'<!DOCTYPE html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="en-US">\n\t<head>\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t \n\t\t\n\t\t\n\t\t\n\t\t<meta name="Description" content="IEEE Xplore. Delivering full text access to the world\'s highest quality technical literature in engineering and technology.">\n\t\t\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n\t\t\n\t\t<!-- Disable "click" touch event 300ms delay for Chrome/Firefox on Android -->\n\t\t<meta name="viewport" content="width=device-width">\n\t\t\n\t\t<title>\n\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\tIEEE Xplore: IEEE Transactions on Image Processing\n\t\t\t\t\t\t \n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t- (\t\t\n\t\t\t\t\t\t\t\t)\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\n\t\t\t\n\t\t</title>\n\t\t\r\n\t\r\n\t\r\n\n\t\t<script src="https://s3-us-west-2.amazonaws.com/ieeeshutpages/gdpr/set

In [78]:
sm = 0
for i in pagesToVisit:
    for j in check:
        if j in i:
            sm += 1
#             print(i)
print(sm)

544


In [79]:
pagesToVisit

['https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83',
 'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83',
 'http://www.ieee.org/',
 'http://standards.ieee.org/',
 'http://spectrum.ieee.org/',
 'http://www.ieee.org/sitemap.html',
 'https://www.ieee.org/cart/public/myCart/page.html?refSite=http://ieeexplore.ieee.org&refSiteName=IEEE%20Xplore',
 'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83',
 'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83',
 'https://www.ieee.org/security-privacy.html',
 'https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=83',
 'https://ieeexplore.ieee.org/Xplore/home.jsp',
 "javascript:Modal.show('/xpl/mwInstSignIn.jsp')",
 'https://ieeexplore.ieee.org/browse/books/title/',
 'https://ieeexplore.ieee.org/browse/conferences/title/',
 'https://ieeexplore.ieee.org/xpl/courses.jsp',
 'https://ieeexplore.ieee.org/browse/periodicals/title/',
 'https://ieeexplore.ieee.org/browse/standards/collection/ieee',
 'https://ieeex