# spider

In [5]:
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Seed page for a fresh crawl; it is only inserted when the database has no
# page waiting to be fetched (see the initialisation step further down).
start_url = 'https://www.bio.purdue.edu/'

# SSL context with hostname checking and certificate verification turned off,
# so pages served with invalid or self-signed certificates can still be fetched.
# NOTE(review): this also removes protection against man-in-the-middle attacks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Open (or create) the crawl database in the current working directory.
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Pages: one row per known URL. A row with html NULL and error NULL has not
# been fetched yet; new rows are created with new_rank = 1.0.
# NOTE(review): old_rank/new_rank are presumably consumed by a separate
# ranking step that is not part of this file - confirm.
cur.execute('''CREATE TABLE IF NOT EXISTS Pages (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE,
    html TEXT,
    error INTEGER,
    old_rank REAL,
    new_rank REAL)''')

# Webs: URL prefixes that bound the crawl; only links starting with one of
# these prefixes are followed.
cur.execute('''CREATE TABLE IF NOT EXISTS Webs 
    (url TEXT UNIQUE)
''')

# Links: directed edges between Pages ids (from_id links to to_id), stored
# at most once per pair.
cur.execute('''CREATE TABLE IF NOT EXISTS Links (
    from_id INTEGER,
    to_id INTEGER,
    UNIQUE (from_id, to_id))
''')

###
def url_end_trim(start_url):
    """Return the site prefix derived from *start_url*.

    Surrounding spaces and slashes are stripped. If what remains names an
    ``.htm``/``.html`` document, the last path segment is dropped so the
    result is the directory that contains the document.

    Args:
        start_url: URL to normalise.

    Returns:
        The trimmed URL string. A bare file name that contains no ``/`` is
        returned unchanged because there is no directory to fall back to.
    """
    web = start_url.strip(' /')
    if web.endswith(('.htm', '.html')):
        # rpartition reports a missing '/' through an empty separator;
        # slicing at rfind()'s -1 would instead chop off the last character.
        head, sep, _ = web.rpartition('/')
        if sep:
            web = head
    return web

###
def enter_number():
    """Prompt until the user types a non-negative integer, then return it.

    Zero is a valid answer (the prompt advertises it as "quit"); negative
    numbers and non-numeric text are rejected and the prompt is repeated.

    Returns:
        The entered value as an ``int`` that is >= 0.
    """
    while True:
        answer = input('How many pages (0 = quit):')
        try:
            many = int(answer)
        except ValueError:
            # not a number at all: treat it like any other rejected value
            many = -1
        if many >= 0:
            return many
        print('Invalid input. Please enter an integer >= 0.')

###
def href_filter(href, url):
    """Normalise a raw ``href`` found on page *url*, or reject it.

    Relative references are resolved against *url*, the ``#fragment`` part
    is removed and a single trailing ``/`` is dropped so that equivalent
    links map to the same string.

    Args:
        href: Value of an anchor's ``href`` attribute; ``None`` when the
            anchor has no such attribute.
        url: URL of the page the anchor was found on.

    Returns:
        The normalised absolute URL, or ``None`` when *href* is missing,
        cannot be parsed, or points at an image file.
    """
    if href is None:
        return None
    try:
        if len(urlparse(href).scheme) < 1:
            # no scheme: the reference is relative to the current page
            href = urljoin(url, href)
    except ValueError:
        # urllib rejects malformed references (e.g. an unbalanced '[')
        return None
    href = href.partition('#')[0]
    if href.endswith('/'):
        href = href[:-1]
    # The original tested '.png' twice; the common image types are listed
    # once each here.
    if href.endswith(('.png', '.jpg', '.jpeg', '.gif')):
        return None
    return href

###
def in_webs_range(href, webs):
    """Tell whether *href* belongs to one of the sites the crawl is limited to.

    Args:
        href: Absolute URL to test.
        webs: Collection of allowed URL prefixes.

    Returns:
        True when *href* starts with at least one prefix in *webs*; False
        otherwise, including when *webs* is empty (an error line is printed
        in that case).
    """
    if len(webs) == 0:
        print('Error: no web range defined.')
        return False
    return any(href.startswith(prefix) for prefix in webs)

# Initialize: resume when some page is still waiting to be fetched,
# otherwise seed the crawl from start_url.
cur.execute('''SELECT id, url FROM Pages WHERE html IS NULL AND error IS NULL ORDER BY RANDOM() LIMIT 1''')
row = cur.fetchone()
if row is None:
    # nothing pending: register the start page and the site prefix it belongs to
    start_url = start_url.strip()
    web = url_end_trim(start_url)
    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES (?)', (web, ))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (start_url, ))
        conn.commit()  # persist the seed rows
else:
    print('Restarting existing crawl. Remove the sqlite file to start a new crawl.')

# Load every allowed site prefix; each result row is a 1-tuple (url,).
webs = [str(record[0]) for record in cur.execute('SELECT url FROM Webs')]

# Show the prefixes the crawl is restricted to.
print(webs)
# printing all the (unfinished) webs in the database

# main
# Crawl loop: ask how many pages to fetch, then for each one download it,
# store its HTML and record every in-range outgoing link.
while True:
    many = enter_number()
    print(many)
    if many < 1:
        break

    while many > 0:
        # Pick a random page that has neither been fetched nor marked as failed.
        cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
        row = cur.fetchone()
        if row is None:
            print('No un-retrieved HTML Pages found.')
            many = 0
            break
        fromid, url = row
        print(f'{fromid: 3} {url}', end=' ')

        # Outgoing links are rebuilt from scratch for the page being fetched.
        cur.execute('DELETE FROM Links WHERE from_id = ?', (fromid,))
        try:
            # The context manager closes the HTTP response on every exit path.
            with urlopen(url, context=ctx) as doc:
                print(':-)', end=' ')
                status = doc.getcode()
                if status != 200:
                    print('Error on page:', status)
                    cur.execute('UPDATE Pages SET error=? WHERE url=?', (status, url))
                if doc.info().get_content_type() != 'text/html':
                    # Not a web page: drop it from the queue. This does not
                    # count towards the requested number of pages.
                    print('Ignore non text/html page.')
                    cur.execute('DELETE FROM Pages WHERE url=?', (url,))
                    conn.commit()
                    continue
                html = doc.read()
            print(f'({len(html)})', end=' ')
            soup = BeautifulSoup(html, "html.parser")
        except KeyboardInterrupt:
            # Ctrl-C stops the current batch and returns to the prompt.
            print('\nProgram interrupted by user...')
            break
        except Exception:
            # Any fetch/parse failure marks the page so it is not retried.
            print('Unable to retrieve or parse the page')
            cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ))
            conn.commit()
            continue

        # Make sure the fetched page itself is in the db, then store its content.
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (url, ))
        cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))
        conn.commit()

        # Walk all anchor tags and queue the links that stay inside the crawl range.
        count = 0
        for tag in soup('a'):
            href = tag.get('href')
            if href is None:
                # anchor without an href attribute: nothing to follow
                continue
            href = href_filter(href, url)
            if href is None:
                continue
            if not in_webs_range(href, webs):
                continue

            cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (href,))
            count += 1

            # Look up the id of the target page to record the edge.
            cur.execute('SELECT id FROM Pages WHERE url=?', (href,))
            row = cur.fetchone()
            if row is None:
                print('Could not retrieve id.')
                continue
            toid = row[0]
            cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES (?, ?)', (fromid, toid))

        # One commit per page persists the queued pages and all Links rows.
        conn.commit()
        print(f'{count} valid link(s).')
        many -= 1

# Flush anything still pending (e.g. after an interrupted batch) before closing.
conn.commit()
cur.close()

Restarting existing crawl. Remove the sqlite file to start a new crawl.
['https://serenesforest.net/wiki', 'https://serenesforest.net/three-houses', 'http://www.dr-chuck.com', 'https://www.bio.purdue.edu']
How many pages (0 = quit):1
1
 86 https://www.bio.purdue.edu/About/partnerships-centers.html :-) (30707) 68 valid link(s).
How many pages (0 = quit):0
0
