# spider
A page crawler

In [2]:
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

start_url = 'https://www.bio.purdue.edu/'

# Security bridge
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''CREATE TABLE IF NOT EXISTS Pages (
    id INTEGER PRIMARY KEY,
    url TEXT UNIQUE,
    html TEXT,
    error INTEGER,
    old_rank REAL,
    new_rank REAL)''')

cur.execute('''CREATE TABLE IF NOT EXISTS Webs 
    (url TEXT UNIQUE)
''')

cur.execute('''CREATE TABLE IF NOT EXISTS Links (
    from_id INTEGER,
    to_id INTEGER,
    UNIQUE (from_id, to_id))
''')

###
def url_end_trim(start_url):
    web = start_url.strip(' /')
    # strip blank and /
    if web.endswith('.htm') or web.endswith('.html'):
        pos = web.rfind('/')
        # find in reverse the last / sign
        web = web[:pos]
    return web

###
def enter_number():
    while True:
        many = input('How many pages (0 = quit):')
        # print('in', type(many), many)
        try:
            many = int(many)
            if many >= 0:
                break
            # only accepting number > 0
        except:
            pass
        print('Invalid input. Please enter an integer >= 1.')
    return many

###
def href_filter(href, url):
    up = urlparse(href)
    if len(up.scheme) <1:
        href = urljoin(url, href)
        # if no scheme, join the original url with the new href path (href being a subunit of url)
    ipos = href.find('#')
    if ipos > 1:
        href = href[:ipos]
    if href.endswith('/'):
        href = href[:-1]
    if href.endswith('.png') or href.endswith('.jpg') or href.endswith('.png'):
        href = None
#     if len(href) < 1:
#         href = None
    # return None for invalid href
    return href

###
def in_webs_range(href, webs):
    # define if the page inside the set of webs we want to limit our search to
    # return True if in range, false otherwise
    if len(webs) < 1:
        print('Error: no web range defined.')
        return False
    for web in webs:
        if href.startswith(web):
            return True
    return False

# Initialize
cur.execute('''SELECT id, url FROM Pages WHERE html IS NULL AND error IS NULL ORDER BY RANDOM() LIMIT 1''')
row = cur.fetchone()
# print(f'{type(row)} :::{row}::')
if row is not None:
    # if databse is not empty
    print('Restarting existing crawl. Remove the sqlite file to start a new crawl.')
else:
    start_url = start_url.strip()
    web = url_end_trim(start_url)
    # formating the web url
    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES (?)', (web, ))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (start_url, ))
        conn.commit() # write to the hard drive
 
# set up webs list
webs = []
cur.execute('SELECT url FROM Webs')
for row in cur:
#     print(row) # each row is a tuple of 1 element (url, )
    webs.append(str(row[0]))
    
print(webs)
# printing all the (unfinished) webs in the database

# main
while True:
#     print(f'Loop1:{row}')
    many = enter_number()
    print(many)
    if many < 1:
        break
    
    while many > 0:
#         print(f'{many} times left')
#         print(f'Loop2a:{row}')
        cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
        try:
#             print(f'Loop2b:{row}')
            row = cur.fetchone()
#             print(f'Loop2c:{row}')
            fromid = row[0]
            url = row[1]
        except:
            print('No un-retrieved HTML Pages found.')
            many = 0
            break
        print(f'{fromid: 3} {url}', end=' ')
        
        # now we already have the url and id of the page
        cur.execute('DELETE FROM Links WHERE from_id = ?', (fromid,))
        try:
            doc = urlopen(url, context=ctx)
            print(':-)', end=' ')
            status = doc.getcode()
            # get status code for request
            if status != 200:
                print('Error on page:', doc.getcode())
                cur.execute('UPDATE Pages SET (error=?) WHERE url=?)', (status, url))
            if doc.info().get_content_type() != 'text/html':
                print('Ignore non text/html page.')
                cur.execute('DELETE FROM Pages WHERE url=?', (url,))
                conn.commit()
                continue 
            html = doc.read()
            print(f'({len(html)})', end=' ')
            soup = BeautifulSoup(html, "html.parser")
        except KeyboardInterrupt:
            print('\nProgram interrupted by user...')
            break
        except:
            print('Unable to retrieve or parse the page')
            cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ))
            conn.commit()
            continue
            
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (start_url, )) 
        # make sure the page is in the db
        cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))
        # update the html content of the page
        conn.commit()
        
        # retrieve all of the anchor tags
        tags = soup('a')
        count = 0
        for tag in tags:
            href = tag.get('href', None)
            href = href_filter(href, url)
            # if href is None or not passing the filter
            if href is None:
                continue
            # if href is not in webs range
            if in_webs_range(href, webs) is False:
                continue
                
            cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES (?, NULL, 1.0)', (href,))
            count += 1
            conn.commit()
            
            # retrieve the to_id
            cur.execute('SELECT id FROM Pages WHERE url=?', (href,))
            try:
                row = cur.fetchone()
                toid = row[0]
            except:
                print('Could not retrieve id.')
                continue
            # insert into the Links table    
            cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES (?, ?)', (fromid, toid))
                
            
        print(f'{count} valid link(s).')    
        many -= 1
        if many < 1:
            print('Job complete!')

cur.close()

Restarting existing crawl. Remove the sqlite file to start a new crawl.
['https://serenesforest.net/wiki', 'https://serenesforest.net/three-houses', 'http://www.dr-chuck.com', 'https://www.bio.purdue.edu']
How many pages (0 = quit):6
6
 2780 https://www.bio.purdue.edu/lab/leung/blog/?feed=rss2&p=1836 :-) Ignore non text/html page.
 1618 https://www.bio.purdue.edu/lab/leung/blog/?tag=dmd :-) (25025) 45 valid link(s).
 1046 https://www.bio.purdue.edu/resources/illustration/eco/files_images/lucas/2006.Chapter%2013%20-%20Lucas%20&%20%20Freeberg%20final.REAL.pdf :-) Ignore non text/html page.
 2334 https://www.bio.purdue.edu/bionews/articles/documents/articles/2019/Mattoo 4 12 19.html Unable to retrieve or parse the page
 1497 https://www.bio.purdue.edu/lab/leung/blog/?tag=microscopy :-) (36624) 63 valid link(s).
 2301 https://www.bio.purdue.edu/bionews/articles/Academic/People/alumni/articles/2019/deng-research-neutrophil-migration.html :-) (68021) 94 valid link(s).
 1486 https://www.bio.p

# Sprank
A ranking algorithm
>Basic idea: 
- loop(new rank = old rank + sum(inbound link contributions)); 
- Inbound link contribution = inbound page rank / links on inbound page.
- Evaporation corrsction = sum(abs(old rank - new rank)) / no of pages

Since my database (from bio.purdue) contains many one-sided links, the evediff doesn't seem to decrease

In [1]:
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

###
def enter_number():
    while True:
        many = input('How many pages (0 = quit):')
        # print('in', type(many), many)
        try:
            many = int(many)
            if many >= 0:
                break
            # only accepting number > 0
        except:
            pass
        print('Invalid input. Please enter an integer >= 1.')
    return many

# Find the ids that send out page rank - we only are interested
# in pages in the SCC that have in and out links
cur.execute('SELECT DISTINCT from_id FROM Links')
fids = []
for row in cur:
    fids.append(row[0])

# Find the ids that receive page rank
cur.execute('SELECT DISTINCT from_id, to_id FROM Links')
tids = []
links = []
# only look for: !) valid from_id 2) not self linking 3) also being linked
for row in cur:
    if row[0] in fids:
        if row[0] != row[1]:
            if row[1] in fids:
                links.append(row)
                if row[1] not in tids:
                    tids.append(row[1])
# check the result
# print(tids)

# Get latest page ranks for strongly connected component
oranks= dict()
for fid in fids:
    cur.execute('SELECT new_rank FROM Pages WHERE id=?', (fid,))
    row = cur.fetchone()
    oranks[fid] = row[0]
 
many = enter_number()
if many == 0:
    quit()

# Lets do Page Rank in memory so it is really fast
for i in range(many):
    # print prev_ranks.items()[:5]
    nranks= dict()
    total = 0.0
    for (fid, rank) in list(oranks.items()):
        total += rank
        nranks[fid] = 0.0
#     print('Old total:', total)
    
    # Find the number of outbound links and sent the page rank down each
    for (fid, rank) in list(oranks.items()):
        outlinks = []
        for (from_id, to_id) in links:
            if from_id == fid:
                if to_id in tids:
                    outlinks.append(to_id)
        n = len(outlinks) # no of out links
#         print(f'{fid}:: {n} outbound links.')
        if n > 0:
            diff = rank / n
            for tid in outlinks:
                nranks[tid] = nranks[tid] + diff
    
    # calculate new total
    newtot = 0.0
    for (fid, rank) in list(nranks.items()):
        newtot += rank
    
    # compute the evaporation value => keep the total unchanged
    evap = (total - newtot)/ len(nranks) 
    for fid in nranks:
        nranks[fid] = nranks[fid] + evap
        # correct each term by evap to avoid local trapping
        
#     # calculate new total
#     newtot = 0.0
#     for (fid, rank) in list(nranks.items()):
#         newtot += rank
#     print('New total:', total)
        
    # Compute the per-page average change from old rank to new rank
    # As indication of convergence of the algorithm
    totaldiff = 0
    for (fid, old) in list(oranks.items()):
        diff = abs(nranks[tid] - old)
        totaldiff += diff
    avediff = totaldiff / len(nranks)
    print(f'{i+1} - {avediff}')
    
    # ratate values
    oranks = nranks

# Put the final ranks back into the database
print(list(nranks.items())[:6]) 
cur.execute('UPDATE Pages SET old_rank=new_rank')
for (fid, rank) in list(nranks.items()):
    cur.execute('UPDATE Pages SET new_rank=? WHERE id=?', (rank, fid))
conn.commit()
# close
cur.close()

How many pages (0 = quit):1
1 - 1.2097475291280309
[(3, 0.00107752559374563), (4, 0.3030108524614837), (6, 0.10824243064463338), (7, 0.10824243064463338), (8, 0.27043564965607847), (13, 0.13204101370982219)]


# Spreset

In [2]:
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''')
conn.commit()

cur.close()

print("All pages set to a rank of 1.0")

All pages set to a rank of 1.0


# Spdump

In [2]:
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 
     FROM Pages JOIN Links ON Pages.id = Links.to_id
     WHERE html IS NOT NULL
     GROUP BY id ORDER BY inbound DESC''')

count = 0
for row in cur :
    if count < 20 : print(row)
    count = count + 1
print(count, 'rows.')
cur.close()

(450, 10.894714562863069, 10.861087065802701, 42, 'https://www.bio.purdue.edu')
(411, 10.35736889253767, 10.374112810411978, 43, 'https://www.bio.purdue.edu/index.php')
(411, 10.321220329015757, 10.337896496679763, 80, 'https://www.bio.purdue.edu/life-sciences-postdoc/index.html')
(410, 10.289932338348184, 10.306544006338136, 44, 'https://www.bio.purdue.edu/About/index.php')
(410, 10.289932338348184, 10.306544006338136, 46, 'https://www.bio.purdue.edu/About/facts.html')
(410, 10.289932338348184, 10.306544006338136, 47, 'https://www.bio.purdue.edu/About/contact.html')
(410, 10.289932338348184, 10.306544006338136, 48, 'https://www.bio.purdue.edu/About/diversity.html')
(410, 10.289932338348184, 10.306544006338136, 66, 'https://www.bio.purdue.edu/About/biology-research-areas.html')
(410, 10.400073256702127, 10.416898596257333, 67, 'https://www.bio.purdue.edu/bio-education/index.html')
(410, 10.406818820086418, 10.423656813182879, 68, 'https://www.bio.purdue.edu/cell/index.html')
(410, 10.4

# Spjson

In [1]:
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

print("Creating JSON output on spider.js...")
howmany = int(input("How many nodes? "))

cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 
    FROM Pages JOIN Links ON Pages.id = Links.to_id
    WHERE html IS NOT NULL AND ERROR IS NULL
    GROUP BY id ORDER BY id,inbound''')

fhand = open('spider.js','w')
nodes = list()
maxrank = None
minrank = None
for row in cur :
    nodes.append(row)
    rank = row[2]
    if maxrank is None or maxrank < rank: maxrank = rank
    if minrank is None or minrank > rank : minrank = rank
    if len(nodes) > howmany : break

if maxrank == minrank or maxrank is None or minrank is None:
    print("Error - please run sprank.py to compute page rank")
    quit()

fhand.write('spiderJson = {"nodes":[\n')
count = 0
map = dict()
ranks = dict()
for row in nodes :
    if count > 0 : fhand.write(',\n')
    # print row
    rank = row[2]
    rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 
    # relative rank
    fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',')
    fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}')
    # row[0] is no of inbound nodes
    map[row[3]] = count # map the processing order
    ranks[row[3]] = rank # rank data
    count = count + 1
fhand.write('],\n')

cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
fhand.write('"links":[\n')

count = 0
for row in cur :
    # print row
    if row[0] not in map or row[1] not in map : continue
    if count > 0 : fhand.write(',\n')
    rank = ranks[row[0]]
    srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 
    fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}')
    count = count + 1
fhand.write(']};')
fhand.close()
cur.close()

print("Open force.html in a browser to view the visualization")

Creating JSON output on spider.js...
How many nodes? 39
Open force.html in a browser to view the visualization
