## NodeRank Algorithm
Includes code for requesting page content.

In [1]:
from bs4 import BeautifulSoup
import requests
requests.packages.urllib3.disable_warnings()

import random
import math
import string
import re
import numpy as np
import nltk
import networkx as nx
import json

def getUA():
    uastrings = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",\
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",\
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",\
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",\
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",\
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",\
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",\
                "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",\
                "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",\
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36"\
                ]

    return random.choice(uastrings)

def prepare_sentences(text):
    
    # Clean characters
    text = "".join([t for t in text if t.isalnum() or t in string.punctuation or t == ' '])
    
    # Get rid of whitespace characters
    sentences = nltk.sent_tokenize(text)
    text = " ".join([s.strip() for s in sentences])
    
    # Fix puctuation spacing
    text = re.sub(r"(\w{2,})([\.\!\?]+)(\w)", r"\1\2 \3", text)
    
    return text


def find_children(n1, n2):
    
    children1 = [c for c in n1.children]
    children2 = [c for c in n2.children]
    
    for i, children in enumerate([children1,children2]):
        level = 1

        while children:
            lchildren = children
            children = []
            for ch in lchildren:
                if not ch.name:
                    continue
                if ch == n1 or ch == n2:
                    if i == 0:
                        return len( nltk.sent_tokenize(n2.get_text()) ) * ( 1/math.exp(level) )
                    else:
                        return len( nltk.sent_tokenize(n1.get_text()) ) * ( 1/math.exp(level) )
                                        
                children.extend(ch.children)

            level += 1
            
    return 0


def find_content(html):
    
    # Get body and extract out non-content nodes
    soup = BeautifulSoup(html, 'lxml')
    body = soup.find('body')
    remove = [s.extract() for s in body(["script", "style","iframe","noscript","nav","footer","header", "svg", "h1","h2", "h3", "h4", "h5", "xml"])]
    text = ""
    
    # Further clean nodes and build a lookup list
    nodes = body.findAll()
    nodes = [n for n in nodes if n.name and n.get_text()]
    nodeix = [i for i,v in enumerate(nodes)]    

    # Build similarity matrix and use number of sentences child resursion depth as metric
    sim_mat = np.zeros([len(nodes), len(nodes)])
    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i != j:
                sim_mat[i][j] = find_children(nodes[i], nodes[j])

    # Run pagerank algorithm on matrix
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    
    # Sort nodex by best and get text of best node
    ranked_nodes = sorted(((scores[i],s) for i,s in enumerate(nodeix)), reverse=True)
    text = nodes[ranked_nodes[0][1]].get_text()
    
    return prepare_sentences(text)


def extract_content_noderank(url= None, html=None, timeout=10):
    if url:
        return find_content(extract_html(url, timeout))
    elif html:
        return find_content(html)
    else:
        raise Exception('Neither `url` nor `html` was suplied.')
    

def extract_html(url, timeout=10):
    headers = {'user-agent': getUA()}
    r = requests.get(url, headers = headers, verify=False, timeout=timeout)
    html = r.content
    return html

#print(extract_content_pagerank(url, timeout=10))

## Import Boilerpipe and Dragnet

In [2]:
from boilerpipe.extract import Extractor
from dragnet import extract_content

  from numpy.core.umath_tests import inner1d


## Sample URLs taken from Hacker News on 2/12/2019

In [3]:
articles = ["https://remysharp.com/2019/02/12/cern-day-1",
"https://www.nytimes.com/2019/02/12/magazine/climeworks-business-climate-change.html",
"https://kishorepv.github.io/The-value-of-Incremental_learning/",
"https://e360.yale.edu/digest/arborists-have-cloned-ancient-redwoods-from-their-massive-stumps",
"https://alexanderperrin.com.au/triangles/ballooning/",
"http://www.randomhacks.net/2005/10/11/amb-operator/",
"https://www.zdnet.com/article/microsoft-security-chief-ie-is-not-a-browser-so-stop-using-it-as-your-default/",
"https://github.com/Jeff-Ciesielski/synesthesia",
"https://www.nowpublishers.com/article/Details/RBE-0092",
"https://eng.uber.com/introducing-ludwig/",
"https://www.jasonhickel.org/blog/2019/2/3/pinker-and-global-poverty",
"https://www.nytimes.com/2019/02/11/health/artificial-intelligence-medical-diagnosis.html",
"https://www.cnbc.com/2019/02/12/google-facebook-apple-news-should-be-regulated-uk-government-report.html",
"https://techcrunch.com/2019/02/11/amazon-is-buying-home-mesh-router-startup-eero/",
"http://www.greatdisasters.co.uk/the-de-havilland-comet/",
"https://techcrunch.com/2019/02/11/google-docs-gets-an-api-for-task-automation/",
"https://ephemeralnewyork.wordpress.com/2019/02/11/the-bobbed-hair-bandit-on-the-run-in-brooklyn/",
"https://www.cbc.ca/news/technology/mars-one-bankrupt-1.5014522",
"https://medium.com/@shnatsel/how-rusts-standard-library-was-vulnerable-for-years-and-nobody-noticed-aebf0503c3d6",
"https://opensource.zalando.com/blog/2019/02/Open-Source-Harassment-Policy/",
"https://blog.parse.ly/post/7689/analyst-demystify-traffic-google-sends-publishers/",
"https://www.nytimes.com/2019/02/11/travel/northern-lights-tourism-in-sweden.html",
"https://techcrunch.com/2019/02/11/us-iphone-users-spent-79-last-year-up-36-from-2017/",
"https://blog.wolfram.com/2019/02/01/the-data-science-of-mathoverflow/",
"http://www.cat-bus.com/2017/12/gadgetbahn/",
"https://source.android.com/security/bulletin/2019-02-01.html"]

## Article Content Extractions for NodeRank, Dragnet, and BoilerPipe
One of the key components in the design of NodeRank was the ability to run on AWS Lambda using the restrictive environments of Amazon Linux AMIs. In addition, Dragnet is difficult, if not impossible, to compile on Windows due to dependencies on GNU and Boilerpipe relies on a dependency to the JDK.

In [5]:
results = {}
RENDER_ENDPOINT_AWS = 'https://xxxxxxxxxxxxxxx.xxxxxxxxxx.xxx/v1/'

for url in articles:
    
    print('Article:', url)
    rurl = RENDER_ENDPOINT_AWS + url
    html = ""
    
    try:
        html = extract_html(rurl, timeout=60)
        if "Internal server error" in html:
            raise Exception('Internal server error')
    except:
        html = extract_html(url, timeout=60)
    
    results[url] = {}
    
    # NodeRank
    print('NodeRank:')
    try:
        noderank_content = extract_content_noderank(html=html)
    except:
        print('Error')
        noderank_content = ""
    print(noderank_content)
    noderank_sentences = nltk.sent_tokenize(noderank_content)
    print('Sentences:',len(noderank_sentences))
    results[url]['noderank_sentences'] = len(noderank_sentences)
    print('\n')
    
    # Dragnet
    print('Dragnet:')
    try:
        dragnet_content = prepare_sentences(extract_content(html))
    except:
        print('Error')
        dragnet_content = ""
    dragnet_sentences = nltk.sent_tokenize(dragnet_content)
    print(dragnet_content)
    print('Sentences:',len(dragnet_sentences))
    results[url]['dragnet_sentences'] = len(dragnet_sentences)
    print('\n')  
    
    # NodeRank
    print('Boilerpipe:')
    try:
        boilerpipe_content = prepare_sentences(Extractor(extractor='ArticleExtractor', html=html).getText())
    except:
        print('Error')
        boilerpipe_content = ""
    boilerpipe_sentences = nltk.sent_tokenize(boilerpipe_content)
    print(boilerpipe_content)
    print('Sentences:',len(boilerpipe_sentences))
    results[url]['boilerpipe_sentences'] = len(boilerpipe_sentences)
    print('\n')
    


Article: https://remysharp.com/2019/02/12/cern-day-1
NodeRank:
This marks the beginning of a week long adventure in Geneva Switzerland at CERN, to work on a hackproject. The project is to rebuilding the very first web browser, aptly called WorldWideWeb (though shortly thereafter being renamed to Nexus, sincethe whole world wide web thing being a bigger deal). This browser was written by Sir Tim Berners-Lee in 1990 and the project marks the 30th anniversary of theweb. This event also reunites most of the team that made up the 2013 hack project to recreate the Line Mode Browser. On being asked if I was interested in returning, I jumped at the chance. It's CERN. There's some proper smarties rolling around here. Maybe some of that will rub off onme! The project is a quasi historical restoration mixed with simulation as we bring the original browser to the public via modern technology, specifically and ironically, via today'sbrowsers. The first day is always a lot of finding our feet. Tryin

## Raw Counts

In [6]:
print(json.dumps(results, indent=4, sort_keys=True))

{
    "http://www.cat-bus.com/2017/12/gadgetbahn/": {
        "boilerpipe_sentences": 84,
        "dragnet_sentences": 83,
        "noderank_sentences": 341
    },
    "http://www.greatdisasters.co.uk/the-de-havilland-comet/": {
        "boilerpipe_sentences": 164,
        "dragnet_sentences": 163,
        "noderank_sentences": 163
    },
    "http://www.randomhacks.net/2005/10/11/amb-operator/": {
        "boilerpipe_sentences": 73,
        "dragnet_sentences": 29,
        "noderank_sentences": 18
    },
    "https://alexanderperrin.com.au/triangles/ballooning/": {
        "boilerpipe_sentences": 0,
        "dragnet_sentences": 0,
        "noderank_sentences": 1
    },
    "https://blog.parse.ly/post/7689/analyst-demystify-traffic-google-sends-publishers/": {
        "boilerpipe_sentences": 86,
        "dragnet_sentences": 83,
        "noderank_sentences": 83
    },
    "https://blog.wolfram.com/2019/02/01/the-data-science-of-mathoverflow/": {
        "boilerpipe_sentences": 28,
     