In [80]:
import requests
from lxml import html
import urllib
import time
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
import pickle
import numpy as np
filter_formats = ['.pdf', '.png', '.txt', '.svg', '.jpg', '.gz', '.md', '.zip']

filter_blacklist = ['t.co']

def grabDomainRoot(url):
    base_url = "{0.scheme}://{0.netloc}/".format(urllib.parse.urlsplit(url))
    
    if 'http' in base_url:
        try:
            base_url = [i for i in base_url.split('/') if len(i)>0]
            base_url = base_url[1]
        except:
            print ('NO BASE URL')
            return None
    #base_url = base_url.split('.')
    
    return base_url

#Need to explore with the least amount of filters because otherwise the crawler may get stuck in closed loops easily
#Can prune during graph construction instead by applying domain filters
def grabLinks(dom, filter_domains):
    
    links = [i for i in dom.xpath('//a/@href') if 'http' in i]
    new_links = links
    
    #Filter not html pages
    new_links = [i for i in new_links if True not in [extension in i for extension in filter_formats]]
    #Filter black list of web pages
    new_links = [i for i in new_links if True not in [web in i for web in filter_blacklist]]
    
    return new_links

In [116]:
def recursiveDescent(initial_html, current_depth, max_depth, graph, domains):

    if len(graph)>max_graph:
        print ('MAX GRAPH SIZE REACHED')
        return graph, domains
    
    connection = urllib.request.urlopen(initial_html, timeout=6)
    read_connect = connection.read()
    dom =  html.fromstring(read_connect)

    links = grabLinks(dom, domains)
    #print (links , current_depth, max_depth, n_calls, max_calls)
    for link in links:
        
        base_url = grabDomainRoot(link)
        
        if base_url in domains.keys(): 
            pass
        else: 
            domains[base_url]=1      
        
        print ('from:%s, to:%s, base_url:%s, depth:%d, max_depth:%d, graph_size:%d' %\
               (initial_html, link, base_url, current_depth, max_depth, len(graph)))
        
        if initial_html in graph.keys():
            
            connections = graph[initial_html].transpose()[0]
            if link in connections:
                print ('PATH EXISTS, PASSING')
                continue
    
        if initial_html not in graph.keys():
            graph[initial_html] = np.array([link, datetime.datetime.now()])

        else:
            graph[initial_html] = np.append(graph[initial_html], [link, datetime.datetime.now()])

        if current_depth+1>max_depth:
            print ('MAX DEPTH REACHED PASS')
            continue
        elif domains[base_url]>=max_domains:#this a tunable parameter
            print ('BASE URL EXCEEDED PASS')
            continue
        else:
            print ('DESCEND')
            domains[base_url]+=1
            recursiveDescent(link, current_depth+1, max_depth, graph, domains)

        time.sleep(0.1)
    
    return graph, domains

In [117]:
#graph = {}
#domains = {}

In [120]:
max_graph=10
max_domains=2
max_depth=3

In [121]:
t = recursiveDescent('http://www.nytimes.com', 0, max_depth, {}, {})

from:http://www.nytimes.com, to:http://www.nytimes.com/content/help/site/ie9-support.html, base_url:www.nytimes.com, depth:0, max_depth:3, graph_size:0
DESCEND
from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://nytimes.com, base_url:nytimes.com, depth:1, max_depth:3, graph_size:1
DESCEND
from:https://nytimes.com, to:http://www.nytimes.com/content/help/site/ie9-support.html, base_url:www.nytimes.com, depth:2, max_depth:3, graph_size:2
BASE URL EXCEEDED PASS
from:https://nytimes.com, to:http://cn.nytimes.com, base_url:cn.nytimes.com, depth:2, max_depth:3, graph_size:3
DESCEND
from:http://cn.nytimes.com, to:http://www.nytimes.com/, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:3
MAX DEPTH REACHED PASS
from:http://cn.nytimes.com, to:http://www.nytimes.com/es/, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:4
MAX DEPTH REACHED PASS
from:http://cn.nytimes.com, to:https://sso.nytcn.me/email/?source=top-right, base_url:sso.nytcn.me, depth:3, max

from:https://wordplay.blogs.nytimes.com, to:http://www.nytimes.com/content/help/site/ie9-support.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:4
MAX DEPTH REACHED PASS
from:https://wordplay.blogs.nytimes.com, to:https://www.nytimes.com/, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:5
MAX DEPTH REACHED PASS
from:https://wordplay.blogs.nytimes.com, to:https://www.nytimes.com/crosswords/index.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:5
MAX DEPTH REACHED PASS
from:https://wordplay.blogs.nytimes.com, to:https://www.nytimes.com/interactive/2017/crosswords/guide-crosswords.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:5
MAX DEPTH REACHED PASS
from:https://wordplay.blogs.nytimes.com, to:https://www.nytimes.com/interactive/2017/crosswords/guide-crosswords.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:5
MAX DEPTH REACHED PASS
from:https://wordplay.blogs.nytimes.com, to:https://www.nytimes.com/2017/06/01

from:https://lens.blogs.nytimes.com/, to:http://www.nytimes.com/content/help/site/ie9-support.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:5
MAX DEPTH REACHED PASS
from:https://lens.blogs.nytimes.com/, to:https://www.nytimes.com/, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:6
MAX DEPTH REACHED PASS
from:https://lens.blogs.nytimes.com/, to:https://www.nytimes.com/2018/03/22/lens/moments-big-and-small-in-vintage-photos.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:6
MAX DEPTH REACHED PASS
from:https://lens.blogs.nytimes.com/, to:https://www.nytimes.com/2018/03/22/lens/moments-big-and-small-in-vintage-photos.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:6
MAX DEPTH REACHED PASS
from:https://lens.blogs.nytimes.com/, to:https://www.nytimes.com/2018/03/21/lens/capturing-photos-of-corporate-office-life-in-1970s-america.html, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:6
MAX DEPTH REACHED PASS
from:https://

from:https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree, to:https://store.nytimes.com/collections/photography/collection-historical+, base_url:store.nytimes.com, depth:3, max_depth:3, graph_size:6
MAX DEPTH REACHED PASS
from:https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree, to:https://store.nytimes.com/collections/photography/collection-new-york+, base_url:store.nytimes.com, depth:3, max_depth:3, graph_size:7
MAX DEPTH REACHED PASS
from:https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&

from:https://myaccount.nytimes.com/mem/tnt.html, to:http://www.nytimes.com/, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:8
MAX DEPTH REACHED PASS
from:https://myaccount.nytimes.com/mem/tnt.html, to:http://www.nytimes.com, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:9
PATH EXISTS, PASSING
from:https://myaccount.nytimes.com/mem/tnt.html, to:https://www.nytimes.com/newsletters, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:9
MAX DEPTH REACHED PASS
from:https://myaccount.nytimes.com/mem/tnt.html, to:https://www.facebook.com/nytimes, base_url:www.facebook.com, depth:3, max_depth:3, graph_size:9
MAX DEPTH REACHED PASS
from:https://myaccount.nytimes.com/mem/tnt.html, to:https://twitter.com/nytimes/lists/nyt-journalists/members, base_url:twitter.com, depth:3, max_depth:3, graph_size:9
MAX DEPTH REACHED PASS
from:https://myaccount.nytimes.com/mem/tnt.html, to:https://www.nytimes.com/content/help/front.html, base_url:www.nytimes.com, depth:3, max_depth:

from:https://mobile.nytimes.com/, to:https://www.nytimes.com/es/?URI=http%3A%2F%2Fmobile.nytimes.com%2F, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:10
MAX DEPTH REACHED PASS
from:https://mobile.nytimes.com/, to:https://cn.nytimes.com/?URI=http%3A%2F%2Fmobile.nytimes.com%2F, base_url:cn.nytimes.com, depth:3, max_depth:3, graph_size:11
MAX DEPTH REACHED PASS
from:https://mobile.nytimes.com/, to:https://www.nytimes.com/subscriptions/Multiproduct/lp8HYKU.html?campaignId=6W74R&URI=http%3A%2F%2Fmobile.nytimes.com%2F, base_url:www.nytimes.com, depth:3, max_depth:3, graph_size:11
MAX DEPTH REACHED PASS
from:https://mobile.nytimes.com/, to:https://myaccount.nytimes.com/auth/login?URI=http%3A%2F%2Fmobile.nytimes.com%2F, base_url:myaccount.nytimes.com, depth:3, max_depth:3, graph_size:11
MAX DEPTH REACHED PASS
from:https://mobile.nytimes.com/, to:https://www.nytimes.com/subscriptions/Multiproduct/lp8HYKU.html?campaignId=6W74R&URI=http%3A%2F%2Fmobile.nytimes.com%2F, base_url:www.ny

from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://myaccount.nytimes.com/seg/, base_url:myaccount.nytimes.com, depth:1, max_depth:3, graph_size:11
BASE URL EXCEEDED PASS
from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://help.nytimes.com/hc/en-us/articles/115015385887-Contact-Us, base_url:help.nytimes.com, depth:1, max_depth:3, graph_size:11
DESCEND
MAX GRAPH SIZE REACHED
from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://www.nytimes.com/newsgraphics/2016/news-tips/, base_url:www.nytimes.com, depth:1, max_depth:3, graph_size:11
BASE URL EXCEEDED PASS
from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://myaccount.nytimes.com/mem/cancel.html, base_url:myaccount.nytimes.com, depth:1, max_depth:3, graph_size:11
BASE URL EXCEEDED PASS
from:http://www.nytimes.com/content/help/site/ie9-support.html, to:https://myaccount.nytimes.com/seg/report-missing-paper, base_url:myaccount.nytimes.com, depth:1, ma

In [113]:
len(t[0])

10

In [114]:
t[0].keys()

dict_keys(['http://www.nytimes.com', 'http://cn.nytimes.com', 'http://www.nytco.com', 'https://mobile.nytimes.com/', 'https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree', 'http://www.nytimes.com/content/help/site/ie9-support.html', 'https://wordplay.blogs.nytimes.com', 'https://myaccount.nytimes.com/mem/tnt.html', 'https://lens.blogs.nytimes.com/', 'http://spiderbites.nytimes.com'])

In [109]:
t[1].keys()

dict_keys(['mobile.nytimes.com', 'itunes.apple.com', 'www.nytimes.com', 'myaccount.nytimes.com', 'cn.nytimes.com', 'www.nytco.com', 'wordplay.blogs.nytimes.com', 'nytcnapps.oss-cn-hongkong.aliyuncs.com', 'sso.nytcn.me', 'lens.blogs.nytimes.com', 'timestalks.com', 'www.facebook.com', 'www.instagram.com', 'query.nytimes.com', 'play.google.com', 'store.nytimes.com', 'www.shi-magazine.com', 'help.nytimes.com', 'www.nytfilmclub.com', 'nyt.qualtrics.com', 'spiderbites.nytimes.com', 'nytimes.com', 'www.nytedu.com', 'eedition.nytimes.com', 'www.nytwineclub.com'])

In [85]:
graph.keys()

dict_keys(['http://www.nytimes.com', 'http://cn.nytimes.com', 'http://www.nytco.com', 'https://mobile.nytimes.com/', 'https://store.nytimes.com/?action=click&contentCollection=NYT%20Store&contentPlacement=2&module=SectionsNav&pgtype=Homepage&region=TopBar&t=qry542&utm_campaign=NYT-HP&utm_content=hp_browsetree&utm_medium=HPB&utm_source=nytimes&version=BrowseTree', 'http://www.nytimes.com/content/help/site/ie9-support.html', 'https://wordplay.blogs.nytimes.com', 'https://myaccount.nytimes.com/mem/tnt.html', 'https://lens.blogs.nytimes.com/', 'http://spiderbites.nytimes.com'])

In [115]:
domains.keys()

dict_keys(['travel.nytimes.com', 'itunes.apple.com', 'www.nytimes.com', 'myaccount.nytimes.com', 'cn.nytimes.com', 'www.nytco.com', 'wordplay.blogs.nytimes.com', 'nytcnapps.oss-cn-hongkong.aliyuncs.com', 'sso.nytcn.me', 'lens.blogs.nytimes.com', 'timestalks.com', 'www.facebook.com', 'www.instagram.com', 'mobile.nytimes.com', 'query.nytimes.com', 'play.google.com', 'store.nytimes.com', 'www.shi-magazine.com', 'help.nytimes.com', 'www.nytfilmclub.com', 'nyt.qualtrics.com', 'spiderbites.nytimes.com', 'nytimes.com', 'www.nytedu.com', 'investors.nytco.com', 'jobmarket.nytimes.com', 'twitter.com', 'eedition.nytimes.com', 'www.nytwineclub.com'])

In [13]:
pickle.dump(graph, open('graph_root_sina.pkl', 'wb'))
pickle.dump(domains, open('domains_root_sina.pkl', 'wb'))