In [1]:
%matplotlib inline
import csv
from datetime import datetime
from urlparse import urlparse
import json
import snap

In [2]:
def extractDomain(rawURL):
	parsedUrl = urlparse(rawURL)
	if parsedUrl.netloc.startswith('www.'):
		return parsedUrl.netloc[4:]
	return parsedUrl.netloc

def isUninteresting(url):
    if 'facebook' in url:
        return True
    elif 'instagram' in url:
        return True
    elif 'twitter' in url:
        return True
    elif 'google' in url:
        return True
    elif 'youtube' in url:
        return True
    elif url == 'https://t.co/':
        return True
    elif 'bit.ly' in url:
        return True
    else:
        return False

In [3]:
LinkGraph = snap.TNGraph.New()

currID = 0
domainToNodeID = {}
IDtoDomain = {}
DomainCounts = {}
years = {}
months = {}

progress = 0

for i in [1, 2, 3, 5]: 
    min_date = -1
    max_date = -1
    with open("web-2016-09-links-clean-{}.txt".format(i)) as tsvfile:
        linkReader = csv.reader(tsvfile, delimiter='\t')
        for row in linkReader:
            fromDomain = extractDomain(row[0])
            
            if isUninteresting(fromDomain):
            	continue
            if fromDomain not in domainToNodeID:
            	domainToNodeID[fromDomain] = currID
                IDtoDomain[currID] = fromDomain
            	LinkGraph.AddNode(currID)
            	currID += 1

            uniqueToDomains = set()

            for link in row[2:]:
                try:
                    toDomain = extractDomain(link)
                    
                    if isUninteresting(toDomain):
                        continue
                    if toDomain not in DomainCounts: 
                        DomainCounts[toDomain] = 1
                    else:
                        DomainCounts[toDomain] += 1
                        
                    if toDomain not in domainToNodeID:
                        IDtoDomain[currID] = toDomain
                        domainToNodeID[toDomain] = currID
                        LinkGraph.AddNode(currID)
                        currID += 1
                    if toDomain not in uniqueToDomains:
                        LinkGraph.AddEdge(domainToNodeID[fromDomain],domainToNodeID[toDomain])
                        uniqueToDomains.add(toDomain)
                except ValueError:
                    pass

In [6]:
DomainCounts

{'http://s5m14.seesaa.net/': 2,
 'http://toscanoirriverente.tumblr.com/': 3,
 'http://www.bexhillobserver.net/': 37,
 'http://starcancer.tumblr.com/': 2,
 'http://www.davenportfamily.com/': 9,
 'http://www.ellos.fi/': 2,
 'http://photoboothz.wikia.com/': 1,
 'http://www.wildstar-central.com/': 3,
 'http://www.jacarebanguela.com.br/': 12,
 'http://papum-pare.classi4u.com/': 1,
 'http://www.EarthChanges.org/': 14,
 'http://www.comune.roma.it/': 6,
 'http://www.houseweb.com.tw/': 21,
 'http://artozinos.blogspot.com/': 1,
 'http://wirralglobe.co.uk/': 305,
 'https://www.hbbtv.org/': 2,
 'http://academy.hubspot.com/': 1,
 'http://www.ministryofsound.com/': 8,
 'http://notorious-yung-sushi.tumblr.com/': 2,
 'http://kovastudios.com/': 2,
 'http://bonryo.blog6.fc2.com/': 4,
 'http://www.josemarti.cu/': 1,
 'http://fortheloveoftech.com/': 5,
 'http://mostlycajun.com/': 14,
 'http://1.lifether.pay.clickbank.net/': 2,
 'https://ida.stampinup.com/': 1,
 'http://amaterials.exblog.jp/': 1,
 'http://

In [25]:
import operator
TopDomainCounts = list(sorted(DomainCounts.iteritems(), key=operator.itemgetter(1), reverse=True)[:1000])


In [61]:
DomainCounts['dailynews.com']

344

In [64]:
top_news_sources = ['abcnews.go.com', 'cbsnews.com', 'cnn.com', 'foxnews.com', 'msnbc.com', 'nytimes.com', 'latimes.com',\
                   'usatoday.com', 'wsj.com', 'washingtonpost.com', 'bloomberg.com', 'vice.com', 'huffingtonpost.com', 'npr.org']

In [96]:
len(top_news_sources)

14

In [None]:
conservative = ['foxnews.com', 'washingtonpost.com', 'theguardian.com', 'nypost.com', 'bloomberg.com', '']
liberal = ['nytimes.com', 'theatlantic.com', 'washingtonpost.com', 'huffingtonpost.com', 'vice.com', 'sfgate.com', '']
neutral = ['usatoday.com', 'cnn.com', 'bbc.com']

In [48]:
def printNeighborsAtHop(url, hops):
    node_id = domainToNodeID[url]
    NodeVec = snap.TIntV()
    snap.GetNodesAtHop(LinkGraph, node_id, hops, NodeVec, True)
    print NodeVec.Len()
    for item in NodeVec:
        domain = IDtoDomain[item]
        if domain in TopDomains:
            print domain

In [70]:
start_id = domainToNodeID["foxnews.com"]
results = []

for news_source in top_news_sources: 
    node_id = domainToNodeID[news_source]
    Length = snap.GetShortPath(LinkGraph, start_id, node_id)
    results.append((news_source, Length))
print results

[('abcnews.go.com', 2), ('cbsnews.com', 1), ('cnn.com', 1), ('foxnews.com', 0), ('msnbc.com', 2), ('nytimes.com', 1), ('latimes.com', 1), ('usatoday.com', 1), ('wsj.com', 1), ('washingtonpost.com', 1), ('bloomberg.com', 1), ('vice.com', 2), ('huffingtonpost.com', 1), ('npr.org', 2)]


In [71]:
start_id = domainToNodeID["nytimes.com"]
results = []

for news_source in top_news_sources: 
    node_id = domainToNodeID[news_source]
    Length = snap.GetShortPath(LinkGraph, start_id, node_id)
    results.append((news_source, Length))
print results

[('abcnews.go.com', 1), ('cbsnews.com', 1), ('cnn.com', 1), ('foxnews.com', 1), ('msnbc.com', 1), ('nytimes.com', 0), ('latimes.com', 1), ('usatoday.com', 1), ('wsj.com', 1), ('washingtonpost.com', 1), ('bloomberg.com', 1), ('vice.com', 1), ('huffingtonpost.com', 1), ('npr.org', 1)]


In [99]:
farthest_distance = -1
max_pair = ()
results = []

for news_source1 in top_news_sources:
    for news_source2 in top_news_sources: 
        if news_source1==news_source2:
            continue
        node_id1 = domainToNodeID[news_source1]
        node_id2 = domainToNodeID[news_source2]
        Length1 = snap.GetShortPath(LinkGraph, node_id1, node_id2)
        Length2 = snap.GetShortPath(LinkGraph, node_id2, node_id1)
        distance = max(Length1, Length2)
        results.append(((news_source1, news_source2), distance))

In [107]:
count = 0
for i in range(14):
    count += i
print count

91


In [102]:
count1 = 0
count2 = 0
count3 = 0

for result in results:
    if result[1] == 1:
        count1 += 1
    elif result[1] == 2:
        count2 += 1
    else:
        count3 += 1
    

In [104]:
results

[(('abcnews.go.com', 'cbsnews.com'), 2),
 (('abcnews.go.com', 'cnn.com'), 1),
 (('abcnews.go.com', 'foxnews.com'), 2),
 (('abcnews.go.com', 'msnbc.com'), 2),
 (('abcnews.go.com', 'nytimes.com'), 1),
 (('abcnews.go.com', 'latimes.com'), 2),
 (('abcnews.go.com', 'usatoday.com'), 1),
 (('abcnews.go.com', 'wsj.com'), 2),
 (('abcnews.go.com', 'washingtonpost.com'), 1),
 (('abcnews.go.com', 'bloomberg.com'), 2),
 (('abcnews.go.com', 'vice.com'), 2),
 (('abcnews.go.com', 'huffingtonpost.com'), 1),
 (('abcnews.go.com', 'npr.org'), 1),
 (('cbsnews.com', 'abcnews.go.com'), 2),
 (('cbsnews.com', 'cnn.com'), 1),
 (('cbsnews.com', 'foxnews.com'), 1),
 (('cbsnews.com', 'msnbc.com'), 2),
 (('cbsnews.com', 'nytimes.com'), 1),
 (('cbsnews.com', 'latimes.com'), 1),
 (('cbsnews.com', 'usatoday.com'), 1),
 (('cbsnews.com', 'wsj.com'), 1),
 (('cbsnews.com', 'washingtonpost.com'), 1),
 (('cbsnews.com', 'bloomberg.com'), 2),
 (('cbsnews.com', 'vice.com'), 2),
 (('cbsnews.com', 'huffingtonpost.com'), 1),
 (('

In [None]:
farthest_distance = -1
max_pair = ()
common_neighbors = []

for news_source1 in top_news_sources:
    for news_source2 in top_news_sources: 
        if news_source1 == news_source2:
            continue
        node_id1 = domainToNodeID[news_source1]
        node_id2 = domainToNodeID[news_source2]
        Nbrs = snap.TIntV()
        snap.GetCmnNbrs(LinkGraph, node_id1, node_id2, Nbrs)
        
        distance = Nbrs.Len()
        
        NodeVec = snap.TIntV()
        snap.GetNodesAtHop(LinkGraph, node_id1, 1, NodeVec, False)
        count1 = NodeVec.Len() 
        NodeVec = snap.TIntV()
        snap.GetNodesAtHop(LinkGraph, node_id2, 1, NodeVec, False)
        count2 = NodeVec.Len()
        denominator = min(count1, count2)        
        common_neighbors.append(((news_source1, news_source2), distance/float(denominator)))

In [93]:
common_neighbors

[(('abcnews.go.com', 'cbsnews.com'), 0.3514644351464435),
 (('abcnews.go.com', 'cnn.com'), 0.4726962457337884),
 (('abcnews.go.com', 'foxnews.com'), 0.2883959044368601),
 (('abcnews.go.com', 'msnbc.com'), 0.4117647058823529),
 (('abcnews.go.com', 'nytimes.com'), 0.5665529010238908),
 (('abcnews.go.com', 'latimes.com'), 0.31399317406143346),
 (('abcnews.go.com', 'usatoday.com'), 0.3703071672354949),
 (('abcnews.go.com', 'wsj.com'), 0.3447098976109215),
 (('abcnews.go.com', 'washingtonpost.com'), 0.5546075085324232),
 (('abcnews.go.com', 'bloomberg.com'), 0.30204778156996587),
 (('abcnews.go.com', 'vice.com'), 0.18592964824120603),
 (('abcnews.go.com', 'huffingtonpost.com'), 0.40955631399317405),
 (('abcnews.go.com', 'npr.org'), 0.27474402730375425),
 (('cbsnews.com', 'abcnews.go.com'), 0.3514644351464435),
 (('cbsnews.com', 'cnn.com'), 0.4560669456066946),
 (('cbsnews.com', 'foxnews.com'), 0.29916317991631797),
 (('cbsnews.com', 'msnbc.com'), 0.36764705882352944),
 (('cbsnews.com', 'nyt

In [95]:
from operator import itemgetter

print max(common_neighbors,key=itemgetter(1))
print min(common_neighbors,key=itemgetter(1))

(('msnbc.com', 'nytimes.com'), 0.6985294117647058)
(('msnbc.com', 'vice.com'), 0.1323529411764706)


In [113]:
farthest_distance = -1
max_pair = ()
common_neighbors = []

for news_source1 in top_news_sources:
    for news_source2 in top_news_sources: 
        if news_source1 == news_source2:
            continue
        node_id1 = domainToNodeID[news_source1]
        node_id2 = domainToNodeID[news_source2]
        
        NodeVec1 = snap.TIntV()
        snap.GetNodesAtHop(LinkGraph, node_id1, 2, NodeVec1, False)
        count1 = NodeVec1.Len() 
        
        NodeVec2 = snap.TIntV()
        snap.GetNodesAtHop(LinkGraph, node_id2, 2, NodeVec2, False)
        count2 = NodeVec2.Len()
        
        nodes_1 = []
        nodes_2 = []
        for item in NodeVec1:
            nodes_1.append(item)
        for item in NodeVec2: 
            nodes_2.append(item)
            
        intersection = len(list(set(nodes_1) & set(nodes_2)))
        
        denominator = min(len(nodes_1), len(nodes_2))        
        common_neighbors.append(((news_source1, news_source2), intersection/float(denominator)))

In [114]:
print max(common_neighbors,key=itemgetter(1))
print min(common_neighbors,key=itemgetter(1))

(('msnbc.com', 'latimes.com'), 0.9641341980787048)
(('wsj.com', 'npr.org'), 0.7527530641448392)


In [121]:
common_neighbors = sorted(common_neighbors, key=itemgetter(1), reverse=True)
for result in common_neighbors:
    if result[0][1] == 'nytimes.com': 
        print result

(('msnbc.com', 'nytimes.com'), 0.9328278148924758)
(('cbsnews.com', 'nytimes.com'), 0.9288963860348617)
(('cnn.com', 'nytimes.com'), 0.92171736798744)
(('abcnews.go.com', 'nytimes.com'), 0.9216555022240976)
(('vice.com', 'nytimes.com'), 0.9144191426918875)
(('npr.org', 'nytimes.com'), 0.9087911404218254)
(('foxnews.com', 'nytimes.com'), 0.8958316457951949)
(('latimes.com', 'nytimes.com'), 0.8868552903035661)
(('wsj.com', 'nytimes.com'), 0.8832681391723772)
(('usatoday.com', 'nytimes.com'), 0.8777611856192945)
(('bloomberg.com', 'nytimes.com'), 0.8694601469756925)
(('washingtonpost.com', 'nytimes.com'), 0.8577147445738174)
(('huffingtonpost.com', 'nytimes.com'), 0.8434992079389393)


In [120]:
for result in common_neighbors:
    if result[0][1] == 'foxnews.com': 
        print result

(('npr.org', 'foxnews.com'), 0.7932347271048326)
(('latimes.com', 'foxnews.com'), 0.8052229980397557)
(('cnn.com', 'foxnews.com'), 0.813501385131304)
(('huffingtonpost.com', 'foxnews.com'), 0.8171950686085505)
(('wsj.com', 'foxnews.com'), 0.8183614949697863)
(('abcnews.go.com', 'foxnews.com'), 0.8360321395291594)
(('cbsnews.com', 'foxnews.com'), 0.8437526623387688)
(('usatoday.com', 'foxnews.com'), 0.8507136261279505)
(('bloomberg.com', 'foxnews.com'), 0.8537268942278096)
(('nytimes.com', 'foxnews.com'), 0.8958316457951949)
(('washingtonpost.com', 'foxnews.com'), 0.9007403567320621)
(('vice.com', 'foxnews.com'), 0.9025085152417471)
(('msnbc.com', 'foxnews.com'), 0.9306578241576047)


In [None]:
keys = IDtoDomain.keys()

key_list = []

for i in range(len(keys)):
    for j in range(i + 1, len(keys)): 
        node_1 = keys[i]
        node_2 = keys[j]
        Length1 = snap.GetShortPath(LinkGraph, node_1, node_2)
        Length2 = snap.GetShortPath(LinkGraph, node_2, node_1)
        min_length = min(Length1, Length2)
        max_length = max(Length1, Length2)
        key_list.append( ((node_1, node_2), min_length, max_length))