In [1]:
%%capture
import sys
!{sys.executable} -m pip install py2neo;
import py2neo
import pandas as pd
from collections import defaultdict

In [2]:
# Handle to the Neo4j database at its HTTP endpoint; every query in this notebook goes through it.
NEO4J_URI = 'http://neo4j:7474'
graph = py2neo.Graph(NEO4J_URI)

# New York Times Comments
The dataset contains the comments made on articles published in The New York Times during January–May 2017 and January–April 2018.

See: https://www.kaggle.com/aashita/nyt-comments/home

## Data Summary

In [3]:
def yield_record(cursor):
    """Lazily yield each remaining record of ``cursor`` as a plain dict.

    Parameters
    ----------
    cursor
        Any object exposing ``next()`` whose return value provides ``data()``
        (e.g. the cursor returned by ``graph.run``).

    Yields
    ------
    The result of ``record.data()`` for every record, in cursor order.

    Notes
    -----
    Iteration ends when ``cursor.next()`` raises ``StopIteration`` (assumed to
    be how the cursor signals exhaustion, as py2neo cursors do). Any other
    exception propagates to the caller instead of silently truncating results.
    """
    while True:
        try:
            # Read from the cursor that was passed in, not from a notebook global.
            record = cursor.next()
        except StopIteration:
            return
        yield record.data()
            
# Total number of COMMENTED relationships; appended below as an extra summary row.
comment_count_record = graph.run("""MATCH ()-[r:COMMENTED]->() RETURN count(*)""").next().data()
Ncomments = comment_count_record['count(*)']

# Node count and degree statistics (mean, std. dev., median, min, max) per label set.
cur = graph.run("""MATCH (n)
RETURN
DISTINCT labels(n),
count(*) AS nNodes,
avg(size( (n)-[]-() ) ) as avgDeg,
stdev(size( (n)-[]-() ) ) as stdDeg,
percentileDisc(size( (n)-[]-() ), 0.5) as medDeg, 
min(size( (n)-[]-() ) ) as minDeg,
max(size( (n)-[]-() ) ) as maxDeg""")

summary_columns = ['Label', 'N', 'Avg. Degree', 'Std. Dev. Degree', 'Median Degree', 'Min. Degree', 'Max. Degree']
# One row per label set, values in the order the query returns them.
summary_rows = [list(record.values()) for record in yield_record(cur)]
# The relationship row only has a label and a count, so its degree columns stay empty.
summary_rows.append([['COMMENTED'], Ncomments])
pd.DataFrame(summary_rows, columns=summary_columns)

Unnamed: 0,Label,N,Avg. Degree,Std. Dev. Degree,Median Degree,Min. Degree,Max. Degree
0,[ARTICLE],9298,185.409228,315.736664,57.0,1.0,4996.0
1,[USER],301682,5.714411,21.186375,1.0,0.0,1571.0
2,[COMMENTED],1723935,,,,,


### What articles have the highest degree (i.e., most comments)?

In [4]:
# Select the five articles with the most incoming COMMENTED relationships, then
# return each article node together with its comment count.
cur = graph.run("""MATCH (a:ARTICLE)
WITH a, SIZE(()-[:COMMENTED]->(a)) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=()-[:COMMENTED]->(a)
RETURN a, count(p)""")

# Flatten every record into one row: the article's properties plus a 'Comments' column.
top_articles = [{**res['a'], 'Comments': res['count(p)']} for res in yield_record(cur)]

# Render webURL as a clickable link; the href value is closed by exactly one
# quote so the generated anchor is well-formed HTML.
pd.DataFrame(top_articles) \
    .sort_values('Comments', ascending=False) \
    .set_index('articleID') \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,Comments,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
58b0894195d0e0247463875e,4996,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,159386,Trump Intensifies Criticism of F.B.I. and Journalists,National,195.696,2017-02-24 19:27:53,Link
5912391b7c459f24986de9ab,4184,By MICHAEL D. SHEAR and MATT APUZZO,167890,Trump Fires Comey Amid Russia Inquiry,National,157.107,2017-05-09 21:48:03,Link
58ebb1437c459f24986d96ed,4014,By DANIEL VICTOR and MATT STEVENS,167890,"Man Is Dragged From a Full Jet, Stirring a Furor",Business,239.803,2017-04-10 16:22:22,Link
591a524d7c459f24986dfc28,3791,By DAVID BROOKS,167890,When a Child Is Leading The World,OpEd,165.153,2017-05-16 01:13:44,Link
5930616f7c459f24986e2e41,3709,By MICHAEL D. SHEAR,167890,Trump Abandoning Global Climate Accord,Foreign,171.479,2017-06-01 18:48:08,Link


### What users have the highest degree (i.e., most comments)?

In [5]:
# Select the five users with the most outgoing COMMENTED relationships, then
# return each user node together with its comment count.
cur = graph.run("""MATCH (u:USER)
WITH u, SIZE((u)-[:COMMENTED]->()) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=(u)-[:COMMENTED]->()
RETURN u, count(p)""")

# One row per user: the node's properties plus a 'Comments' column.
top_users = []
for res in yield_record(cur):
    user_row = dict(res['u'])
    user_row['Comments'] = res['count(p)']
    top_users.append(user_row)

top_users_df = pd.DataFrame(top_users)
top_users_df.sort_values('Comments', ascending=False).set_index('userID')

Unnamed: 0_level_0,Comments,connComponent,pagerank,userDisplayName,userLocation
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
61986282.0,1571,159386,0.15,Phyliss Dalmatian,"Wichita, Kansas"
17374907.0,1235,159386,0.15,Blackmamba,Il
47123844.0,1232,159386,0.15,Richard Luettgen,New Jersey
47112177.0,1174,159386,0.15,manfred m,Bolivia
37475504.0,1157,159386,0.15,John Doe,Johnstown


## How many subgraphs are there?

In [14]:
# Number of nodes per connComponent value, largest component first.
op = """MATCH (n) RETURN distinct(n.connComponent) as partition, count(*) as nNodes ORDER by nNodes DESC"""
cur = graph.run(op)
records = [record for record in yield_record(cur)]

# Each returned row is one distinct component, so the row count is the subgraph count.
n_subgraphs = len(records)
print('Subgraphs: {}'.format(n_subgraphs))

df = pd.DataFrame(records).set_index('partition')
df.head()

Subgraphs: 2369


Unnamed: 0_level_0,nNodes
partition,Unnamed: 1_level_1
159386,208774
167890,99666
150435,13
190309,6
217346,6


In [7]:
# Number of connected components that contain more than two nodes.
df['nNodes'].gt(2).sum()

35

In [42]:
# For every component with more than one node, tally how often each newDesk
# value occurs among that component's nodes.
partitions = {}
multi_node_components = df[df.nNodes > 1].index
for component_id in multi_node_components:
    op = "MATCH (n) WHERE n.connComponent = {} RETURN collect(n.newDesk)".format(component_id)
    cur = graph.run(op)
    # The query returns a single row holding the list of newDesk values.
    desks = cur.next().data()['collect(n.newDesk)']
    desk_counts = defaultdict(int)
    for desk in desks:
        desk_counts[desk] += 1
    partitions[component_id] = dict(desk_counts)

In [60]:
# One row per component, one column per newDesk value (counts; NaN where absent).
foo = pd.DataFrame(list(partitions.values()))
# Convert counts to per-component shares by dividing each row by its own total.
row_totals = foo.sum(axis=1)
bar = foo.div(row_totals, axis=0)
# Widen the display limit so all desk columns are shown.
pd.set_option('display.max_columns', 500)
bar.head()

Unnamed: 0,Arts&Leisure,Automobiles,BookReview,Business,Climate,Culture,Dining,EdLife,Editorial,Express,Foreign,Games,Insider,Investigative,Learning,Letters,Magazine,Metro,Metropolitan,NYTNow,National,NewsDesk,Obits,OpEd,Photo,Podcasts,Politics,RealEstate,Science,Smarter Living,Society,SpecialSections,Sports,Styles,Summary,SundayBusiness,TStyle,Travel,Unknown,Upshot,Video,Washington,Weekend,Well
0,0.008978,,0.007074,0.047885,0.004217,0.05251,0.03809,0.000816,0.03156,0.002585,0.052782,0.038226,0.014692,0.003673,0.059312,0.000544,0.037274,0.064889,0.006122,0.000544,0.06761,0.003945,0.002993,0.181608,0.005169,0.003537,0.002993,0.026391,0.03088,0.004217,0.000272,0.002449,0.032104,0.007074,0.000408,0.006122,0.001905,0.012379,0.009795,0.022854,0.000408,0.051422,0.014284,0.03741
1,0.002728,0.000546,0.005456,0.02946,,0.057283,0.045827,0.002728,0.052373,,0.065466,0.039825,0.02455,,0.049645,0.000546,0.060011,0.062739,0.006547,,0.072013,,,0.200764,,,,0.01473,0.031097,,,,0.050736,0.003819,0.000546,0.006547,,0.010911,0.016912,0.033279,,,0.016912,0.036007
2,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [62]:
from scipy import spatial

# Cosine similarity between the desk-share profiles of the first two components;
# desks missing from a component count as a share of zero.
first_profile = bar.iloc[0, :].fillna(0)
second_profile = bar.iloc[1, :].fillna(0)
1 - spatial.distance.cosine(first_profile, second_profile)


0.96413538651473352

## What are the most influential articles?

In [9]:
# The five ARTICLE nodes with the highest stored pagerank value.
cur = graph.run("""MATCH (n:ARTICLE)
RETURN n
ORDER by n.pagerank
DESC LIMIT 5""")

# Render webURL as a clickable link; the href value is closed by exactly one
# quote so the generated anchor is well-formed HTML.
pd.DataFrame([rec['n'] for rec in yield_record(cur)]) \
    .sort_values('pagerank', ascending=False) \
    .set_index('articleID') \
    .drop('connComponent', axis=1) \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
58e4d28e7c459f24986d87c9,By KATHERINE SCHULTEN,Our Eighth Annual Found Poem Student Contest,Learning,272.599,2017-04-05 11:18:34,Link
58ebb1437c459f24986d96ed,By DANIEL VICTOR and MATT STEVENS,"Man Is Dragged From a Full Jet, Stirring a Furor",Business,239.803,2017-04-10 16:22:22,Link
58b0894195d0e0247463875e,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,Trump Intensifies Criticism of F.B.I. and Journalists,National,195.696,2017-02-24 19:27:53,Link
5930616f7c459f24986e2e41,By MICHAEL D. SHEAR,Trump Abandoning Global Climate Accord,Foreign,171.479,2017-06-01 18:48:08,Link
591a524d7c459f24986dfc28,By DAVID BROOKS,When a Child Is Leading The World,OpEd,165.153,2017-05-16 01:13:44,Link


## Simple collaborative filtering

In [10]:
# Pick one ARTICLE node at random by ordering on a per-row random number.
cur = graph.run("""MATCH (n:ARTICLE) 
WITH n, rand() as r
ORDER BY r
RETURN n
LIMIT 1
""")
article = cur.next().data()['n']

# Show the chosen article; the href value is closed by exactly one quote so the
# generated anchor is well-formed HTML.
pd.DataFrame([article]).set_index('articleID').style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5a8b2f0e10f40f00018c1d8f,By ABBY GOODNOUGH,159386,The Heartburn Behind Free Care for Only Some,Science,2.58539,2018-02-19 20:09:41,Link


In [11]:
# Articles reachable from the chosen article through a shared USER node
# (article - user - other article), excluding the chosen article itself.
# An article appears once per connecting path, so repeats measure overlap.
cur = graph.run("""MATCH (n:ARTICLE {{articleID: '{0}'}})-[]-(:USER)-[]-(m:ARTICLE)
WHERE NOT m.articleID = '{0}'
RETURN m""".format(article['articleID']))
df = pd.DataFrame([rec['m'] for rec in yield_record(cur)])

# N = number of times each article was returned by the query above.
N = pd.DataFrame(df.groupby('articleID').size().rename('N'))
df1 = df.merge(N, left_on='articleID', right_on='articleID').drop_duplicates().set_index('articleID')

# Rank by overlap, breaking ties by pagerank; the href value is closed by exactly
# one quote so the generated anchor is well-formed HTML.
df1.sort_values(['N', 'pagerank'], ascending=False).head(5).style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,connComponent,headline,newDesk,pagerank,pubDate,webURL,N
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
58ac68f595d0e02474637c29,By MICHAEL D. SHEAR and RON NIXON,159386,More Immigrants Face Deportation Under New Rules,National,59.3884,2017-02-21 16:21:02,Link,15
5ada0944068401528a2a9d2f,By AMY CHOZICK,159386,‘They Were Never Going To Let Me Be President’,OpEd,35.7204,2018-04-20 15:37:32,Link,15
58d525e17c459f247805dc9e,"By ROBERT PEAR, THOMAS KAPLAN and MAGGIE HABERMAN",159386,G.O.P. Revolt Sinks Bid to Void Health Law,National,82.2424,2017-03-24 13:57:40,Link,14
5add2004068401528a2aa14b,By PATRICIA COHEN and ROBERT GEBELOFF,159386,Public Servants Losing Foothold In Middle Class,Business,13.2119,2018-04-22 23:51:29,Link,13
588e0b1e95d0e0392607d69a,By PETER BAKER,159386,Trump Modifies Ban on Migrants as Outcry Grows,National,53.5918,2017-01-29 15:32:40,Link,12
