In [1]:
%%capture
import sys
!{sys.executable} -m pip install py2neo;
import py2neo
import pandas as pd

In [2]:
# Shared py2neo Graph handle; every later cell issues its Cypher through graph.run().
graph = py2neo.Graph('http://neo4j:7474')

# New York Times Comments
The data contains information about the comments made on articles published in The New York Times in Jan–May 2017 and Jan–April 2018.

See: https://www.kaggle.com/aashita/nyt-comments/home

## Data Summary
### Nodes

In [3]:
def yield_record(cursor):
    """Yield every remaining record of ``cursor`` as a plain dict.

    Parameters
    ----------
    cursor
        Object exposing ``next()``, which returns a record with a ``data()``
        method and raises ``StopIteration`` once exhausted (the behaviour
        relied on here for the cursors returned by ``graph.run``).

    Yields
    ------
    dict
        The result of ``record.data()`` for each record, in cursor order.

    Notes
    -----
    Only ``StopIteration`` ends the iteration. Any other exception raised by
    ``cursor.next()`` or ``record.data()`` propagates to the caller instead of
    being silently treated as "no more rows".
    """
    while True:
        try:
            # Read from the argument, not from a notebook-level global.
            record = cursor.next()
        except StopIteration:
            return
        yield record.data()
            
# Total number of COMMENTED relationships; added to the summary table as an extra row.
comment_count_record = graph.run("""MATCH ()-[r:COMMENTED]->() RETURN count(*)""").next().data()
Ncomments = comment_count_record['count(*)']

# Node count and degree statistics (avg / std / median / min / max), grouped by label set.
cur = graph.run("""MATCH (n)
RETURN
DISTINCT labels(n),
count(*) AS nNodes,
avg(size( (n)-[]-() ) ) as avgDeg,
stdev(size( (n)-[]-() ) ) as stdDeg,
percentileDisc(size( (n)-[]-() ), 0.5) as medDeg, 
min(size( (n)-[]-() ) ) as minDeg,
max(size( (n)-[]-() ) ) as maxDeg""")

summary_columns = [
    'Label', 'N', 'Avg. Degree', 'Std. Dev. Degree',
    'Median Degree', 'Min. Degree', 'Max. Degree',
]
summary_rows = [list(record.values()) for record in yield_record(cur)]
# The relationship row has only a label and a count, so its degree columns stay empty.
summary_rows.append([['COMMENTED'], Ncomments])

pd.DataFrame(summary_rows, columns=summary_columns)

Unnamed: 0,Label,N,Avg. Degree,Std. Dev. Degree,Median Degree,Min. Degree,Max. Degree
0,[ARTICLE],9298,185.409228,315.736664,57.0,1.0,4996.0
1,[USER],301682,5.714411,21.186375,1.0,0.0,1571.0
2,[COMMENTED],1723935,,,,,


### What articles have the highest degree (i.e., most comments)?

In [4]:
# Keep the five ARTICLE nodes with the most incoming COMMENTED relationships,
# then re-match them so each returned row carries the node and its comment count.
cur = graph.run("""MATCH (a:ARTICLE)
WITH a, SIZE(()-[:COMMENTED]->(a)) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=()-[:COMMENTED]->(a)
RETURN a, count(p)""")

# One row per article: node properties plus the count. Sorted again in pandas,
# and webURL is rendered as a clickable link (anchor markup fixed: the original
# template emitted a stray double quote after the href value).
pd.DataFrame([{**res['a'], 'Comments': res['count(p)']} for res in yield_record(cur)]) \
    .sort_values('Comments', ascending=False) \
    .set_index('articleID') \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,Comments,byline,connComponent,headline,labelProp,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
58b0894195d0e0247463875e,4996,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,155742,Trump Intensifies Criticism of F.B.I. and Journalists,1749,National,195.696,2017-02-24 19:27:53,Link
5912391b7c459f24986de9ab,4184,By MICHAEL D. SHEAR and MATT APUZZO,173896,Trump Fires Comey Amid Russia Inquiry,15988,National,157.107,2017-05-09 21:48:03,Link
58ebb1437c459f24986d96ed,4014,By DANIEL VICTOR and MATT STEVENS,173896,"Man Is Dragged From a Full Jet, Stirring a Furor",2781,Business,239.803,2017-04-10 16:22:22,Link
591a524d7c459f24986dfc28,3791,By DAVID BROOKS,173896,When a Child Is Leading The World,16169,OpEd,165.153,2017-05-16 01:13:44,Link
5930616f7c459f24986e2e41,3709,By MICHAEL D. SHEAR,173896,Trump Abandoning Global Climate Accord,27060,Foreign,171.479,2017-06-01 18:48:08,Link


### What users have the highest degree (i.e., most comments)?

In [5]:
# Keep the five USER nodes with the most outgoing COMMENTED relationships,
# then re-match them to return each node together with its comment count.
cur = graph.run("""MATCH (u:USER)
WITH u, SIZE((u)-[:COMMENTED]->()) as cmtCnt
ORDER BY cmtCnt DESC LIMIT 5
MATCH p=(u)-[:COMMENTED]->()
RETURN u, count(p)""")

# One row per user: the node's properties plus a 'Comments' column.
top_user_rows = [
    {**res['u'], 'Comments': res['count(p)']}
    for res in yield_record(cur)
]
top_users = pd.DataFrame(top_user_rows)
top_users.sort_values('Comments', ascending=False).set_index('userID')

Unnamed: 0_level_0,Comments,connComponent,labelProp,pagerank,userDisplayName,userLocation
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
61986282.0,1571,155742,62885,0.15,Phyliss Dalmatian,"Wichita, Kansas"
17374907.0,1235,155742,44749,0.15,Blackmamba,Il
47123844.0,1232,155742,62885,0.15,Richard Luettgen,New Jersey
47112177.0,1174,155742,6882,0.15,manfred m,Bolivia
37475504.0,1157,155742,6882,0.15,John Doe,Johnstown


# Run PageRank and Connected Components

In [6]:
# PageRank with 20 iterations and damping 0.85; write:true stores each score
# in the "pagerank" property. The single result row holds the run statistics.
op = """CALL algo.pageRank(null, null, {iterations:20, dampingFactor:0.85, write: true, writeProperty:"pagerank"})
YIELD nodes, iterations, loadMillis, computeMillis, writeMillis, dampingFactor, write, writeProperty"""
pagerank_stats = graph.run(op).next().data()
pd.DataFrame([pagerank_stats])

Unnamed: 0,computeMillis,dampingFactor,iterations,loadMillis,nodes,write,writeMillis,writeProperty
0,196,0.85,20,2236,310980,True,15639,pagerank


In [7]:
# Union-find run with write:true, storing each node's partition id in the
# "connComponent" property. The single result row holds the run statistics.
op = """CALL algo.unionFind(null, null, {write:true, partitionProperty:"connComponent"})
YIELD nodes, setCount, loadMillis, computeMillis, writeMillis;"""
union_find_stats = graph.run(op).next().data()
pd.DataFrame([union_find_stats])

Unnamed: 0,computeMillis,loadMillis,nodes,setCount,writeMillis
0,37,2006,310980,2369,11370


## How many subgraphs are there?

In [8]:
# One row per distinct connComponent value with its node count, largest first.
op = """MATCH (n) RETURN distinct(n.connComponent) as partition, count(*) as nNodes ORDER by nNodes DESC"""
cur = graph.run(op)
records = [rec for rec in yield_record(cur)]

# Number of partitions = number of rows returned.
n_subgraphs = len(records)
print('Subgraphs: {}'.format(n_subgraphs))

partition_sizes = pd.DataFrame(records).set_index('partition')
partition_sizes.head()

Subgraphs: 2369


Unnamed: 0_level_0,nNodes
partition,Unnamed: 1_level_1
155742,208774
173896,99666
158068,13
195223,6
258842,6


## What are the most influential articles?

In [9]:
# Five ARTICLE nodes with the highest stored pagerank property.
cur = graph.run("""MATCH (n:ARTICLE)
RETURN n
ORDER by n.pagerank
DESC LIMIT 5""")

# One row per article node; connComponent is dropped from the display and
# webURL is rendered as a clickable link (anchor markup fixed: the original
# template emitted a stray double quote after the href value).
pd.DataFrame([rec['n'] for rec in yield_record(cur)]) \
    .sort_values('pagerank', ascending=False) \
    .set_index('articleID') \
    .drop('connComponent', axis=1) \
    .style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0_level_0,byline,headline,labelProp,newDesk,pagerank,pubDate,webURL
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
58e4d28e7c459f24986d87c9,By KATHERINE SCHULTEN,Our Eighth Annual Found Poem Student Contest,2095,Learning,272.599,2017-04-05 11:18:34,Link
58ebb1437c459f24986d96ed,By DANIEL VICTOR and MATT STEVENS,"Man Is Dragged From a Full Jet, Stirring a Furor",2781,Business,239.803,2017-04-10 16:22:22,Link
58b0894195d0e0247463875e,By JULIE HIRSCHFELD DAVIS and MICHAEL M. GRYNBAUM,Trump Intensifies Criticism of F.B.I. and Journalists,1749,National,195.696,2017-02-24 19:27:53,Link
5930616f7c459f24986e2e41,By MICHAEL D. SHEAR,Trump Abandoning Global Climate Accord,27060,Foreign,171.479,2017-06-01 18:48:08,Link
591a524d7c459f24986dfc28,By DAVID BROOKS,When a Child Is Leading The World,16169,OpEd,165.153,2017-05-16 01:13:44,Link


## Simple collaborative filtering

In [71]:
# Pick one ARTICLE node at random: attach rand() to every article, order by it,
# and keep the first. The choice therefore changes on every run.
cur = graph.run("""MATCH (n:ARTICLE) 
WITH n, rand() as r
ORDER BY r
RETURN n
LIMIT 1
""")
random_record = cur.next().data()
article = random_record['n']
article

(_63025:ARTICLE {articleID: '5ad52a7e068401528a2a902a', byline: 'By MICHELLE GOLDBERG', connComponent: 155742, headline: 'Lordy, Is There a Tape?', labelProp: 63025, newDesk: 'OpEd', pagerank: 4.320236, pubDate: '2018-04-16 22:58:05', webURL: 'https://www.nytimes.com/2018/04/16/opinion/comey-book-steele-dossier.html'})

In [72]:
# Articles reachable from the chosen article through a USER node, i.e. articles
# sharing at least one commenter with it; the chosen article itself is excluded.
# One row is returned per matching path, so an article repeats once per path.
#
# The article ID is passed as a Cypher parameter rather than formatted into the
# query text, so quotes or other special characters in the value cannot break
# or alter the query.
cur = graph.run(
    """MATCH (n:ARTICLE {articleID: $articleID})-[]-(:USER)-[]-(m:ARTICLE)
WHERE NOT m.articleID = $articleID
RETURN m""",
    {'articleID': article['articleID']},
)
rows = [rec['m'] for rec in yield_record(cur)]

In [73]:
# One row per returned article occurrence (articles repeat across rows).
df = pd.DataFrame(rows)
# N = number of rows in which each articleID appears.
N = pd.DataFrame(df.groupby('articleID').size().rename('N'))
# Attach N to every row, then collapse the repeated article rows.
df1 = df.merge(N, left_on='articleID', right_on='articleID').drop_duplicates()
# Rank by occurrence count, breaking ties by pagerank; webURL is rendered as a
# clickable link (anchor markup fixed: the original template emitted a stray
# double quote after the href value).
df1.sort_values(['N', 'pagerank'], ascending=False).head(5).style.format({'webURL': lambda x: f'<a target="_blank" href="{x}">Link</a>'})

Unnamed: 0,articleID,byline,connComponent,headline,labelProp,newDesk,pagerank,pubDate,webURL,N
55439,5acd2e87068401528a2a690c,By THE EDITORIAL BOARD,155742,"The Law Is Coming, Mr. Trump",62425,Editorial,33.0822,2018-04-10 21:37:09,Link,117
14771,5aaf074647de81a90121246d,By CHARLES M. BLOW,155742,Trump: The Un-American President,6922,OpEd,14.336,2018-03-19 00:41:39,Link,106
11803,5aa5ccab47de81a90120d30c,By CHARLES M. BLOW,155742,Melania Knew,6587,OpEd,23.7514,2018-03-12 00:41:11,Link,105
35746,58bcc2ba7c459f2525d1ff90,By THE EDITORIAL BOARD,155742,When One President Smears Another,44269,Editorial,34.0502,2017-03-06 02:00:17,Link,94
53585,5ad3a684068401528a2a8cfd,By MICHAEL D. SHEAR and PETER BAKER,155742,"Comey, in Interview, Launches All-Out War Against President",62234,Washington,18.349,2018-04-15 19:22:39,Link,94
