In [187]:
import os

import networkx as nx
import pandas as pd
from neo4j import GraphDatabase

In [367]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks reproduce the original hard-coded
# local-development values, so an unconfigured environment behaves as before.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "mag"),
    ),
)

In [169]:
def get_top_level_fields():
    """Return the level-0 fields of study as a DataFrame.

    Runs a Cypher query for every ``FieldsOfStudy`` node whose ``level`` is 0
    and builds one row per node from its properties, with the
    ``createdDate``, ``mainType`` and ``displayName`` columns removed.

    Returns:
        pandas.DataFrame: one row per matching node; an empty frame when the
        query matches nothing.
    """
    TOP_LEVEL_FOS_QUERY = """
        MATCH (c:FieldsOfStudy) WHERE c.level = 0 RETURN c
    """
    with driver.session() as session:
        records = session.run(TOP_LEVEL_FOS_QUERY).data()
    # errors='ignore': a frame built from zero rows has no columns, so a plain
    # drop() would raise KeyError instead of returning the empty frame.
    return pd.DataFrame([dict(rec['c']) for rec in records]).drop(
        columns=['createdDate', 'mainType', 'displayName'], errors='ignore'
    )

In [256]:
# Display the FieldsOfStudy nodes whose level is 0.
get_top_level_fields()

Unnamed: 0,citationCount,level,paperCount,fieldOfStudyId,rank,normalizedName
0,5899803,0,6881470,95457728,14747,history
1,7487874,0,3977456,205649164,8880,geography
2,36760247,0,4965136,127313418,8467,geology
3,71789329,0,6671940,33923547,7216,mathematics
4,89882893,0,14168189,41008148,7358,computer science
5,317500143,0,16932898,86803240,6663,biology
6,14578948,0,4605062,144133560,25000,business
7,13219878,0,4429896,17744445,8806,political science
8,43768427,0,3950679,162324750,25000,economics
9,78784820,0,17935539,127413603,25000,engineering


In [369]:
def get_subfields_by_size(parentName, minPapers, maxPapers, limit=10):
    """List fields under ``parentName`` whose paper count lies in an open range.

    Args:
        parentName: ``normalizedName`` of the field at the start of the
            ``PARENT`` relationship.
        minPapers: exclusive lower bound on ``paperCount``.
        maxPapers: exclusive upper bound on ``paperCount``.
        limit: maximum number of rows returned (default 10).

    Returns:
        pandas.DataFrame: one row per matching node with all of its
        properties, ordered by ``citationCount`` descending.
    """
    cypher = """
        MATCH (c:FieldsOfStudy)<-[:PARENT]-(parent:FieldsOfStudy{normalizedName: $fosName})
        WHERE c.paperCount > $minct AND c.paperCount < $maxct
        RETURN DISTINCT c
        ORDER BY c.citationCount DESC
        LIMIT $limit
    """
    query_params = dict(minct=minPapers, maxct=maxPapers, limit=limit, fosName=parentName)
    with driver.session() as session:
        records = session.run(cypher, **query_params).data()
        rows = [dict(record['c']) for record in records]
        return pd.DataFrame(rows)

In [371]:
# Fields under "linguistics" with more than 300 and fewer than 800 papers, capped at 50 rows.
get_subfields_by_size('linguistics', 300, 800, 50)

Unnamed: 0,citationCount,createdDate,level,displayName,paperCount,fieldOfStudyId,rank,normalizedName,mainType
0,23498,2016-06-24,2,Social interactionist theory,313,79500916,16876,social interactionist theory,
1,22183,2018-01-05,2,Audience design,322,2778906372,14805,audience design,
2,20120,2016-06-24,2,Biolinguistics,327,35511919,15108,biolinguistics,
3,19685,2016-06-24,2,Nominal group,631,141728528,14905,nominal group,
4,16826,2018-01-05,2,Brown Corpus,314,2777744975,15165,brown corpus,
5,16043,2018-01-05,2,Radical interpretation,373,2778569360,15556,radical interpretation,
6,13328,2016-06-24,2,Fast mapping,690,117015117,14783,fast mapping,
7,13219,2018-01-05,2,Origin of language,763,2780989915,14542,origin of language,
8,12671,2016-06-24,2,FrameNet,791,185754541,15122,framenet,
9,12529,2016-06-24,2,Linguistic sequence complexity,627,194926167,14749,linguistic sequence complexity,


In [172]:
def get_child_fields_by_id(parentFosID):
    """Return the fields linked by ``PARENT`` from the field with the given id.

    Args:
        parentFosID: ``fieldOfStudyId`` of the field at the start of the
            ``PARENT`` relationship.

    Returns:
        pandas.DataFrame: one row per matched node, without the
        ``createdDate``, ``mainType`` and ``displayName`` columns; an empty
        frame when the field has no such neighbours.
    """
    CHILD_FOS_QUERY = """
        MATCH (c:FieldsOfStudy)<-[:PARENT]-(parent:FieldsOfStudy{fieldOfStudyId: $fosid}) RETURN c
    """
    with driver.session() as session:
        records = session.run(CHILD_FOS_QUERY, fosid=parentFosID).data()
    # errors='ignore': with zero rows the frame has no columns and a plain
    # drop() would raise KeyError rather than return the empty frame.
    return pd.DataFrame([dict(rec['c']) for rec in records]).drop(
        columns=['createdDate', 'mainType', 'displayName'], errors='ignore'
    )

def get_child_fields_by_name(parentName):
    """Return the fields linked by ``PARENT`` from the field with the given name.

    Args:
        parentName: ``normalizedName`` of the field at the start of the
            ``PARENT`` relationship.

    Returns:
        pandas.DataFrame: one row per matched node, ordered by ``paperCount``
        ascending, without the ``createdDate``, ``mainType`` and
        ``displayName`` columns; an empty frame when nothing matches.
    """
    CHILD_FOS_QUERY = """
        MATCH (c:FieldsOfStudy)<-[:PARENT]-(parent:FieldsOfStudy{normalizedName: $fosName}) 
        RETURN c
        ORDER BY c.paperCount
    """
    with driver.session() as session:
        records = session.run(CHILD_FOS_QUERY, fosName=parentName).data()
    # errors='ignore': with zero rows the frame has no columns and a plain
    # drop() would raise KeyError rather than return the empty frame.
    return pd.DataFrame([dict(rec['c']) for rec in records]).drop(
        columns=['createdDate', 'mainType', 'displayName'], errors='ignore'
    )

In [171]:
# Fields linked by PARENT from "social epistemology", ordered by paperCount.
get_child_fields_by_name("social epistemology")

Unnamed: 0,citationCount,level,paperCount,fieldOfStudyId,rank,normalizedName
0,8309,3,945,184066805,14430,genetic epistemology
1,10257,3,1192,73059102,15975,formal epistemology
2,14898,3,1771,145097170,25000,epistemology of wikipedia


In [173]:
def get_authors(fieldOfStudyName):
    """Return the distinct authors of papers in a field as a DataFrame.

    NOTE: a later cell in this notebook defines another ``get_authors`` that
    returns raw records instead of a DataFrame; once that cell has run, it
    replaces this definition.

    Args:
        fieldOfStudyName: ``normalizedName`` of the field whose papers are
            followed through ``AUTHORED_BY`` to their authors.

    Returns:
        pandas.DataFrame: one row per distinct author, ordered by
        ``citationCount`` descending, without the ``createdDate`` and
        ``displayName`` columns; an empty frame when nothing matches.
    """
    AUTHOR_QUERY = """
        MATCH (p:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $name}) WITH p
        MATCH (p)-[:AUTHORED_BY]->(a:Author)
        RETURN DISTINCT a
        ORDER BY a.citationCount DESC
    """
    with driver.session() as session:
        records = session.run(AUTHOR_QUERY, name=fieldOfStudyName).data()
    # errors='ignore': with zero rows the frame has no columns and a plain
    # drop() would raise KeyError rather than return the empty frame.
    return pd.DataFrame([dict(rec['a']) for rec in records]).drop(
        columns=['createdDate', 'displayName'], errors='ignore'
    )

In [259]:
# Fetch the authors for the field, then keep the rows whose normalizedName
# starts with "kevin". This needs the DataFrame-returning get_authors; the
# redefinition further down the notebook returns a list of records instead.
res = get_authors("social epistemology")
res[res.normalizedName.str.startswith('kevin')]

Unnamed: 0,citationCount,paperCount,rank,authorId,normalizedName
447,1101,71,15392,2097678952,kevin t kelly
648,551,32,16732,2088804034,kevin j s zollman
885,276,34,16756,2053622605,kevin t jackson
1046,171,47,16846,2071876023,kevin mccain
1998,8,1,20517,2720828136,kevin vallier
2055,7,1,19975,2792820455,kevin r gregg
2092,6,1,20516,2181692093,kevin williams
3108,0,1,20873,2688938239,kevin kurdylo
4083,0,1,20873,2251971931,kevin anderson
4106,0,1,20873,2889292227,kevin hermberg


In [213]:
def get_coauthor_graph(fieldOfStudyName):
    """Build a directed co-authorship graph for the authors of one field.

    The query collects every author ``a`` of a paper in the field, then every
    author ``a2`` who shares an ``AUTHORED_BY`` paper with ``a``, and returns
    the distinct (``a``, ``a2``) pairs by ``normalizedName``.

    Args:
        fieldOfStudyName: ``normalizedName`` of the field of study.

    Returns:
        networkx.DiGraph: one edge per distinct name pair, with node labels
        replaced by consecutive integers.
    """
    # The columns are aliased explicitly. Without an alias the result key is
    # the literal text of the RETURN expression, so any whitespace edit to the
    # query would break the Python-side lookup with a KeyError.
    COAUTHOR_QUERY = """
        MATCH (p:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $fosName}) WITH p
        MATCH (p)-[:AUTHORED_BY]->(a:Author) WITH a
        MATCH (a:Author)<-[r1:AUTHORED_BY]-(p1:Paper)-[r2:AUTHORED_BY]->(a2:Author)
        RETURN DISTINCT a.normalizedName AS source, a2.normalizedName AS target
    """

    with driver.session() as session:
        query_results = session.run(COAUTHOR_QUERY, fosName=fieldOfStudyName).data()
    edge_list = [(res['source'], res['target']) for res in query_results]
    G = nx.DiGraph()
    G.add_edges_from(edge_list)
    return nx.convert_node_labels_to_integers(G)

In [260]:
# Build the co-authorship graph for the field. The function defined in this
# notebook is get_coauthor_graph; the previous name, get_coauthor_relations,
# is not defined anywhere visible and fails on a fresh kernel.
graph = get_coauthor_graph("social epistemology")

In [287]:
def get_papers(fieldOfStudyName):
    """Fetch the cited, referencing journal papers of one field of study.

    Only papers with ``citationCount > 0``, ``referenceCount > 0`` and
    ``docType = "Journal"`` are matched.

    Args:
        fieldOfStudyName: ``normalizedName`` of the field of study.

    Returns:
        list: the query's records as returned by ``.data()``, one per paper
        under the key ``p``.
    """
    cypher = """
        MATCH (p:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $fosName})
        WHERE p.citationCount > 0 AND p.referenceCount > 0 AND p.docType = "Journal"
        RETURN p
    """

    with driver.session() as session:
        return session.run(cypher, fosName=fieldOfStudyName).data()

In [375]:
# Number of records get_papers returns for "sociolinguistics".
len(get_papers("sociolinguistics"))

3787

In [290]:
def get_authors(fieldOfStudyName):
    """Fetch the distinct authors of a field's cited, referencing journal papers.

    NOTE: this redefines the ``get_authors`` from an earlier cell. That
    version returns a DataFrame; this one returns the raw ``.data()`` records
    and only considers papers with ``citationCount > 0``,
    ``referenceCount > 0`` and ``docType = "Journal"``.

    Args:
        fieldOfStudyName: ``normalizedName`` of the field of study.

    Returns:
        list: the query's records, one per distinct author under the key ``a``.
    """
    cypher = """
        MATCH (p:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $fosName})
        WHERE p.citationCount > 0 AND p.referenceCount > 0 AND p.docType = "Journal"
        MATCH (p)-[:AUTHORED_BY]->(a:Author)
        RETURN DISTINCT a
    """

    with driver.session() as session:
        return session.run(cypher, fosName=fieldOfStudyName).data()

In [376]:
# Number of distinct-author records for "sociolinguistics" (uses the list-returning get_authors).
len(get_authors("sociolinguistics"))

4577

In [179]:
# NOTE(review): the saved output of this cell shows records keyed by a
# co-author ID-pair expression, which does not match the `res` assigned from
# get_authors earlier. It is presumably left over from an out-of-order run;
# confirm and re-run or remove.
res

[{'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '2165615180',
   'rel': 'COAUTHOR',
   'end': '2558932145'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '2165615180',
   'rel': 'COAUTHOR',
   'end': '2157414154'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '2625440941'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '2497628704'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '2016486316'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '1738948060'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '2505626879'}},
 {'{start:a.authorId,rel:"COAUTHOR",end:a2.authorId}': {'start': '742588681',
   'rel': 'COAUTHOR',
   'end': '28893

In [384]:
def authors_to_graph(authorPairs):
    """Turn (a1, a2) author-pair records into an integer-labelled digraph.

    Args:
        authorPairs: iterable of mappings with keys ``'a1'`` and ``'a2'``,
            each holding a mapping with a ``'normalizedName'`` entry.

    Returns:
        networkx.DiGraph: one edge from a1's name to a2's name per pair, with
        node labels then replaced by consecutive integers.
    """
    digraph = nx.DiGraph()
    digraph.add_edges_from(
        [(pair['a1']["normalizedName"], pair['a2']['normalizedName']) for pair in authorPairs]
    )
    return nx.convert_node_labels_to_integers(digraph)

def get_cited_relations(fieldOfStudyName, limit=50000):
    """Fetch distinct (citing author, cited author) pairs within one field.

    The query starts from papers ``p1`` in the field, follows ``REFERENCES``
    to papers ``p2``, keeps authors ``a2`` of ``p2`` who also authored some
    paper in the same field, and pairs them with the authors ``a1`` of ``p1``.
    Both ``p1`` and ``p2`` must have ``citationCount > 0``,
    ``referenceCount > 0`` and ``docType = "Journal"``.

    Args:
        fieldOfStudyName: ``normalizedName`` of the field of study.
        limit: maximum number of pairs returned. Defaults to 50000, the value
            that used to be hard-coded in the query.

    Returns:
        list: the query's records, each with keys ``a1`` and ``a2``.
    """
    # The query has no ORDER BY, so when the limit truncates the result, the
    # query itself does not determine which pairs are kept.
    CITED_QUERY = """
        MATCH (p1:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $fosName})
        WHERE p1.citationCount > 0 AND p1.referenceCount > 0 AND p1.docType = "Journal" WITH p1
        MATCH (p1:Paper)-[r:REFERENCES]->(p2:Paper)
        WHERE p2.citationCount > 0 AND p2.referenceCount > 0 AND p2.docType = "Journal" WITH p1, p2
        MATCH (p2:Paper)-[:AUTHORED_BY]->(a2:Author)<-[:AUTHORED_BY]-(:Paper)-[:IN_FIELD]->(parent:FieldsOfStudy{normalizedName: $fosName})
        WITH p1, a2
        MATCH (p1:Paper)-[:AUTHORED_BY]->(a1:Author)
        RETURN DISTINCT a1, a2
        LIMIT $limit
    """
    with driver.session() as session:
        return session.run(CITED_QUERY, fosName=fieldOfStudyName, limit=limit).data()

In [359]:
def authors_to_graph(authorPairs):
    """Build an integer-labelled directed graph from author-pair records.

    This repeats the ``authors_to_graph`` defined in an earlier cell.

    Args:
        authorPairs: iterable of mappings with keys ``'a1'`` and ``'a2'``,
            each holding a mapping with a ``'normalizedName'`` entry.

    Returns:
        networkx.DiGraph: edges run from a1's name to a2's name; node labels
        are converted to consecutive integers before returning.
    """
    name_edges = []
    for record in authorPairs:
        name_edges.append((record['a1']["normalizedName"], record['a2']['normalizedName']))
    name_graph = nx.DiGraph()
    name_graph.add_edges_from(name_edges)
    return nx.convert_node_labels_to_integers(name_graph)

In [360]:
# Author citation pairs for "social epistemology".
# NOTE(review): the saved output shows this run ended in KeyboardInterrupt, so
# `rels` used further down must have come from another execution of this cell.
rels = get_cited_relations("social epistemology")

KeyboardInterrupt: 

In [388]:
# Build the author citation graph from `rels`, print summary statistics and
# export the graph as GraphML.
g = authors_to_graph(rels)
node_total = g.number_of_nodes()
print("Number nodes: {}".format(node_total))
edge_total = g.number_of_edges()
print("Number edges: {}".format(edge_total))
mean_clustering = nx.average_clustering(g)
print("Average clustering coefficient: {}".format(mean_clustering))
strong_components = nx.number_strongly_connected_components(g)
print("Number strongly connected components: {}".format(strong_components))
weak_components = nx.number_weakly_connected_components(g)
print("Number weakly connected components: {}".format(weak_components))
nx.write_graphml(g, "social_epistemology.graphml")

Number nodes: 799
Number edges: 1999
Average clustering coefficient: 0.12939346623157844
Number strongly connected components: 723
Number weakly connected components: 78


In [389]:
# Author citation pairs for "sociolinguistics".
ling_rels = get_cited_relations("sociolinguistics")

In [390]:
# Build the author citation graph from `ling_rels`, print summary statistics
# and export the graph as GraphML.
ling_g = authors_to_graph(ling_rels)
ling_node_total = ling_g.number_of_nodes()
print("Number nodes: {}".format(ling_node_total))
ling_edge_total = ling_g.number_of_edges()
print("Number edges: {}".format(ling_edge_total))
ling_mean_clustering = nx.average_clustering(ling_g)
print("Average clustering coefficient: {}".format(ling_mean_clustering))
ling_strong_components = nx.number_strongly_connected_components(ling_g)
print("Number strongly connected components: {}".format(ling_strong_components))
ling_weak_components = nx.number_weakly_connected_components(ling_g)
print("Number weakly connected components: {}".format(ling_weak_components))
nx.write_graphml(ling_g, "sociolinguistics.graphml")

Number nodes: 4188
Number edges: 23485
Average clustering coefficient: 0.1681175586475732
Number strongly connected components: 3064
Number weakly connected components: 66
