**Nodes 2020 Talk: Network Like an Egghead**


This notebook examines a LinkedIn personal network through a Python interface to a Neo4j Desktop environment, which includes a graph database. 

Due to the inclusion of personal information, I will not share the data directly, but will be happy to answer questions about it so that the results can be reproduced. 

In [21]:
# Environment Setup

In [1]:
import os # Read connection settings from environment variables
import pickle # Data serialization

import pandas as pd # Data inspection and Manipulation
from neo4j import GraphDatabase # The neo4j driver for python; allows interface with Neo4j 
from py2neo import Graph # Allows easier neo4j database & graph commands/queries w/ python
from py2neo import Node, Relationship, NodeMatcher
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Establish bolt connection to database. Make sure database is started from Neo4j Desktop!
# Connection settings are read from the environment (NEO4J_URL, NEO4J_USER,
# NEO4J_PASSWORD) so real credentials do not have to be stored in the notebook.
# The fallbacks reproduce the original local demo values.

url = os.environ.get("NEO4J_URL", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "testy")
graph_3 = Graph(url, auth=(neo4j_user, neo4j_password))

In [5]:
# Check how many nodes and links the database currently holds
node_total = len(graph_3.nodes)
relationship_total = len(graph_3.relationships)
node_total, relationship_total

(0, 0)

In [4]:
# WARNING: destructive. Empties the connected database; the node and link counts
# are (0, 0) afterwards. Run only when you want to rebuild the graph from scratch.
graph_3.delete_all() # In case you need to delete the database

In [None]:
# new_dict

In [None]:
# newnewdict

Import data (CSV table and pickled dictionary)
- "data_for_common_nodes": a table of my 'nodes'; each row is a LinkedIn 1st-order contact
- "mutual connections": a pickled Python dictionary of my mutual connections; 
                        Each key is the profile ID of a direct contact
                        Each value is a list of profile URLs of the contacts we have in common

In [6]:
# Direct, 1st order nodes
# Later cells read the columns linkedinProfile, profileId and jobTitle from this table.
df = pd.read_csv('data_for_common_nodes_final.csv')

In [7]:
# df.head()

In [8]:
# Mutual Connections between 1st order nodes
# Collected and cleaned, and placed in serialized (pickled) file
# NOTE: unpickling can execute arbitrary code -- only load a file you created yourself.
filename = 'mutual_connections'
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    new_dict = pickle.load(infile)

Data Wrangling

In [9]:
# Lookup tables between profile URLs, profile IDs and job titles,
# all built from columns of the 1st-order contact table.
profile_urls = df['linkedinProfile']
profile_ids = df['profileId']
job_titles = df['jobTitle']

url_to_id = dict(zip(profile_urls, profile_ids))
id_to_url = dict(zip(profile_ids, profile_urls))
id_to_title = dict(zip(profile_ids, job_titles))
url_to_title = dict(zip(profile_urls, job_titles))

In [10]:
# These lines are meant for cleanup. Since the nodes from the two sources above
# were collected at different times, they may be inconsistent.
# This uses the nodes drawn from the mutual connections file as the 'master' set.
# Keys equal to 0 are dropped; every other key must exist in id_to_url
# (a KeyError here means the two sources disagree about a contact).

node_list = new_dict.keys()
list_of_node_lists = [id_to_url[x] for x in node_list if x != 0]

# Membership tests against a set are constant time; scanning the list for every
# mutual connection made this step quadratic in the number of contacts.
known_urls = set(list_of_node_lists)

# Keep only mutual connections that are themselves in the master set. Anything
# else (blanks, NaNs, URLs of unknown contacts) is dropped.
# Reassigning the value of an existing key is safe while iterating over items().
for k, v in new_dict.items():
    new_dict[k] = [x for x in v if x in known_urls]

In [12]:
# Deduplicate the dictionary of mutual connections. In the mutual connections dictionary,
# mirrored relationships exist. E.g., x-->y and y-->x are represented. Since my interpretation
# is an undirected graph, one of these relationships is superfluous. This code loops through
# the mutual connections to create a new dictionary, keeping each mirrored pair only once.
#
# NOTE(review): a connection listed by k but not listed back by an already-processed
# contact is dropped, while the same connection is kept when k happens to be processed
# first -- confirm this order dependence is intended.

newnewdict = {}
for k, v in new_dict.items():
    # Register k up front (with an empty list) so a self-reference sees it as processed.
    newnewdict[k] = []

    # Keys without a profile row (e.g. the 0 placeholder) cannot be checked for mirrors.
    k_has_url = k in id_to_url
    k_url = id_to_url[k] if k_has_url else None

    newlist = []
    for val in v:
        other_id = url_to_id[val]

        if other_id not in newnewdict:
            # The other contact has not been processed yet, so record the edge here.
            newlist.append(val)
            continue

        if not k_has_url:
            # A bare `except:` used to swallow the KeyError raised for these keys (and
            # every other exception, including KeyboardInterrupt). The outcome for
            # them is unchanged: the connection is skipped.
            continue

        # Keep the edge only when the other contact lists k and has not already recorded it.
        if k_url in new_dict[other_id] and k_url not in newnewdict[other_id]:
            newlist.append(val)

    newnewdict[k] = newlist
    

In [13]:
# Number of distinct 1st order LinkedIn connections
distinct_contacts = set(list_of_node_lists)
len(distinct_contacts)

1842

Node and Edge Creation

In [14]:
# Create Nodes, then Edges for Direct Connections

# First create the 'central' node (the linkedin account owner)
me = Node('Person', name='Keita Broadwater', title='ML Eng') # creates a node object of type "Person"
                                             # using the attributes 'name' and 'title'
graph_3.create(me) # Creates a node in the Neo4j database


# This code loops through the set of nodes,
# first, creating the node in the Neo4j database,
# then, creating the edge between the 'central' node (me) and that 1st order node.
# Failures are collected and reported instead of being silently discarded, and
# catching Exception (rather than a bare except) lets the loop be interrupted.
failed_nodes = []
for node in set(list_of_node_lists):
    try:
        idd = url_to_id[node]
        titl = url_to_title[node]
        you = Node('Person', name=idd, title=titl) # assigns a node object to the variable 'you'
        graph_3.create(you) # create the node in Neo4j
        graph_3.create(Relationship(me, "knows", you)) # creates the edge of type 'knows'
                                                       # in Neo4j
    except Exception as exc:
        failed_nodes.append((node, exc))

if failed_nodes:
    # Only the count and the first error are printed, to keep profile URLs out of the output.
    print("Skipped {} of the 1st order nodes; first error: {!r}".format(
        len(failed_nodes), failed_nodes[0][1]))
    

In [15]:
# Create Edges for mutual connections

# Set copy of the master list for constant-time membership tests.
known_urls = set(list_of_node_lists)

# The name is passed as a query parameter instead of being formatted into the Cypher
# text, so quotes or backslashes in a value cannot break or alter the query.
# str() keeps the comparison against the string form, as the formatted query did.
match_by_name = 'MATCH (x) WHERE x.name=$name RETURN x'

unmatched = 0 # number of lookups for which no node was found in the database

for k, v in newnewdict.items(): # 'k' is the node under consideration
                                # 'v' is the list of mutually connected nodes of 'k'

    # Cleanup of zero items, and items that are not in master list.
    if k == 0:
        continue # Some items in original were zero. This line eliminates them.
    if id_to_url[k] not in known_urls: # convert id to URL, to match to master list.
        continue

    # Mutual connections must be in the master list and must not be blank;
    # the surviving URLs are converted to IDs.
    target_ids = [url_to_id[u] for u in v if u in known_urls and u != '']
    if not target_ids:
        continue

    # Fetch the Neo4j node for k once per key rather than once per edge.
    existing_u1 = graph_3.evaluate(match_by_name, {"name": str(k)})
    if existing_u1 is None:
        unmatched += 1
        continue

    # Cycle through the mutual connections and create their edges.
    for nodeu in target_ids:
        existing_u2 = graph_3.evaluate(match_by_name, {"name": str(nodeu)})
        if existing_u2 is None:
            # No such node in the database: skip rather than build an edge from None.
            unmatched += 1
            continue
        graph_3.create(Relationship(existing_u1, "knows", existing_u2))

if unmatched:
    print("{} node lookups returned no match; those edges were not created".format(unmatched))
            

Prep data for Export to Gephi

In [16]:
# Build the Gephi node table: one row per node, holding its Neo4j id and its title
cypher_all_nodes="MATCH (n) RETURN id(n),n.title "
all_nodes_cursor = graph_3.run(cypher_all_nodes)
nodes_df = all_nodes_cursor.to_data_frame()
nodes_df.columns = ['Id', 'Title']  # column names used by the Gephi import
nodes_df.to_csv('nodes_for_gephi.csv')
nodes_df.head()

Unnamed: 0,Id,Title
0,4018,ML Eng
1,4019,CTO
2,4020,
3,4021,General Manager
4,4022,Founder


In [17]:
# Build the Gephi edge table: one row per relationship, as (start node id, end node id)
cypher_all_edges="MATCH (n)-[r]->(m) RETURN id(n),id(m)"
all_edges_cursor = graph_3.run(cypher_all_edges)
edges_df = all_edges_cursor.to_data_frame()
edges_df.columns = ['Source', 'Target']  # column names used by the Gephi import
edges_df.to_csv('edges_for_gephi.csv')
edges_df.head()

Unnamed: 0,Source,Target
0,5726,4019
1,5085,4019
2,4018,4019
3,4018,4020
4,4398,4021


The next cells are not meant to be run in order, since some of them modify the database. Some of the query results are best viewed by running the queries directly in the Neo4j interface. Others are best viewed by collecting the results in pandas dataframes and visualizing them in the notebook, or by exporting the query results to Gephi.

In [None]:
# Queries to create and modify graphs

In [18]:
# Create a graph object
# Cypher text only: this cell stores the query in a variable and does not execute it.
# The query asks GDS to create a graph named 'my-linkedin-graph' from 'Person' nodes
# and 'knows' relationships; several gds.* queries further down refer to that name.
cypher_create_graph = '''CALL gds.graph.create(
    'my-linkedin-graph',
    'Person',
    'knows'
)
YIELD graphName, nodeCount, relationshipCount, createMillis;'''

In [None]:
# Delete a node from the graph database
# Cypher text only (not executed in this cell). It targets the Person node whose
# name is 'Keita Broadwater' -- the 'central' node -- and DETACH DELETE removes the
# node together with its relationships.
cypher_delete_node ='''MATCH (p:Person)
WHERE p.name = 'Keita Broadwater'
DETACH DELETE  p'''

In [None]:
# Queries for EDA

In [None]:
# Both strings repeat, unchanged, the queries already defined in the Gephi export
# cells; nothing is executed here.

# Return all edges
cypher_all_edges="MATCH (n)-[r]->(m) RETURN id(n),id(m)"

# Return all nodes
cypher_all_nodes="MATCH (n) RETURN id(n),n.title "

In [None]:
# Weakly-connected-component summary for 'my-linkedin-graph': the number of
# components plus the min / max / mean of the component distribution.
# Cypher text only; not executed in this cell.
cypher_component_count='''CALL gds.wcc.stats('my-linkedin-graph')
YIELD componentCount, componentDistribution
RETURN componentCount, 
       componentDistribution.min as min,
       componentDistribution.max as max,
       componentDistribution.mean as mean'''

In [None]:
# Get a list of components and their size
# (largest first, at most 1000 rows). Cypher text only; not executed here.
# NOTE(review): 'my-linkedin-graph-sans-keita' is not created in any visible cell --
# presumably it was created directly in Neo4j; confirm before running.
cypher_components_list='''CALL gds.wcc.stream('my-linkedin-graph-sans-keita') 
YIELD nodeId, componentId RETURN componentId, count(*) as size 
ORDER BY size DESC LIMIT 1000'''

# Longest shortest paths: for each pair of Person nodes (id(a) > id(b) visits every
# pair once), find a shortest 'knows' path in either direction and return the 10
# longest of them with their lengths.
cypher_diameter_count_and_show='''MATCH (a:Person), (b:Person) WHERE id(a) > id(b)
MATCH path = shortestPath((a)-[:knows*]-(b))
RETURN path, length(path) AS len
ORDER BY len DESC
LIMIT 10'''


In [None]:
# Queries for Centrality Measures

In [None]:
# The three queries below share one shape: stream a per-node score from a GDS
# algorithm run on 'my-linkedin-graph', look up each node's title, and sort by
# score, highest first. Cypher text only; nothing is executed in this cell.

# Degree
cypher_degree_centrality='''CALL gds.alpha.degree.stream('my-linkedin-graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, score
ORDER BY score DESC'''

# Betweenness
cypher_betweenness_centrality='''CALL gds.betweenness.stream('my-linkedin-graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, score
ORDER BY score DESC'''

# PageRank
cypher_page_rank='''CALL gds.pageRank.stream('my-linkedin-graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, score
ORDER BY score DESC'''

In [None]:
# Queries for Community Detection

In [None]:
# Modularity Optimization
# nodeId is returned alongside title and communityId because the merge below joins
# on it; without that column in the result the merge raises a KeyError.
modularity_opt = '''CALL gds.beta.modularityOptimization.stream('my-linkedin-graph')
YIELD nodeId, communityId
RETURN nodeId, gds.util.asNode(nodeId).title AS title, communityId
ORDER BY title'''
modularity_results_df = graph_3.run(modularity_opt).to_data_frame()
# Attach the community of each node to the Gephi node table via the Neo4j node id.
modularity_results = nodes_df.merge(modularity_results_df, left_on='Id', right_on='nodeId')
modularity_results.to_csv('modopt_nodes_for_gephi.csv')


In [None]:
# Weakly Connected Components: one row per node with the component it belongs to
wcc = '''CALL gds.wcc.stream('my-linkedin-graph')
YIELD nodeId, componentId
RETURN nodeId,gds.util.asNode(nodeId).title AS title, componentId
ORDER BY nodeId,componentId, title'''
wcc_cursor = graph_3.run(wcc)
wcc_results_df = wcc_cursor.to_data_frame()
# Join the component ids onto the Gephi node table via the Neo4j node id
wcc_results = pd.merge(nodes_df, wcc_results_df, left_on='Id', right_on='nodeId')
wcc_results.to_csv('wcc_nodes_for_gephi.csv')

In [20]:
# K1 Coloring - assigns different colors to adjacent nodes; useful as a counterpoint to
# community detection
k1_detection = '''CALL gds.beta.k1coloring.stream('my-linkedin-graph')
YIELD nodeId, color
RETURN nodeId,gds.util.asNode(nodeId).title AS title, color
ORDER BY nodeId'''
k1_cursor = graph_3.run(k1_detection)
k1_results_df = k1_cursor.to_data_frame()
# Join the colors onto the Gephi node table via the Neo4j node id
k1_results = pd.merge(nodes_df, k1_results_df, left_on='Id', right_on='nodeId')
k1_results.to_csv('k1_nodes_for_gephi.csv')

In [None]:
# K1 Coloring - assigns different colors to adjacent nodes.
# Same query as the previous K1 cell, but run on a second graph. The intent is a graph
# where the 'central' node has been removed, which leaves a few connected components
# and changes many of the centrality characteristics.
# NOTE(review): the graph queried here is 'my-linkedin-graph-undirected'; confirm it is
# the graph without the central node, and that it has been created before running.

k1_detection2 = '''CALL gds.beta.k1coloring.stream('my-linkedin-graph-undirected')
YIELD nodeId, color
RETURN nodeId,gds.util.asNode(nodeId).title AS title, color
ORDER BY nodeId'''
k1_cursor2 = graph_3.run(k1_detection2)
k1_results_df2 = k1_cursor2.to_data_frame()
k1_results2 = pd.merge(nodes_df, k1_results_df2, left_on='Id', right_on='nodeId')
# Node table output. It can be combined with the edge table written by the
# Gephi edge export cell.
k1_results2.to_csv('k1_nodes2_for_gephi.csv')

In [None]:
# Using Graph Sage to create embeddings, then perform K-means clustering on those embeddings
# Finally shaping the results for output as a Node Table

In [None]:
import numpy as np
import altair as alt
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics

In [None]:
# GraphSAGE embeddings streamed from 'my-linkedin-graph-undirected'
# (mean aggregator, sigmoid activation, 3-dimensional embeddings).
graph_sage = '''CALL gds.alpha.graphSage.stream(
  'my-linkedin-graph-undirected',
  {
    aggregator: 'mean',
    activationFunction: 'sigmoid',
    embeddingSize: 3,
    sampleSizes: [25, 10],
    degreeAsProperty: true
  }
)'''
graphsage_cursor = graph_3.run(graph_sage)
graphsage_results_df = graphsage_cursor.to_data_frame()
graphsage_results_df.head()

In [None]:
# Preview the first rows of the GraphSAGE result (same preview as the cell above).
graphsage_results_df.head()

In [None]:
# Pull the embeddings column out as a numpy array and keep the matching node ids
embedding_column = graphsage_results_df['embeddings']
X = embedding_column.to_numpy()
node_ids = graphsage_results_df['nodeId']

In [None]:
# Turn the array of per-node embeddings into a plain list, one entry per node
X = list(X)

In [None]:
# Fixed random_state so the k-means++ initialisation, and therefore the cluster
# labels, are the same on every run (n_init=1 means a single initialisation is used).
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1, random_state=42)

In [None]:
# Fit the k-means model on the embedding vectors in X.
km.fit(X)

In [None]:
# NOTE(review): the first four scores compare the clusters against `node_ids` as if
# they were ground-truth class labels. node_ids comes from the nodeId column, so if
# every id is unique these four numbers say nothing about cluster quality -- confirm,
# or replace node_ids with real labels. The silhouette score needs no ground truth.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(node_ids, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(node_ids, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(node_ids, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(node_ids, km.labels_))
# random_state fixes which 1000 points are sampled, so the score is repeatable.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000, random_state=42))

In [None]:
# Store each node's k-means cluster label in a new 'community' column
# (this modifies graphsage_results_df in place).
graphsage_results_df['community'] = list(km.labels_)

In [None]:
# Join embeddings and cluster labels onto the Gephi node table via the Neo4j node id
graphsage_results_nodes = pd.merge(
    nodes_df,
    graphsage_results_df,
    left_on='Id',
    right_on='nodeId',
)


In [None]:
# Preview the merged node table before exporting it.
graphsage_results_nodes.head()

In [None]:
# Write the merged node table (titles, embeddings, cluster labels) for import into Gephi.
graphsage_results_nodes.to_csv('graphsage_nodes_for_gephi.csv') # The node table

In [None]:
# Writes edges_df to 'edges_for_gephi.csv' again -- the same file name the Gephi edge
# export cell already wrote, so that file is overwritten.
edges_df.to_csv('edges_for_gephi.csv') # The edges table