# Build the NetworkX graph and apply Centrality Measures to examine users/sites importance/influence

# 1. Build the NetworkX graph

In [None]:
%matplotlib inline

import os
import re
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split

from collections import Counter
import sqlite3
from html.parser import HTMLParser
from tqdm import tqdm
import random
import pickle
import itertools

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import networkx as nx

In [None]:
# Directory holding the CSV inputs; later cells build file paths from it
data_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/csv_data"
# Fail fast if the shared data directory is not reachable from this machine
assert os.path.exists(data_dir)

In [None]:
# Create a User->Site map. Format: dict {userId: tuple of siteIds}
# The file is located through data_dir so the data location is defined in one place only.
users = pd.read_csv(os.path.join(data_dir, "pcts.csv"))
print(len(users))
multi_site_count = 0
user_site_map = {}
# sort=False: visit users in order of first appearance instead of sorted by userId
for userId, group in users.groupby(by='userId', sort=False):
    siteIds = tuple(group.siteId.tolist())
    if len(siteIds) > 1:
        # This user is attached to more than one site
        multi_site_count += 1
    user_site_map[userId] = siteIds
print(f"{len(user_site_map)} users mapped to sites. ({multi_site_count} users to multiple sites.)")

In [None]:
# Test User->Site map: look up the tuple of siteIds for one sample userId
# (raises KeyError if that userId is not in the map)
user_site_map[7326018]

In [None]:
# Preview the first rows of the users table
users.head()

In [None]:
# Remove every row whose userId is 0, modifying `users` in place
zero_user_rows = users.index[users.userId == 0]
users.drop(zero_user_rows, inplace=True)
len(users)

In [None]:
# Load the journal-replies network data from the shared data directory
jr_path = os.path.join(data_dir, "jr.csv")
jr = pd.read_csv(jr_path)
len(jr)

In [None]:
# Preview the first rows of the journal-replies table
jr.head()

In [None]:
# Keep only replies whose sender (from_userId) appears in the users table;
# all other rows are dropped from `jr` in place
author_ids = set(users.userId)
non_author_rows = jr.index[~jr.from_userId.isin(author_ids)]
jr.drop(non_author_rows, inplace=True)
len(jr)

In [None]:
# Preview the journal-replies table again, now that it is trimmed to authors
jr.head()

In [None]:
# Build the Site->User map: {siteId: tuple of userIds attached to that site}.
# Groups are visited in order of first appearance (sort=False); tqdm shows progress.
site_user_map = {
    siteId: tuple(group.userId.tolist())
    for siteId, group in tqdm(users.groupby(by='siteId', sort=False))
}

In [None]:
# Test Site->User map: look up the tuple of userIds for one sample siteId
# (raises KeyError if that siteId is not in the map)
site_user_map[838509]

In [None]:
# Create the initial graph. Nodes: users (represented by userId)
G = nx.DiGraph()
# One node per distinct reply sender; edges are added in a later cell
nodes = list(set(jr.from_userId))
G.add_nodes_from(nodes)
# Number of nodes in the graph so far
len(G)

In [None]:
# Build the raw edge list (from_userId -> to_userId): each reply row contributes
# one edge from its sender to every user mapped to the row's siteId.
# Repeated pairs are kept on purpose; they are counted as weights later.
edges = []
reply_rows = jr[['from_userId', 'siteId']].values
for fromUser, siteId in tqdm(reply_rows):
    edges.extend((fromUser, toUser) for toUser in site_user_map[siteId])

In [None]:
# Sort in place so that identical (from, to) pairs end up next to each other
edges.sort()
# Preview the first ten edges
edges[:10]

In [None]:
# Make a list of unique edges from the original list of edges and assign weights to each edge.
# The weight is the number of times the (from, to) pair occurs in `edges`.
# Counter counts correctly whether or not `edges` is currently sorted, so this cell no
# longer depends on the sort performed elsewhere; sorting the counted pairs keeps the
# output ordered by (from, to).
edge_counts = Counter(edges)
unique_edges = [
    (from_user, to_user, {'weight': edge_weight})
    for (from_user, to_user), edge_weight in sorted(edge_counts.items())
]

# Deduplication can never produce more edges than it started with
# (equality is legitimate when no pair is repeated).
assert len(unique_edges) <= len(edges)
print (len(unique_edges))
unique_edges[:15]

In [None]:
# Add edges to connect the nodes from a list of unique edges
# (each entry is (from_userId, to_userId, {'weight': count}))
G.add_edges_from(unique_edges)
# Node count after adding the edges
# NOTE(review): presumably larger than before if some edge targets were not already nodes - confirm
len(G)

In [None]:
# G.edges[from_userId, to_userId]: get the attribute dict (weight) of one directed edge.
# The pair must be ordered: G is a DiGraph, and indexing with a set would leave the
# edge direction up to the set's iteration order.
G.edges[16, 849533]

In [None]:
# largestScc_G: subgraph induced by the largest strongly connected component.
# max(..., key=len) finds that component in a single pass instead of sorting all of them.
largestScc_G = G.subgraph(max(nx.strongly_connected_components(G), key=len))
# Compare the component size with the size of the whole graph
len(largestScc_G), len(G)

In [None]:
# Test for the correctness of the directed graph: sum(in_degrees) == sum(out_degrees)
# (every edge adds exactly one to some node's in-degree and one to some node's out-degree).
indegrees = [in_degree for node, in_degree in largestScc_G.in_degree()]
outdegrees = [out_degree for node, out_degree in largestScc_G.out_degree()]
# Comparing the list lengths would always pass (one entry per node in both lists),
# so compare the totals as the comment above describes.
assert sum(indegrees) == sum(outdegrees)

In [None]:
# Plot in-degree and out-degree distributions side by side:
# 20 bins over degrees 1..40, with a log-scaled count axis
degree_series = [indegrees, outdegrees]
series_labels = ["In-degree", "Out-degree"]
plt.hist(degree_series, bins=20, range=(1,40), log=True, label=series_labels)
plt.xlabel("Node Degree")
plt.ylabel("Node Count")
plt.legend()
plt.show()

In [None]:
def getSortedKeys(d):
    """Return the keys of ``d`` ordered by their values, largest value first.

    Keys are first sorted by ascending value (stable) and the result is then
    reversed, so keys with equal values come out in the reverse of their
    order in ``d``.
    """
    ascending = sorted(d, key=d.get)
    return ascending[::-1]

def getSlice(l, n):
    """Return the first ``n`` items of ``l`` as a new list.

    Args:
        l: the list to take items from.
        n: how many leading items to keep; must satisfy ``0 <= n <= len(l)``.

    Returns:
        A new list holding exactly ``n`` items.

    Raises:
        ValueError: if ``n`` is negative or larger than ``len(l)``.
            (ValueError is a subclass of Exception, so existing
            ``except Exception`` handlers keep working.)
    """
    if n < 0:
        # l[:n] with a negative n would silently drop items from the end
        # and return a slice that is not n items long.
        raise ValueError(f'n must be non-negative (got {n})')
    if n > len(l):
        raise ValueError(f'n exceeds list length ({n} > {len(l)})')
    return l[:n]

# Centrality Measures
## 1. PageRank centrality
## 2. (In/Out)-Degree centrality
## 3. Betweenness centrality
## 4. Closeness centrality

In [None]:
# PageRank score for every user in the largest strongly connected component
# (dict: userId -> score)
pr = nx.pagerank(largestScc_G)
pr

In [None]:
# Rank users by PageRank score, highest first
pr_keys = getSortedKeys(pr)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
pr_slicedKeys = getSlice(pr_keys, 1000)
assert len(pr_slicedKeys) == 1000
pr_slicedKeys

## 2. (In/Out) Degree centrality

### Degree centrality

In [None]:
# Degree centrality for every user in the largest strongly connected component
# (dict: userId -> score)
deg_central = nx.degree_centrality(largestScc_G)
deg_central

In [None]:
# Rank users by degree centrality, highest first
deg_central_keys = getSortedKeys(deg_central)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
deg_central_slicedKeys = getSlice(deg_central_keys, 1000)
assert len(deg_central_slicedKeys) == 1000
deg_central_slicedKeys

### In-degree centrality

In [None]:
# In-degree centrality for every user in the largest strongly connected component
# (dict: userId -> score)
indeg_central = nx.in_degree_centrality(largestScc_G)
indeg_central

In [None]:
# Rank users by in-degree centrality, highest first
indeg_central_keys = getSortedKeys(indeg_central)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
indeg_central_slicedKeys = getSlice(indeg_central_keys, 1000)
assert len(indeg_central_slicedKeys) == 1000
indeg_central_slicedKeys

### Out-degree centrality

In [None]:
# Out-degree centrality for every user in the largest strongly connected component
# (dict: userId -> score)
outdeg_central = nx.out_degree_centrality(largestScc_G)
outdeg_central

In [None]:
# Rank users by out-degree centrality, highest first
outdeg_central_keys = getSortedKeys(outdeg_central)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
outdeg_central_slicedKeys = getSlice(outdeg_central_keys, 1000)
assert len(outdeg_central_slicedKeys) == 1000
outdeg_central_slicedKeys

## 3. Betweenness centrality

In [None]:
%time
btw_central = nx.betweenness_centrality(largestScc_G)
btw_central

In [None]:
# Rank users by betweenness centrality, highest first
btw_central_keys = getSortedKeys(btw_central)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
btw_central_slicedKeys = getSlice(btw_central_keys, 1000)
assert len(btw_central_slicedKeys) == 1000
btw_central_slicedKeys

## 4. Closeness centrality

In [None]:
%time
close_central = nx.closeness_centrality(largestScc_G)
close_central

In [None]:
# Rank users by closeness centrality, highest first
close_central_keys = getSortedKeys(close_central)
# Keep the top 1000; getSlice raises if fewer than 1000 users are available
close_central_slicedKeys = getSlice(close_central_keys, 1000)
assert len(close_central_slicedKeys) == 1000
close_central_slicedKeys

# Output processing

In [None]:
def getExtremes(n):
    """Return the users that appear in the top ``n`` of every intersected ranking.

    The rankings intersected are the degree, in-degree, out-degree, betweenness
    and closeness top lists built earlier in the notebook (module-level
    ``*_slicedKeys`` lists). The number of common users is printed.

    NOTE(review): the PageRank ranking (``pr_slicedKeys``) is not part of the
    intersection - confirm this is intentional.

    Args:
        n: how many leading entries of each ranking to consider.

    Returns:
        A list of the userIds common to all five top-``n`` slices.
    """
    top_degree, *other_rankings = (
        ranking[:n]
        for ranking in (
            deg_central_slicedKeys,
            indeg_central_slicedKeys,
            outdeg_central_slicedKeys,
            btw_central_slicedKeys,
            close_central_slicedKeys,
        )
    )
    common_nodes = set(top_degree).intersection(*other_rankings)
    print("Number of extreme nodes (users): ", len(common_nodes))
    return list(common_nodes)

In [167]:
# Users in the top 100 of all five rankings intersected by getExtremes
getExtremes(100)

Number of extreme nodes (users):  6


[7326018, 4007054, 17508946, 7781298, 4258911, 23274911]

In [168]:
# Same intersection with a wider cut-off: top 200 of each ranking
getExtremes(200)

Number of extreme nodes (users):  22


[2875016,
 1030664,
 1706251,
 4007054,
 20270991,
 5072794,
 23274911,
 28523950,
 7781298,
 23582261,
 710846,
 7326018,
 6065347,
 574151,
 5731271,
 20682957,
 17508946,
 25423957,
 4258911,
 16152816,
 15246195,
 6746237]

In [None]:
# Combine centrality results into one dataframe, one row per user.
# Each column is looked up by userId rather than pasted in positionally from
# dict.values(), so a row can never mix scores of different users even if the
# result dicts iterate in different orders. A userId missing from any measure
# raises KeyError instead of silently misaligning the table.
user_ids = list(deg_central)
centrality_columns = {
    'degree_centrality': deg_central,
    'indeg_centrality': indeg_central,
    'outdeg_centrality': outdeg_central,
    'btw_centrality': btw_central,
    'close_centrality': close_central,
}
df1 = pd.DataFrame({'userId': user_ids})
for column_name, scores in centrality_columns.items():
    df1[column_name] = [scores[user_id] for user_id in user_ids]
df1

In [None]:
# Make sure the keys align: every measure must list the same userIds in the same
# iteration order. dict.keys() views compare like sets and would not notice a
# different ordering, so compare the key sequences as lists.
assert (list(indeg_central) == list(deg_central) == list(outdeg_central)
        == list(btw_central) == list(close_central))

In [None]:
# Write to csv file
# Destination of the combined centrality table
# NOTE(review): to_csv also writes the row index by default - confirm that is wanted
centrality_csv_path = "/home/srivbane/vuong067/CaringBridge/journal-replies-analysis/KhiemV_work/current_work/notebooks/centrality_measures.csv"
df1.to_csv(centrality_csv_path)

In [None]:
# Create the list of extreme sites from extreme nodes
def convertToSiteList(nodeList):
    """Return the siteIds of all users in ``nodeList``, flattened into one list.

    Each user's sites are read from the module-level ``user_site_map`` and
    concatenated in the order of ``nodeList``; a site shared by several of the
    users appears once per user. The length of the resulting list is printed.

    Args:
        nodeList: iterable of userIds; each must be a key of ``user_site_map``.

    Returns:
        A list of siteIds.

    Raises:
        KeyError: if a userId is not present in ``user_site_map``.
    """
    sites_per_node = (user_site_map[node] for node in nodeList)
    siteList = list(itertools.chain.from_iterable(sites_per_node))
    print("Number of extreme sites corresponding to extreme nodes/users: ", len(siteList))
    return siteList

In [169]:
# Sites of the users found in the top 100 of all five rankings intersected by getExtremes
convertToSiteList(getExtremes(100))

Number of extreme nodes (users):  6
Number of extreme sites corresponding to extreme nodes/users:  6


[156877, 88261, 599513, 552325, 983390, 612345]