In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
# %matplotlib widget

In [2]:
# Load the social-network edge list: space-separated, no header row
# (column names are supplied explicitly as A and B).
df_social = pd.read_csv(
    './data/higgs-social_network.edgelist',
    sep=' ',
    names=['A', 'B'],
)

In [3]:
# Load the mention-network edge list: space-separated, no header row.
# Columns are named A, B and w (w is presumably an edge weight -- confirm).
df_mention = pd.read_csv(
    './data/higgs-mention_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [4]:
# Load the retweet-network edge list: space-separated, no header row.
# Columns are named A, B and w (w is presumably an edge weight -- confirm).
df_retweet = pd.read_csv(
    './data/higgs-retweet_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [5]:
# Load the reply-network edge list: space-separated, no header row.
# Columns are named A, B and w (w is presumably an edge weight -- confirm).
df_reply = pd.read_csv(
    './data/higgs-reply_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [18]:
import time

In [10]:
# Build the directed social graph: each row of df_social becomes an edge A -> B.
# Previously observed runtime:
#   479.2698624134064 seconds
#   7.98783104022344 min
G_social = nx.from_pandas_edgelist(
    df_social,
    source='A',
    target='B',
    create_using=nx.DiGraph,
)

479.2698624134064 seconds
7.98783104022344 min


In [7]:
import pickle

In [11]:
# Write the directed social graph to disk as a pickle.
with open('./pickles/G_social_DiGraph.pickle', mode='wb') as graph_file:
    pickle.dump(G_social, graph_file)

In [9]:
import metrics

In [14]:
# Compute all metrics in one call from the social graph and the
# mention / reply / retweet edge-list dataframes.
# Previously observed runtime:
#   69.0646619796753 seconds
#   1.1510776996612548 min
all_metrics = metrics.all_metrics(
    G_social,
    df_mention,
    df_reply,
    df_retweet,
)

69.0646619796753 seconds
1.1510776996612548 min


In [13]:
# Write the full metrics mapping to disk as a pickle.
with open('./pickles/all_metrics.pickle', mode='wb') as metrics_file:
    pickle.dump(all_metrics, metrics_file)

In [11]:
# Show which metric names metrics.all_metrics returned.
all_metrics.keys()

dict_keys(['f1', 'f3', 'm1', 'm2', 'm3', 'm4', 'rt1', 'rt2', 'rt3', 'rp1', 'rp3'])

In [12]:
# Collect every metric into one dataframe, one row per node of G_social.
from utilities import list_of_values

# Keep each metric under its own name as well; later cells pass these
# objects to the measures.* functions.
f1_dict, f3_dict = all_metrics['f1'], all_metrics['f3']
m1_dict, m2_dict = all_metrics['m1'], all_metrics['m2']
m3_dict, m4_dict = all_metrics['m3'], all_metrics['m4']
rp1_dict, rp3_dict = all_metrics['rp1'], all_metrics['rp3']
rt1_dict, rt2_dict, rt3_dict = all_metrics['rt1'], all_metrics['rt2'], all_metrics['rt3']

# Column name -> metric object, listed in the column order of the final table.
metric_columns = {
    'f1': f1_dict,
    'f3': f3_dict,
    'm1': m1_dict,
    'm2': m2_dict,
    'm3': m3_dict,
    'm4': m4_dict,
    'rp1': rp1_dict,
    'rp3': rp3_dict,
    'rt1': rt1_dict,
    'rt2': rt2_dict,
    'rt3': rt3_dict,
}

df_metrics = pd.DataFrame()
df_metrics['nodeId'] = G_social.nodes()

# list_of_values(d) stands in for the inline form
# [dict(d)[key] for key in dict(d).keys()].
# NOTE(review): columns are filled positionally, so this assumes each metric
# yields its values in the same node order as G_social.nodes() -- confirm.
for column_name, metric_values in metric_columns.items():
    df_metrics[column_name] = list_of_values(metric_values)

df_metrics

Unnamed: 0,nodeId,f1,f3,m1,m2,m3,m4,rp1,rp3,rt1,rt2,rt3
1,1,16280,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,4707,77,0.0,0.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0
3,3,137,25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,8643,402,7.0,6.0,106.0,104.0,1.0,3.0,7.0,86.0,77.0
5,5,2194,58,0.0,0.0,4.0,4.0,0.0,1.0,0.0,24.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...
456622,456622,0,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
456623,456623,0,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456624,456624,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456625,456625,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Write the metrics dataframe to disk as a pickle.
with open('./pickles/df_metrics.pickle', mode='wb') as frame_file:
    pickle.dump(df_metrics, frame_file)

In [13]:
import measures

In [14]:
# Betweenness centrality, approximated from k = 1 sampled node.
# With k set, NetworkX chooses the sampled nodes at random, so a fixed seed is
# passed to make this estimate (and the pickle written from it) reproducible
# across kernel restarts.
start = time.time()
betweeness_dict = nx.betweenness_centrality(G_social, k = 1, seed = 42)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')
# Previously observed runtime (unseeded run)
# 98.08658075332642 seconds
# 1.6347763458887736 min

71.60240054130554 seconds
1.1933733423550923 min


In [15]:
# Write the k = 1 betweenness estimate to disk as a pickle.
with open('./pickles/betweeness_dict_k1.pickle', mode='wb') as betweenness_file:
    pickle.dump(betweeness_dict, betweenness_file)

In [70]:
# Betweenness centrality, approximated from k = 10 sampled nodes.
# The sampled nodes are chosen at random, so a fixed seed is passed to make
# the estimate reproducible across kernel restarts.
# This rebinds betweeness_dict, replacing the k = 1 estimate computed earlier;
# every later use of betweeness_dict sees the k = 10 values when the notebook
# is run top to bottom.
start = time.time()
betweeness_dict = nx.betweenness_centrality(G_social, k = 10, seed = 42)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')
# Previously observed runtime (unseeded run)
# 590.0320670604706 seconds
# 9.833867784341177 min

509.9132812023163 seconds
8.49855468670527 min


In [71]:
# Write the k = 10 betweenness estimate to disk as a pickle.
# NOTE(review): this filename has no underscore before "k10", unlike
# 'betweeness_dict_k1.pickle'; left as-is because other code may load this path.
with open('./pickles/betweeness_dictk10.pickle', mode='wb') as betweenness_file:
    pickle.dump(betweeness_dict, betweenness_file)

In [135]:
# start = time.time()
# closeness_dict = nx.closeness_centrality(G_social)
# end = time.time()
# print(f'{end - start} seconds')
# print(f'{(end - start)/60.} min')

In [20]:
# start = time.time()

# katz_dict = nx.katz_centrality(G_social, max_iter=5, tol=1e-02)

# end = time.time()
# print(f'{end - start} seconds')
# print(f'{(end - start)/60.} min')

In [72]:
# Eigenvector centrality of the social graph, with a loosened convergence
# tolerance (tol=1e-02); wall-clock time is printed afterwards.
# Previously observed runtime:
#   20.315030813217163 seconds
#   0.3385838468869527 min
start = time.time()
eigenvector_dict = nx.eigenvector_centrality(
    G_social,
    tol=1e-02,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

21.40028715133667 seconds
0.35667145252227783 min


In [73]:
# Write the eigenvector-centrality result to disk as a pickle.
with open('./pickles/eigenvector_dict_tol1e02.pickle', mode='wb') as eigenvector_file:
    pickle.dump(eigenvector_dict, eigenvector_file)

In [15]:
# PageRank of the social graph, with a loosened convergence tolerance
# (tol=1e-02); wall-clock time is printed afterwards.
# Previously observed runtime:
#   351.44464778900146 seconds
#   5.857410796483358 min
start = time.time()
pagerank_dict = nx.pagerank(
    G_social,
    tol=1e-02,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

348.38092517852783 seconds
5.806348752975464 min


In [16]:
# Write the PageRank result to disk as a pickle.
# NOTE(review): the filename says "tol1e032", but PageRank is computed with
# tol=1e-02 (the eigenvector pickle uses "tol1e02"). This looks like a typo;
# the path is left unchanged because other code may load it -- confirm before renaming.
with open('./pickles/pagerank_dict_tol1e032.pickle', mode='wb') as pagerank_file:
    pickle.dump(pagerank_dict, pagerank_file)

In [19]:
# Degree centrality of every node in the social graph; wall-clock time is printed.
start = time.time()
degc_dict = nx.degree_centrality(
    G_social,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

2.2150933742523193 seconds
0.03691822290420532 min


In [20]:
# FollowerRank via measures.follower_rank, fed the graph and the f1 / f3
# metrics; wall-clock time is printed.
start = time.time()
followerRank_dict = measures.follower_rank(
    G_social,
    f1_dict,
    f3_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.16405415534973145 seconds
0.002734235922495524 min


In [21]:
# TFF via measures.tff, fed the graph and the f1 / f3 metrics;
# wall-clock time is printed.
start = time.time()
tff_dict = measures.tff(
    G_social,
    f1_dict,
    f3_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.21016287803649902 seconds
0.0035027146339416506 min


In [22]:
# Popularity via measures.popularity, fed the graph and the f1 metric;
# wall-clock time is printed.
start = time.time()
pop_dict = measures.popularity(
    G_social,
    f1_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.7197210788726807 seconds
0.011995351314544678 min


In [23]:
# A-score via measures.a_score, fed the graph and the f1 / m4 / rp3 / rt3
# metrics; wall-clock time is printed.
start = time.time()
a_score_dict = measures.a_score(
    G_social,
    f1_dict,
    m4_dict,
    rp3_dict,
    rt3_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.8100881576538086 seconds
0.013501469294230144 min


In [24]:
# RI (retweet impact) via measures.retweet_impact, fed the graph and the
# rt2 / rt3 metrics; wall-clock time is printed.
start = time.time()
retweet_impact_dict = measures.retweet_impact(
    G_social,
    rt2_dict,
    rt3_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.16015386581420898 seconds
0.0026692310969034833 min


In [25]:
# MI (mention impact) via measures.mention_impact, fed the graph and the
# m1 / m2 / m3 / m4 metrics; wall-clock time is printed.
start = time.time()
mention_impact_dict = measures.mention_impact(
    G_social,
    m1_dict,
    m2_dict,
    m3_dict,
    m4_dict,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.1699974536895752 seconds
0.002833290894826253 min


In [26]:
# Assemble every centrality / influence score into one table, one row per
# node of G_social.
# Column name -> score object, listed in the column order of the final table.
centrality_columns = {
    'degc': degc_dict,
    'betwc': betweeness_dict,
    # 'closc': closeness_dict,  # closeness is not computed (its cell is commented out)
    'eigenvectorc': eigenvector_dict,
    'pagerankc': pagerank_dict,
    'follower_rank': followerRank_dict,
    'tff': tff_dict,
    'popularity': pop_dict,
    'a_score': a_score_dict,
    'retweet_imp': retweet_impact_dict,
    'mention_imp': mention_impact_dict,
}

df_social_centralities = pd.DataFrame()
df_social_centralities['nodeId'] = G_social.nodes()

# NOTE(review): columns are filled positionally, so this assumes each score
# object yields its values in the same node order as G_social.nodes() -- confirm.
for column_name, scores in centrality_columns.items():
    df_social_centralities[column_name] = list_of_values(scores)

# Replace the string placeholders 'infty' / '-infty' with 0.0.
df_social_centralities = df_social_centralities.replace({'infty': 0.0, '-infty': 0.0})
df_social_centralities

Unnamed: 0,nodeId,degc,betwc,eigenvectorc,pagerankc,follower_rank,tff,popularity,a_score,retweet_imp,mention_imp
1,1,0.035701,0.079047,7.556036e-03,3.023522e-02,0.998650,740.000000,1.0,16280.000000,0.000000,0.000000
2,2,0.010477,0.001071,2.893191e-02,2.317112e-04,0.983905,61.129870,1.0,4710.000000,0.000000,0.000000
3,3,0.000355,0.000108,2.099042e-03,5.675661e-06,0.845679,5.480000,1.0,137.000000,0.000000,0.000000
4,4,0.019808,0.013509,5.788731e-02,3.825706e-04,0.955556,21.500000,1.0,8750.000169,373.567266,479.763119
5,5,0.004932,0.000749,2.487288e-02,7.903862e-05,0.974245,37.827586,1.0,2199.000053,76.273292,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
456622,456622,0.000004,0.000000,4.560751e-10,3.284964e-07,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
456623,456623,0.000026,0.000000,4.560751e-10,3.284964e-07,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
456624,456624,0.000002,0.000000,4.560751e-10,3.284964e-07,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
456625,456625,0.000002,0.000000,4.560751e-10,3.284964e-07,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [84]:
# Top 1000 nodes for each measure.
def _top_node_ids(frame, column, n=1000):
    """Return the set of nodeId values of the n rows with the largest `column`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a `nodeId` column and the score column named by `column`.
    column : str
        Name of the score column to rank by (descending).
    n : int, default 1000
        Number of top-ranked rows to keep.

    Returns
    -------
    set
        The nodeId values of the selected rows.

    Notes
    -----
    Rows are ranked with DataFrame.sort_values using its default sort
    algorithm, so which rows are kept among equal scores at the cut-off is
    whatever that sort produces.
    """
    ranked = frame.sort_values(by = column, ascending = False)
    return set(ranked.head(n).nodeId)


# One set per measure. The mention-impact set used to be computed twice in a
# row by a duplicated statement; it is now computed once.
top_follower_rank = _top_node_ids(df_social_centralities, 'follower_rank')
top_tff = _top_node_ids(df_social_centralities, 'tff')
top_popularity = _top_node_ids(df_social_centralities, 'popularity')
top_a_score = _top_node_ids(df_social_centralities, 'a_score')
top_retweet_imp = _top_node_ids(df_social_centralities, 'retweet_imp')
top_mention_imp = _top_node_ids(df_social_centralities, 'mention_imp')
top_pagerank = _top_node_ids(df_social_centralities, 'pagerankc')
top_degc = _top_node_ids(df_social_centralities, 'degc')
top_betw = _top_node_ids(df_social_centralities, 'betwc')
top_eigc = _top_node_ids(df_social_centralities, 'eigenvectorc')

In [28]:
# Union of the per-measure top-node sets.
# Earlier variant (also included top_pagerank):
# top_1000_nodes = top_follower_rank.union(top_tff).union(top_popularity).union(top_a_score).union(top_retweet_imp).union(top_mention_imp).union(top_pagerank).union(top_degc).union(top_betw)
# NOTE(review): top_pagerank and top_eigc are not part of this union -- confirm
# that leaving them out is intended.
top_1000_nodes_for_each_measure = (
    top_follower_rank
    | top_tff
    | top_popularity
    | top_a_score
    | top_retweet_imp
    | top_mention_imp
    | top_degc
    | top_betw
)

In [29]:
# Number of distinct nodes in the union of the per-measure top sets.
len(top_1000_nodes_for_each_measure)

4426

In [30]:
# Freeze the union set into a list so the later per-node score lists all
# follow one fixed node order. Despite the name, this holds every node of the
# union, not just 1000.
top_1000_nodes = [*top_1000_nodes_for_each_measure]

In [31]:
# Subgraph of the social graph restricted to the selected top nodes;
# wall-clock time is printed.
start = time.time()
G_top_1000_nodes = G_social.subgraph(
    top_1000_nodes,
)
end = time.time()
print(f'{end - start} seconds', f'{(end - start)/60.} min', sep='\n')

0.010117530822753906 seconds
0.0001686255137125651 min


In [32]:
# Spring layout of the top-node subgraph; wall-clock time is printed.
# spring_layout starts from random positions, so a fixed seed is passed to
# make the positions (which are pickled afterwards) reproducible across
# kernel restarts.
start = time.time()

nd_positions = nx.spring_layout(G_top_1000_nodes, seed=42)

end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

124.56958293914795 seconds
2.076159715652466 min


In [34]:
# Write the layout node positions to disk as a pickle.
with open('./pickles/nd_positions.pickle', mode='wb') as positions_file:
    pickle.dump(nd_positions, positions_file)

In [35]:
from utilities import normalize_vector, factor_vector

In [85]:
def _scaled_scores(score_by_node, nodes, factor):
    """Look up each node's score, then apply normalize_vector and factor_vector.

    Parameters
    ----------
    score_by_node : mapping
        Score lookup indexed by node id.
    nodes : iterable
        Node ids, in the order the output should follow.
    factor : float
        Second argument handed to factor_vector.

    Returns
    -------
    The value returned by
    factor_vector(normalize_vector(scores), factor), where `scores` is the
    list of per-node scores in `nodes` order.

    Raises
    ------
    KeyError
        If a node in `nodes` is missing from a dict-like `score_by_node`.
    """
    raw_scores = [score_by_node[node] for node in nodes]
    return factor_vector(normalize_vector(raw_scores), factor)


scaling_factor = 1000.0

# One normalized-and-scaled score list per measure, all ordered like
# top_1000_nodes. The ten copy-pasted three-line stanzas are replaced by one
# helper call each; the calls made and their order are unchanged.
top_1000_followerRank_normalized = _scaled_scores(followerRank_dict, top_1000_nodes, scaling_factor)
top_1000_tff_normalized = _scaled_scores(tff_dict, top_1000_nodes, scaling_factor)
top_1000_popularity_normalized = _scaled_scores(pop_dict, top_1000_nodes, scaling_factor)
top_1000_a_score_normalized = _scaled_scores(a_score_dict, top_1000_nodes, scaling_factor)
top_1000_retweet_impact_normalized = _scaled_scores(retweet_impact_dict, top_1000_nodes, scaling_factor)
top_1000_mention_impact_normalized = _scaled_scores(mention_impact_dict, top_1000_nodes, scaling_factor)
top_1000_pagerank_normalized = _scaled_scores(pagerank_dict, top_1000_nodes, scaling_factor)
top_1000_degc_normalized = _scaled_scores(degc_dict, top_1000_nodes, scaling_factor)
top_1000_betwc_normalized = _scaled_scores(betweeness_dict, top_1000_nodes, scaling_factor)
top_1000_eigc_normalized = _scaled_scores(eigenvector_dict, top_1000_nodes, scaling_factor)