In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
# %matplotlib widget

In [2]:
# Social-network edge list: two space-separated columns, labelled A and B.
df_social = pd.read_table(
    './data/higgs-social_network.edgelist',
    sep=' ',
    names=['A', 'B'],
)

In [3]:
# Mention-network edge list: three space-separated columns, labelled A, B and w.
df_mention = pd.read_table(
    './data/higgs-mention_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [4]:
# Retweet-network edge list: three space-separated columns, labelled A, B and w.
df_retweet = pd.read_table(
    './data/higgs-retweet_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [5]:
# Reply-network edge list: three space-separated columns, labelled A, B and w.
df_reply = pd.read_table(
    './data/higgs-reply_network.edgelist',
    sep=' ',
    names=['A', 'B', 'w'],
)

In [6]:
import time

In [7]:
# Build the directed social graph: each (A, B) row of df_social becomes an edge A -> B.
G_social = nx.from_pandas_edgelist(
    df_social,
    source='A',
    target='B',
    create_using=nx.DiGraph,
)
# Previously recorded runtime:
#   479.2698624134064 seconds
#   7.98783104022344 min

In [8]:
import pickle

In [9]:
# Persist the directed graph so later sessions can reload it instead of rebuilding it.
with open('./pickles/G_social_DiGraph.pickle', 'wb') as pickle_out:
    pickle.dump(G_social, pickle_out)

In [10]:
import metrics

In [11]:
# Compute every metric in a single call from the social graph and the
# mention / reply / retweet edge tables (passed in that order).
all_metrics = metrics.all_metrics(
    G_social,
    df_mention,
    df_reply,
    df_retweet,
)
# Previously recorded runtime:
#   69.0646619796753 seconds
#   1.1510776996612548 min

In [12]:
# Persist the computed metrics to disk.
with open('./pickles/all_metrics.pickle', 'wb') as pickle_out:
    pickle.dump(all_metrics, pickle_out)

In [13]:
# Show which metric names all_metrics contains.
all_metrics.keys()

dict_keys(['f1', 'f3', 'm1', 'm2', 'm3', 'm4', 'rt1', 'rt2', 'rt3', 'rp1', 'rp3'])

In [14]:
# Collect the metrics in a dataframe: a nodeId column taken from G_social.nodes()
# plus one column per key of all_metrics.
df_metrics = pd.DataFrame()
df_metrics['nodeId'] = G_social.nodes()

from utilities import list_of_values

# NOTE(review): each metric column is filled positionally from
# list_of_values(...). This presumably relies on every metric dict listing its
# values in the same order as G_social.nodes() -- confirm against
# utilities.list_of_values and metrics.all_metrics.
for key in all_metrics.keys():
    df_metrics[key] = list_of_values(all_metrics[key])
df_metrics

Unnamed: 0,nodeId,f1,f3,m1,m2,m3,m4,rt1,rt2,rt3,rp1,rp3
1,1,16280,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,4707,77,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,1.0
3,3,137,25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,8643,402,7.0,6.0,106.0,104.0,7.0,86.0,77.0,1.0,3.0
5,5,2194,58,0.0,0.0,4.0,4.0,0.0,24.0,24.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
456622,456622,0,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
456623,456623,0,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456624,456624,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456625,456625,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Persist the metrics dataframe to disk.
with open('./pickles/df_metrics.pickle', 'wb') as pickle_out:
    pickle.dump(df_metrics, pickle_out)

In [16]:
import measures

In [17]:
# Betweenness centrality estimated from k = 1 sampled pivot node.
# The pivot is drawn at random, so a fixed seed is passed to make the estimate
# (and the pickle written from it) repeatable across kernel restarts.
start = time.time()
betweeness_dict = nx.betweenness_centrality(G_social, k = 1, seed = 42)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')
# Runtime
# 98.08658075332642 seconds
# 1.6347763458887736 min

68.93601298332214 seconds
1.1489335497220357 min


In [18]:
# Persist the k = 1 betweenness estimate to disk.
with open('./pickles/betweeness_dict_k1.pickle', 'wb') as pickle_out:
    pickle.dump(betweeness_dict, pickle_out)

In [19]:
# Betweenness centrality estimated from k = 10 sampled pivot nodes.
# This rebinds betweeness_dict, replacing the k = 1 estimate held under the same name.
# The pivots are drawn at random, so a fixed seed is passed to make the estimate
# repeatable across kernel restarts.
start = time.time()
betweeness_dict = nx.betweenness_centrality(G_social, k = 10, seed = 42)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')
# prev Runtime
# 590.0320670604706 seconds
# 9.833867784341177 min

578.2995557785034 seconds
9.638325929641724 min


In [20]:
# Persist the k = 10 betweenness estimate to disk.
# NOTE(review): this file name has no underscore before "k10", unlike
# 'betweeness_dict_k1.pickle'; left as is because other code may load this path.
with open('./pickles/betweeness_dictk10.pickle', 'wb') as pickle_out:
    pickle.dump(betweeness_dict, pickle_out)

In [21]:
# start = time.time()
# closeness_dict = nx.closeness_centrality(G_social)
# end = time.time()
# print(f'{end - start} seconds')
# print(f'{(end - start)/60.} min')

In [22]:
# start = time.time()

# katz_dict = nx.katz_centrality(G_social, max_iter=5, tol=1e-02)

# end = time.time()
# print(f'{end - start} seconds')
# print(f'{(end - start)/60.} min')

In [23]:
# Eigenvector centrality of the social graph with tol=1e-02.
# NOTE(review): networkx appears to test convergence against
# number_of_nodes * tol, so on a graph of this size the power iteration may stop
# almost immediately. In the z-score table displayed later, eigenvectorc_zscore
# equals indegc_zscore on every visible row, which is consistent with that --
# confirm the result is converged before relying on it.
start = time.time()
eigenvector_dict = nx.eigenvector_centrality(
    G_social,
    tol=1e-02,
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

# Runtime
# 20.315030813217163 seconds
# 0.3385838468869527 min

8.479838848114014 seconds
0.14133064746856688 min


In [24]:
# Persist the eigenvector centrality values to disk.
with open('./pickles/eigenvector_dict_tol1e02.pickle', 'wb') as pickle_out:
    pickle.dump(eigenvector_dict, pickle_out)

In [25]:
# PageRank of the social graph with tol=1e-02.
# NOTE(review): networkx appears to test convergence against
# number_of_nodes * tol, so on a graph of this size the iteration may stop almost
# immediately. The rows displayed at the end of the centralities table all share
# the same pagerankc value -- confirm the result is converged before relying on it.
start = time.time()
pagerank_dict = nx.pagerank(
    G_social,
    tol=1e-02,
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

# prev Runtime
# 351.44464778900146 seconds
# 5.857410796483358 min

570.1673040390015 seconds
9.502788400650024 min


In [26]:
# Persist the PageRank values to disk.
with open('./pickles/pagerank_dict_tol1e02.pickle', 'wb') as pickle_out:
    pickle.dump(pagerank_dict, pickle_out)

In [27]:
# Degree centrality of every node in G_social, with wall-clock timing.
start = time.time()
degc_dict = nx.degree_centrality(G_social)
end = time.time()
# Report the elapsed time in seconds and in minutes.
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

1.5099093914031982 seconds
0.025165156523386637 min


In [28]:
# FollowerRank: measures.follower_rank applied to the graph and the f1 / f3 metrics.
start = time.time()
followerRank_dict = measures.follower_rank(
    G_social,
    *(all_metrics[name] for name in ('f1', 'f3')),
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.5522477626800537 seconds
0.009204129378000895 min


In [29]:
# TFF: measures.tff applied to the graph and the f1 / f3 metrics.
start = time.time()
tff_dict = measures.tff(
    G_social,
    *(all_metrics[name] for name in ('f1', 'f3')),
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.2720162868499756 seconds
0.004533604780832926 min


In [30]:
# Popularity: measures.popularity applied to the graph and the f1 metric.
start = time.time()
pop_dict = measures.popularity(G_social, all_metrics['f1'])
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.7399075031280518 seconds
0.012331791718800863 min


In [31]:
# A-score: measures.a_score applied to the graph and the f1, m4, rp3 and rt3
# metrics (passed in that order).
start = time.time()
a_score_dict = measures.a_score(
    G_social,
    *(all_metrics[name] for name in ('f1', 'm4', 'rp3', 'rt3')),
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.9701828956604004 seconds
0.01616971492767334 min


In [32]:
# RI (retweet impact): measures.retweet_impact applied to the graph and the
# rt2 / rt3 metrics.
start = time.time()
retweet_impact_dict = measures.retweet_impact(
    G_social,
    *(all_metrics[name] for name in ('rt2', 'rt3')),
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.22511911392211914 seconds
0.003751985232035319 min


In [33]:
# MI (mention impact): measures.mention_impact applied to the graph and the
# m1, m2, m3 and m4 metrics (passed in that order).
start = time.time()
mention_impact_dict = measures.mention_impact(
    G_social,
    *(all_metrics[name] for name in ('m1', 'm2', 'm3', 'm4')),
)
end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.3101644515991211 seconds
0.005169407526652018 min


In [34]:
# In-degree centrality of every node in G_social, with wall-clock timing.
start = time.time()

in_degc_dict = nx.in_degree_centrality(G_social)

end = time.time()
# Report the elapsed time in seconds and in minutes.
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.2844107151031494 seconds
0.00474017858505249 min


In [35]:
# Assemble every centrality / influence measure into one dataframe:
# a nodeId column plus one column per measure.
df_social_centralities = pd.DataFrame()

# Column names; position i here must describe dicts[i] below.
cols = ['degc', 
        'indegc',
        'betwc',
        'eigenvectorc',
        'pagerankc',
        'follower_rank',
        'tff',
        'popularity',
        'a_score',
        'retweet_imp',
        'mention_imp']

# Source dicts, in the same order as cols. betweeness_dict was last assigned by
# the k = 10 run, so that estimate is the one stored under 'betwc'.
dicts = [degc_dict, 
         in_degc_dict,
         betweeness_dict,
         eigenvector_dict,
         pagerank_dict,
         followerRank_dict,
         tff_dict,
         pop_dict,
         a_score_dict,
         retweet_impact_dict,
         mention_impact_dict]

df_social_centralities['nodeId'] = G_social.nodes()
# NOTE(review): columns are filled positionally from list_of_values(...); this
# presumably relies on every dict listing its values in the same order as
# G_social.nodes() -- confirm against utilities.list_of_values.
for col, dicti in zip(cols, dicts):
    df_social_centralities[col] = list_of_values(dicti)
df_social_centralities

Unnamed: 0,nodeId,degc,indegc,betwc,eigenvectorc,pagerankc,follower_rank,tff,popularity,a_score,retweet_imp,mention_imp
1,1,0.035701,0.035653,2.600515e-05,0.069014,3.023522e-02,0.998650,740.000000,1.0,16280.000000,1.000000,1.000000
2,2,0.010477,0.010308,7.525704e-05,0.019957,2.317112e-04,0.983905,61.129870,1.0,4710.000000,1.000000,1.000000
3,3,0.000355,0.000300,5.024058e-07,0.000585,5.675661e-06,0.845679,5.480000,1.0,137.000000,1.000000,1.000000
4,4,0.019808,0.018928,7.316571e-04,0.036641,3.825706e-04,0.955556,21.500000,1.0,8750.000169,373.567266,479.763119
5,5,0.004932,0.004805,9.724488e-06,0.009304,7.903862e-05,0.974245,37.827586,1.0,2199.000053,76.273292,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
456622,456622,0.000004,0.000000,0.000000e+00,0.000004,3.284964e-07,0.000000,0.000000,0.0,0.000000,1.000000,1.000000
456623,456623,0.000026,0.000000,0.000000e+00,0.000004,3.284964e-07,0.000000,0.000000,0.0,0.000000,1.000000,1.000000
456624,456624,0.000002,0.000000,0.000000e+00,0.000004,3.284964e-07,0.000000,0.000000,0.0,0.000000,1.000000,1.000000
456625,456625,0.000002,0.000000,0.000000e+00,0.000004,3.284964e-07,0.000000,0.000000,0.0,0.000000,1.000000,1.000000


In [36]:
# Show the column labels of the centralities dataframe.
df_social_centralities.columns

Index(['nodeId', 'degc', 'indegc', 'betwc', 'eigenvectorc', 'pagerankc',
       'follower_rank', 'tff', 'popularity', 'a_score', 'retweet_imp',
       'mention_imp'],
      dtype='object')

In [37]:
# Top 1000 nodes for each measure.
def _top_node_ids(measure, n=1000):
    """Return the set of nodeIds from the ``n`` rows with the largest ``measure``.

    Rows of df_social_centralities are sorted by ``measure`` in descending order
    and the first ``n`` are kept -- the same sort_values(...).head(n) selection
    used previously, so ties at the cut-off are resolved the same way.
    """
    ranked = df_social_centralities.sort_values(by = measure, ascending = False)
    return set(ranked.head(n).nodeId)


# One set per measure. top_mention_imp is computed once; the former second,
# identical assignment only repeated a full sort of the frame.
top_follower_rank = _top_node_ids('follower_rank')
top_tff = _top_node_ids('tff')
top_popularity = _top_node_ids('popularity')
top_a_score = _top_node_ids('a_score')
top_retweet_imp = _top_node_ids('retweet_imp')
top_mention_imp = _top_node_ids('mention_imp')
top_pagerank = _top_node_ids('pagerankc')
top_degc = _top_node_ids('degc')
top_betw = _top_node_ids('betwc')
top_eigc = _top_node_ids('eigenvectorc')
top_indegc = _top_node_ids('indegc')

In [38]:
# Union of nine of the per-measure top-1000 sets, combined in this order.
# NOTE(review): top_follower_rank and top_popularity are not part of this
# union -- confirm that the omission is intentional.
top_1000_nodes_for_each_measure = (
    top_degc
    | top_betw
    | top_eigc
    | top_pagerank
    | top_retweet_imp
    | top_indegc
    | top_tff
    | top_mention_imp
    | top_a_score
)

In [39]:
# Number of distinct nodes in the union of the per-measure top-1000 sets.
len(top_1000_nodes_for_each_measure)

3581

In [40]:
# Freeze the union into a list so the selected nodes have a fixed order.
top_1000_nodes = [*top_1000_nodes_for_each_measure]

In [41]:
# Restrict the social graph to the selected nodes, with wall-clock timing.
start = time.time()

G_top_1000_nodes = G_social.subgraph(top_1000_nodes)

end = time.time()
# Report the elapsed time in seconds and in minutes.
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

0.009026765823364258 seconds
0.00015044609705607096 min


In [42]:
# Spring-layout positions for the nodes of the selected subgraph.
# The layout starts from random positions, so a fixed seed is passed to keep the
# coordinates (and the pickle written from them) reproducible across runs.
start = time.time()

nd_positions = nx.spring_layout(G_top_1000_nodes, seed = 42)

end = time.time()
print(f'{end - start} seconds')
print(f'{(end - start)/60.} min')

84.26543521881104 seconds
1.4044239203135171 min


In [43]:
# Persist the layout positions to disk.
with open('./pickles/nd_positions.pickle', 'wb') as pickle_out:
    pickle.dump(nd_positions, pickle_out)

In [44]:
# Z-score normalisation of every measure, restricted to the selected nodes.
df = pd.DataFrame()
df['nodeId'] = top_1000_nodes

# Measure columns: every column of df_social_centralities except nodeId.
cols = [name for name in df_social_centralities.columns if name != 'nodeId']

# Source dicts, listed in the same order as cols.
dicts = [
    degc_dict,
    in_degc_dict,
    betweeness_dict,
    eigenvector_dict,
    pagerank_dict,
    followerRank_dict,
    tff_dict,
    pop_dict,
    a_score_dict,
    retweet_impact_dict,
    mention_impact_dict,
]

# Raw values, looked up per node so each row matches its nodeId.
for col, dicti in zip(cols, dicts):
    df[col] = [dicti[node] for node in top_1000_nodes]

# Add a <measure>_zscore column per measure, using the population standard
# deviation (ddof=0).
for col in cols:
    col_zscore = col + '_zscore'
    centered = df[col] - df[col].mean()
    df[col_zscore] = centered / df[col].std(ddof=0)
df

Unnamed: 0,nodeId,degc,indegc,betwc,eigenvectorc,pagerankc,follower_rank,tff,popularity,a_score,...,indegc_zscore,betwc_zscore,eigenvectorc_zscore,pagerankc_zscore,follower_rank_zscore,tff_zscore,popularity_zscore,a_score_zscore,retweet_imp_zscore,mention_imp_zscore
0,65536,0.000876,0.000795,4.592485e-08,0.001543,0.000062,0.907500,9.810811,1.0,372.000151,...,-0.333959,-0.236346,-0.333959,-0.091705,0.447262,-0.167347,0.059718,-0.329970,-0.029327,-0.052242
1,1,0.035701,0.035653,2.600515e-05,0.069014,0.030235,0.998650,740.000000,1.0,16280.000000,...,4.158847,-0.219675,4.158847,50.893946,0.828230,0.589924,0.059718,4.025314,-0.115891,-0.052242
2,2,0.010477,0.010308,7.525704e-05,0.019957,0.000232,0.983905,61.129870,1.0,4710.000000,...,0.892199,-0.188045,0.892199,0.194497,0.766600,-0.114124,0.059718,0.857686,-0.115891,-0.052242
3,4,0.019808,0.018928,7.316571e-04,0.036641,0.000383,0.955556,21.500000,1.0,8750.000169,...,2.003192,0.233500,2.003192,0.449416,0.648113,-0.155224,0.059718,1.963755,-0.014011,0.130454
4,5,0.004932,0.004805,9.724488e-06,0.009304,0.000079,0.974245,37.827586,1.0,2199.000053,...,0.182868,-0.230131,0.182868,-0.063487,0.726227,-0.138291,0.059718,0.170225,-0.095307,-0.052242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3576,65520,0.000561,0.000250,6.919265e-04,0.000487,0.000004,0.445312,0.802817,1.0,114.000002,...,-0.404243,0.207985,-0.404243,-0.189455,-1.484478,-0.176689,0.059718,-0.400605,-0.116165,-0.052242
3577,16368,0.001848,0.001152,6.575428e-04,0.002234,0.000017,0.623223,1.654088,1.0,527.000011,...,-0.287950,0.185903,-0.287950,-0.167997,-0.740892,-0.175806,0.059718,-0.287534,-0.113964,-0.052242
3578,40950,0.001629,0.001459,1.722673e-05,0.002827,0.000087,0.895161,8.538462,1.0,666.000000,...,-0.248433,-0.225313,-0.248433,-0.049743,0.395691,-0.168666,0.059718,-0.249478,-0.115891,-0.052242
3579,90107,0.002177,0.002155,1.344142e-06,0.004175,0.000199,0.989940,98.400000,1.0,987.000044,...,-0.158673,-0.235513,-0.158673,0.138547,0.791823,-0.075472,0.059718,-0.161595,-0.098962,-0.052242


In [45]:
# minmax normalization
from utilities import normalize_vector, factor_vector

In [46]:
# Normalise every measure over the selected nodes and rescale it by a common factor.
scaling_factor = 1000.0


def _normalized_and_scaled(measure_by_node):
    """Return one measure for the nodes in top_1000_nodes, normalised then scaled.

    Values are looked up in ``measure_by_node`` in the order of
    ``top_1000_nodes``, passed through ``normalize_vector`` and then through
    ``factor_vector(..., scaling_factor)``.
    """
    selected = [measure_by_node[node] for node in top_1000_nodes]
    return factor_vector(normalize_vector(selected), scaling_factor)


# One vector per measure; the variable names are unchanged.
top_1000_followerRank_normalized = _normalized_and_scaled(followerRank_dict)
top_1000_tff_normalized = _normalized_and_scaled(tff_dict)
top_1000_popularity_normalized = _normalized_and_scaled(pop_dict)
top_1000_a_score_normalized = _normalized_and_scaled(a_score_dict)
top_1000_retweet_impact_normalized = _normalized_and_scaled(retweet_impact_dict)
top_1000_mention_impact_normalized = _normalized_and_scaled(mention_impact_dict)
top_1000_pagerank_normalized = _normalized_and_scaled(pagerank_dict)
top_1000_degc_normalized = _normalized_and_scaled(degc_dict)
top_1000_betwc_normalized = _normalized_and_scaled(betweeness_dict)
top_1000_eigc_normalized = _normalized_and_scaled(eigenvector_dict)
top_1000_indegc_normalized = _normalized_and_scaled(in_degc_dict)