# Build the Graph

In [2]:
from utils.friendship import GraphBuilder

# Build the graph G from the user JSON file via GraphBuilder.
user = 'user.json'
data_dir = 'data/dataset'

# n_samples entries from the json file are requested from build_graph
# (120000 here, not 10 as an earlier version of this comment said).
# NOTE(review): `fields` is not passed to GraphBuilder nor read by any other
# visible cell -- presumably a leftover; confirm before removing.
fields = ['user_id','friends']
n_samples = 120000
G = GraphBuilder(user, data_dir).build_graph(n_samples)

[0] N(nodes): 0
[0] N(edges): 0

[10000] N(nodes): 572098
[10000] N(edges): 1016194
[10000] N(nodes)/N(edges): 0.5630

[20000] N(nodes): 850060
[20000] N(edges): 1682199
[20000] N(nodes)/N(edges): 0.5053

[30000] N(nodes): 1094474
[30000] N(edges): 2318126
[30000] N(nodes)/N(edges): 0.4721

[40000] N(nodes): 1298589
[40000] N(edges): 2885366
[40000] N(nodes)/N(edges): 0.4501

[50000] N(nodes): 1551141
[50000] N(edges): 3527089
[50000] N(nodes)/N(edges): 0.4398

[60000] N(nodes): 1773430
[60000] N(edges): 4131718
[60000] N(nodes)/N(edges): 0.4292

[70000] N(nodes): 1875605
[70000] N(edges): 4405655
[70000] N(nodes)/N(edges): 0.4257

[80000] N(nodes): 2065953
[80000] N(edges): 4963206
[80000] N(nodes)/N(edges): 0.4163

[90000] N(nodes): 2280149
[90000] N(edges): 5642561
[90000] N(nodes)/N(edges): 0.4041

[100000] N(nodes): 2455635
[100000] N(edges): 6230531
[100000] N(nodes)/N(edges): 0.3941

[110000] N(nodes): 2582623
[110000] N(edges): 6638750
[110000] N(nodes)/N(edges): 0.3890

[12000

In [4]:
import os
import pickle
from os.path import join

# `os` has to be imported in this cell: os.path.expanduser is called below and
# no earlier visible cell imports the module, so a fresh-kernel run would
# otherwise stop here with a NameError.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_graph.pkl')

# Serialize the graph G to ~/data/friends_graph.pkl.
with open(pkl, 'wb') as f:
    pickle.dump(G, f)

# Load the Graph

In [1]:
import pickle
import os
from os.path import join

# Path of the pickled graph under the current user's home directory.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_graph.pkl')
# Restore the pickled object into G.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself, never a pickle from an untrusted source.
with open(pkl, 'rb') as f:
    G = pickle.load(f)

# Degree Inspection

In [2]:
import numpy as np

# Degree of every node in G, sorted from highest to lowest.
# G.degree() yields (node, degree) pairs; only the degree is kept.
G_degrees = np.array(sorted((deg for _, deg in G.degree()), reverse=True))
print('Total number of nodes: %i' % G_degrees.shape[0])
# ndarray.max() reduces inside NumPy; the builtin max() would walk the array
# one Python object at a time, which is needlessly slow for millions of nodes.
print('maximum node degree: %i' % G_degrees.max())
# Report how many nodes have each degree from 0 through 10.
for d in range(11):
    n_nodes = (G_degrees == d).sum()
    s = 'The number of nodes with degree %i: %i ' % (d, n_nodes)
    print(s)

Total number of nodes: 2745604
maximum node degree: 8350
The number of nodes with degree 0: 0 
The number of nodes with degree 1: 1803117 
The number of nodes with degree 2: 421422 
The number of nodes with degree 3: 166610 
The number of nodes with degree 4: 85169 
The number of nodes with degree 5: 50730 
The number of nodes with degree 6: 33417 
The number of nodes with degree 7: 23312 
The number of nodes with degree 8: 17319 
The number of nodes with degree 9: 13231 
The number of nodes with degree 10: 10536 


# Get the top-K nodes with the highest degree centrality

In [5]:
import networkx as nx

K = 100000 # top_K
# Map every node of G to its degree centrality.
dc = nx.degree_centrality(G)
# Rank the (node, centrality) pairs from most to least central and keep the
# first K. The sort is stable, so tied nodes stay in dict order.
top_K = sorted(dc.items(), key=lambda pair: pair[1], reverse=True)[:K]

In [6]:
# Preview only the head of the ranking: echoing the entire list would flood
# the notebook output and bloat the saved file.
top_K[:20]

[('ZIOCmdFaMIF56FR-nWr_2A', 0.0030412262807113775),
 ('F_5_UNX-wrAFCXuAkBZRDw', 0.002923583635361704),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 0.002922855197929198),
 ('fgwI3rYHOv1ipfVfCSx7pg', 0.0028175959889321215),
 ('MeDuKsZcnI3IU2g7OlV-hQ', 0.0027356467777752283),
 ('5MCBLBxr10NLUKZ4AboAMg', 0.0025954225720178775),
 ('xsT4KZTu_KnOVavtuXn4RA', 0.002454105710111768),
 ('nkN_do3fJ9xekchVC-v68A', 0.002453377272679262),
 ('peuxbSQwXed-81cSqL7Ykw', 0.002443179148624182),
 ('YttDgOC9AlM4HcAlDsbB2A', 0.002422782900514022),
 ('1vXJWH7L0IMEz5-8aU3SOA', 0.002373249155103633),
 ('VHdY6oG2JPVNjihWhOooAQ', 0.002341926345505887),
 ('AHRrG3T1gJpHvtpZ-K0G_g', 0.0022785522888778896),
 ('9HGR8sU_zm15sI109H-SGQ', 0.0022734532268503496),
 ('dIIKEfOgo0KqUfGQvGikPg', 0.0022261047937374777),
 ('3zxy3LVBV3ttxoYbY4rQ8A', 0.002217363544547409),
 ('6tbXpUIU6upoeqWNDo9k_A', 0.0021933251092747202),
 ('w-w-k-QXosIKQ8HQVwU6IQ', 0.0021769352670433415),
 ('ACUVZ4SiN0gni7dzVDm9EQ', 0.002163094955825733),
 ('8DEyKVyplnOcSKx39va

In [7]:
import os
import pickle
from os.path import join

# `os` is imported here so the cell does not depend on another cell having
# imported it first (os.path.expanduser is used below).
home = os.path.expanduser('~')
# NOTE: the file name hard-codes 100000; keep it in sync with K if K changes.
pkl = join(home, 'data/friends_top100000.pkl')
with open(pkl, 'wb') as f:
    # Persist only the node ids of the ranking, as a set; the centrality
    # scores are dropped.
    pickle.dump({n for n, _ in top_K}, f)

In [5]:
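# NOTE(review): everything below is a commented-out GraphBuilder definition --
# presumably a draft/copy of utils.friendship.GraphBuilder imported in the
# first cell; confirm and delete this cell if it is redundant.
# import os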
# from os.path import join
# import json
# from utils.preprocess import JSONLoader

# class GraphBuilder(JSONLoader):
    
#     def __init__(self, json_file, dir_data):
#         super().__init__(json_file, dir_data, fields = ['user_id','friends'])
        
#     def build_graph(self, n_samples = 100, print_every = 10000, verbose = True):
#         '''
#         Incrementally build a friendship network
        
#         Args:
#             n_samples (int): the number of json user objects to use. -1 to use all samples.
#         '''
        
#         import networkx as nx
#         G = nx.Graph()

#         with open(self.dir_json) as f:
#             for i, line in enumerate(f):
#                 if i % print_every == 0 and verbose:
#                     print('[%i] N(nodes): %i' % (i ,len(G.nodes())))
#                     print('[%i] N(edges): %i' % (i, len(G.edges())))
#                     if i!= 0:
#                         print('[%i] N(nodes)/N(edges): %.4f' % (i, len(G.nodes()) / len(G.edges())))
#                     print()
#                 if i >= n_samples:
#                     break
#                 json_line = json.loads(line)
#                 user_id = json_line['user_id']
#                 friends = json_line['friends']
#                 edges = [(user_id, target) for target in friends]
#                 G.add_edges_from(edges)
                
#         return G