In [1]:
import random
import time
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

In [2]:
# Directory prefix prepended to each data file name passed to nx.read_edgelist.
FILE_PATH_PREFIX = './data/'
# Forwarded as the `comments` argument of nx.read_edgelist (comment marker).
comments = '#'
# Forwarded as the `delimiter` argument of nx.read_edgelist (tab-separated columns).
delimiter = '\t'

In [3]:
%%time
# Load the LSCC edge list as a directed graph; node labels are parsed as floats.
# NOTE(review): presumably LSCC = largest strongly connected component and the
# node ids are integral, in which case nodetype=int would be cheaper - confirm
# against the data file before changing.
lscc = nx.read_edgelist(
    FILE_PATH_PREFIX + 'pokec_edges_lscc.txt',
    create_using=nx.DiGraph(),
    nodetype=float,
    delimiter=delimiter,
    comments=comments,
)

CPU times: user 1min 49s, sys: 2.31 s, total: 1min 52s
Wall time: 1min 52s


In [4]:
%%time
# Load the LWCC edge list with the default graph type (no create_using is
# given, so edges are undirected); node labels are parsed as floats.
# NOTE(review): presumably LWCC = largest weakly connected component - confirm.
lwcc = nx.read_edgelist(
    FILE_PATH_PREFIX + 'soc-pokec-relationships_lwcc.tsv',
    nodetype=float,
    delimiter=delimiter,
    comments=comments,
)

CPU times: user 1min 20s, sys: 1.79 s, total: 1min 22s
Wall time: 1min 22s


In [6]:
# Ask the graph for its edge count directly instead of building/walking an
# edge container just to take its length.
lwcc.number_of_edges()

22301964

In [5]:
def getRandomShortestPathDistribution(ccGraph, numOfSamples, seed=None):
    """Sample shortest-path lengths between uniformly random node pairs.

    Source and target are drawn independently (with replacement) from the
    graph's nodes, so a pair may consist of the same node twice, which
    contributes a distance of 0.

    Args:
        ccGraph: graph object exposing ``nodes()`` and accepted by
            ``nx.shortest_path_length``.
        numOfSamples: number of (source, target) pairs to sample; values
            <= 0 yield an empty array.
        seed: optional seed for a dedicated ``random.Random`` generator so a
            run can be reproduced. When ``None`` (default) the module-level
            ``random`` state is used, exactly as before.

    Returns:
        ``np.ndarray`` with one shortest-path length per sampled pair.

    Raises:
        ValueError: if samples are requested from a graph without nodes.
        Any exception raised by ``nx.shortest_path_length`` (for example when
        the target cannot be reached from the source) propagates unchanged.
    """
    graphnodes = list(ccGraph.nodes())
    if numOfSamples > 0 and not graphnodes:
        # Previously this surfaced as an opaque "empty range" error from randint.
        raise ValueError("cannot sample node pairs from an empty graph")

    rng = random if seed is None else random.Random(seed)
    lastIndex = len(graphnodes) - 1
    shortestPaths = []
    for _ in range(numOfSamples):
        # Draw the source first, then the target, matching the original
        # order of random draws.
        source = graphnodes[rng.randint(0, lastIndex)]
        target = graphnodes[rng.randint(0, lastIndex)]
        shortestPaths.append(nx.shortest_path_length(ccGraph, source, target))
    return np.array(shortestPaths)

In [6]:
def getRandomSourcesSPDistribution(ccGraph, numOfSamples, seed=None):
    """Collect shortest-path lengths from randomly chosen source nodes.

    For every sampled source, the lengths returned by
    ``nx.single_source_shortest_path_length`` are appended to the result,
    excluding the entry for the source itself (distance 0).

    Args:
        ccGraph: graph object exposing ``nodes()`` and accepted by
            ``nx.single_source_shortest_path_length``.
        numOfSamples: number of source nodes to draw (with replacement).
        seed: optional seed for a dedicated ``random.Random`` generator so a
            run can be reproduced. When ``None`` (default) the module-level
            ``random`` state is used, exactly as before.

    Returns:
        ``np.ndarray`` with the concatenated path lengths of all sampled
        sources; empty when the graph has no nodes or no samples are drawn.

    Notes:
        Sources for which networkx raises ``nx.NetworkXException`` are
        skipped with a warning (best effort). Any other exception now
        propagates instead of being silently swallowed.
    """
    graphnodes = list(ccGraph.nodes())
    if not graphnodes:
        # Nothing to sample from; keep the historical "empty result" outcome.
        return np.array([])

    rng = random if seed is None else random.Random(seed)
    lastIndex = len(graphnodes) - 1
    shortestPaths = []
    for _ in range(numOfSamples):
        source = graphnodes[rng.randint(0, lastIndex)]
        try:
            # dict() accepts both a mapping and an iterable of
            # (node, length) pairs.
            lengths = dict(nx.single_source_shortest_path_length(ccGraph, source))
        except nx.NetworkXException as err:
            warnings.warn("skipping source %r: %s" % (source, err))
            continue
        # Drop the source by key rather than by position, and extend in
        # place so the accumulated list is not re-copied every iteration.
        shortestPaths.extend(
            length for node, length in lengths.items() if node != source)
    return np.array(shortestPaths)

In [7]:
def getStatisticsForDistribution(dist):
    """Summarise a sample of path lengths.

    Args:
        dist: non-empty array-like of numeric values.

    Returns:
        Tuple ``(mean, median, diameter, effective_diameter)`` where
        ``diameter`` is the largest value in ``dist`` and
        ``effective_diameter`` is its 90th percentile.

    Raises:
        ValueError: if ``dist`` is empty.
    """
    values = np.asarray(dist)
    if values.size == 0:
        raise ValueError("cannot compute statistics of an empty distribution")

    dist_mean = np.mean(values)
    # One percentile call yields both the median (50th) and the effective
    # diameter (90th).
    dist_median, dist_eff_diameter = np.percentile(values, [50, 90])
    # np.max is vectorized; the builtin max() iterates the array element by
    # element in Python.
    dist_diameter = np.max(values)
    return dist_mean, dist_median, dist_diameter, dist_eff_diameter

### Random Pairs LSCC Stats

In [12]:
%%time
# Statistics over 100000 random (source, target) pairs sampled from lscc.
# Printed in order: mean, median, largest sampled distance, 90th percentile.
mean, median, diameter, eff_diameter = getStatisticsForDistribution(
    getRandomShortestPathDistribution(lscc, 100000)
)
print(mean)
print(median)
print(diameter)
print(eff_diameter)

5.11009
5.0
11
6.0
CPU times: user 1min 48s, sys: 8 ms, total: 1min 48s
Wall time: 1min 48s


### Random Sources BFS LSCC Stats

In [9]:
%%time
# Statistics over the path lengths found from 100 random source nodes of lscc.
# Printed in order: mean, median, largest sampled distance, 90th percentile.
mean, median, diameter, eff_diameter = getStatisticsForDistribution(
    getRandomSourcesSPDistribution(lscc, 100)
)
print(mean)
print(median)
print(diameter)
print(eff_diameter)

5.09969330091
5.0
13
6.0
CPU times: user 3min 9s, sys: 324 ms, total: 3min 9s
Wall time: 3min 9s


### Random Pairs LWCC Stats

In [13]:
%%time
# Statistics over 100000 random (source, target) pairs sampled from lwcc.
# Printed in order: mean, median, largest sampled distance, 90th percentile.
mean, median, diameter, eff_diameter = getStatisticsForDistribution(
    getRandomShortestPathDistribution(lwcc, 100000)
)
print(mean)
print(median)
print(diameter)
print(eff_diameter)

4.68295
5.0
9
6.0
CPU times: user 1min 26s, sys: 0 ns, total: 1min 26s
Wall time: 1min 26s


### Random Sources BFS LWCC Stats

In [10]:
%%time
# Statistics over the path lengths found from 100 random source nodes of lwcc.
# Printed in order: mean, median, largest sampled distance, 90th percentile.
mean, median, diameter, eff_diameter = getStatisticsForDistribution(
    getRandomSourcesSPDistribution(lwcc, 100)
)
print(mean)
print(median)
print(diameter)
print(eff_diameter)

4.47349788891
4.0
10
5.0
CPU times: user 5min 7s, sys: 564 ms, total: 5min 8s
Wall time: 5min 8s
