# setup

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import scipy
import warnings
import csv
from IPython.display import display
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load IPython's autoreload extension so that edited project modules can be
# re-imported without restarting the kernel.
%load_ext autoreload
# Register the local modules with %aimport; under `%autoreload 1` (set at the
# end of this cell) only modules registered this way are reloaded.
%aimport networks
from networks import RAGraph
%aimport log_bin
# helper functions
%aimport utils
# NOTE(review): font_size is not referenced by the cells visible here --
# presumably consumed by plotting code elsewhere; confirm it is still needed.
font_size = 20
%autoreload 1

# Fixed N, varying m: KS comparison

## numerical data sets

In [3]:
# N and the list of m values select which files are read:
# data/ra/deg_dist<k>/<N>_<m>.txt for k = 1..20 and every m in M.
N = 10**6
M = [1, 2, 4, 8, 16, 32]

# Only the first CSV row of each file is used; every field in it is parsed
# as an int.  The nested lists are indexed [repeat][position of m in M].
collected_runs = []
for run_number in range(1, 21):
    folder = 'data/ra/deg_dist' + str(run_number)
    series_per_m = []
    for m in M:
        fn = '{0}/{1}_{2}.txt'.format(folder, N, m)
        with open(fn, 'r') as f:
            rows = list(csv.reader(f))
        series_per_m.append(list(map(int, rows[0])))
    collected_runs.append(series_per_m)
raw_degrees_all = np.array(collected_runs)

In [4]:
# Largest value in the series for repeat index 1 at M[5].
np.max(raw_degrees_all[1][5])

506

## synthetic datasets

In [5]:
# Read 25 synthetic samples for every m in M from
# data/ra/synthetic/<N>_<m>_<sample>.csv.  Only the first CSV row is used and
# its last field is dropped before the int conversion (presumably an empty
# field left by a trailing comma -- TODO confirm against the file writer).
folder = "data/ra/synthetic"
synthetic = []
for m in M:
    samples_for_m = []
    for sample_id in range(25):
        fn = "{0}/{1}_{2}_{3}.csv".format(folder, N, m, sample_id)
        with open(fn, 'r') as f:
            rows = list(csv.reader(f))
        samples_for_m.append(list(map(int, rows[0][:-1])))
    synthetic.append(samples_for_m)
# Keep both views: `synthetic` stays a nested list, `synthetic_degrees` is
# the numpy conversion of it.
synthetic_degrees = np.array(synthetic)

In [6]:
# For the first repeat, print the two-sample KS statistic between each
# observed series and sample 0 of the synthetic set at the same position.
for m_index, observed in enumerate(raw_degrees_all[0]):
    ks_result = scipy.stats.ks_2samp(observed, synthetic_degrees[m_index][0])
    # Element 0 of the result is the KS statistic (element 1 is the p-value).
    print(ks_result[0])

0.001024
0.000496
0.000795
0.001498
0.000684
0.000695


In [7]:
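# NOTE(review): the export below targets 'data/ra/ks_test.csv' while the
# read-back uses 'data/ba/ks_test.csv' -- confirm which directory is intended.
# temp = np.array(plist_all)
# pd.DataFrame(temp, columns=M).to_csv('data/ra/ks_test.csv')
# pd.read_csv('data/ba/ks_test.csv', index_col=0)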

In [8]:
# Load the stored p-value table (first CSV column becomes the index) and
# turn the column labels, which are read as strings, into ints.
# NOTE(review): this path is under 'data/ba', whereas the other inputs of this
# section are read from 'data/ra' -- confirm the directory is intended.
plist_df = (
    pd.read_csv('data/ba/ks_test.csv', index_col=0)
    .rename(columns=int)
)

In [9]:
# Mean of every column, taken down the rows.
plist_df.mean(axis=0)

1     0.714
2     0.446
4     0.502
8     0.580
16    0.706
32    0.456
dtype: float64

## power law comparison

In [10]:
# Read 25 synthetic samples for every m in M from
# data/ba/synthetic/<N>_<m>_<sample>.csv and rebind `synthetic` /
# `synthetic_degrees` to them.  Only the first CSV row is used and its last
# field is dropped before the int conversion (presumably an empty field left
# by a trailing comma -- TODO confirm against the file writer).
folder = "data/ba/synthetic"
synthetic = []
for m in M:
    samples_for_m = []
    for sample_id in range(25):
        fn = "{0}/{1}_{2}_{3}.csv".format(folder, N, m, sample_id)
        with open(fn, 'r') as f:
            rows = list(csv.reader(f))
        samples_for_m.append(list(map(int, rows[0][:-1])))
    synthetic.append(samples_for_m)
# Keep both views: `synthetic` stays a nested list, `synthetic_degrees` is
# the numpy conversion of it.
synthetic_degrees = np.array(synthetic)

In [11]:
# Empirical p-values.  For every repeat and every m: the fraction of synthetic
# samples whose KS distance to the reference sample is strictly larger than
# the KS distance of the observed data to that same reference.
#
# For each m the reference is synthetic sample 0 and the remaining samples
# form the null set.  The reference is deliberately left out of its own null
# set: its KS distance to itself is exactly 0, which can never exceed the
# observed statistic, so counting it only inflated the denominator and capped
# every p-value at (n - 1) / n.
#
# The null KS statistics depend only on m, not on the repeat, so they are
# computed once up front instead of once per repeat.
null_ks_by_m = []
for samples in synthetic_degrees:
    reference = samples[0]
    null_ks_by_m.append(
        # Element 0 of the ks_2samp result is the KS statistic.
        [scipy.stats.ks_2samp(sample, reference)[0] for sample in samples[1:]]
    )

plist_all = []
for repeat in range(20):
    print(repeat)  # progress indicator
    raw_degrees = raw_degrees_all[repeat]
    plist = []
    for i, d in enumerate(raw_degrees):
        # KS distance between the observed data and the reference sample.
        ks = scipy.stats.ks_2samp(d, synthetic_degrees[i][0])[0]
        null_ks = null_ks_by_m[i]
        # Number of null samples lying strictly further from the reference.
        count = sum(1 for synthetic_ks in null_ks if synthetic_ks > ks)
        plist.append(count / len(null_ks))
    plist_all.append(plist)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [12]:
# Display the computed p-values: one inner list per repeat, one entry per
# series of that repeat.
plist_all

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]