In [1]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

In [2]:
autoreload 2

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import pyreto

<h2> Forest fires </h2>

In [5]:
# Download the forest-fire data (requires network access). `names=` supplies
# the label for the single column, so the file is read as header-less.
fire_size = pd.read_csv(
    "http://tuvalu.santafe.edu/~aaronc/powerlaws/data/fires.txt",
    names=['acres'],
)

In [6]:
# Summary statistics of the fire sizes, as a quick sanity check of the load.
fire_size.describe()

Unnamed: 0,acres
count,203785.0
mean,89.563111
std,2098.732181
min,0.1
25%,0.1
50%,0.2
75%,2.0
max,412050.0


In [7]:
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.2, 6324
result1 = pyreto.distributions.Pareto.fit(fire_size.acres, xmin=desired_xmin)
# ...and actually perform the check announced above, with the same one-decimal
# tolerance used for the other forest-fire fits.
np.testing.assert_almost_equal(result1.params['alpha'], desired_alpha, decimal=1)

In [9]:
# Fit again with xmin left free (xmin=None), so that both alpha and xmin are
# estimated; the xmin search uses brute-force minimization.
# NOTE(review): quantile=0.999 presumably caps the xmin search range -- confirm.
result2 = pyreto.distributions.Pareto.fit(
    fire_size.acres,
    xmin=None,
    quantile=0.999,
    method='brute',
)

In [10]:
# alpha from the brute-force fit must match the reference value to one decimal
np.testing.assert_almost_equal(
    result2.params['alpha'],
    desired_alpha,
    decimal=1,
)

In [11]:
# xmin from the brute-force fit must match the reference value to one decimal
np.testing.assert_almost_equal(
    result2.xmin,
    desired_xmin,
    decimal=1,
)

In [12]:
# Same free-xmin fit as before, but the xmin search now uses bounded
# minimization instead of brute force.
result3 = pyreto.distributions.Pareto.fit(
    fire_size.acres,
    xmin=None,
    quantile=0.999,
    method='bounded',
)

In [13]:
# alpha from the bounded fit must match the reference value to one decimal
np.testing.assert_almost_equal(
    result3.params['alpha'],
    desired_alpha,
    decimal=1,
)

In [14]:
# xmin from the bounded fit is compared with the reference value to one decimal.
# NOTE: in the run recorded in this notebook this check fails -- the bounded
# search returned an xmin of about 7150 instead of 6324.
np.testing.assert_almost_equal(
    result3.xmin,
    desired_xmin,
    decimal=1,
)

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 7149.9999509662648
 DESIRED: 6324

In [36]:
# Goodness-of-fit test for the bounded-minimization fit of the fire data.
# NOTE(review): 42 is presumably a random seed -- confirm against pyreto.
# NOTE(review): the fit used quantile=0.999 but no quantile is passed here,
# whereas the cities goodness-of-fit call does pass one -- confirm the default
# is what is wanted.
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(
    42,
    result3,
    fire_size.acres,
    method='bounded',
)

In [38]:
# pareto distribution should be rejected...
# i.e. the goodness-of-fit p-value must be at most 0.10
assert pvalue <= 0.10

<h2> Weblinks </h2>

In [20]:
# Download the tab-separated weblinks histogram (requires network access);
# the first row of the file provides the column names.
weblinks_histogram = pd.read_csv(
    'http://tuvalu.santafe.edu/~aaronc/powerlaws/data/weblinks.hist',
    sep='\t',
)

In [21]:
# Summary statistics of the histogram columns, as a quick sanity check of the load.
weblinks_histogram.describe()

Unnamed: 0,degree,frequency
count,14480.0,14480.0
mean,15499.9,19101.43
std,36571.73,1022556.0
min,0.0,1.0
25%,3619.75,1.0
50%,7468.5,2.0
75%,14750.5,11.0
max,1199466.0,106649800.0


In [22]:
# Expand the histogram into a flat series of observations: every degree value
# is repeated as many times as its frequency. The recorded summary shows about
# 2.8e8 entries, so this step is memory-hungry.
raw_counts = np.repeat(
    weblinks_histogram['degree'].values,
    weblinks_histogram['frequency'].values,
)
weblinks = pd.Series(data=raw_counts, name='count')

In [23]:
# Summary statistics of the expanded degree series.
weblinks.describe()

count    2.765887e+08
mean     7.990814e+00
std      3.054431e+02
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      4.000000e+00
max      1.199466e+06
Name: count, dtype: float64

In [10]:
# Reference values for the weblinks data; alpha is then estimated with xmin
# pinned to the reference threshold.
desired_alpha = 2.336
desired_xmin = 3684
result1 = pyreto.distributions.Pareto.fit(
    weblinks,
    xmin=desired_xmin,
)

In [None]:
# alpha estimated at the reference xmin must match the reference to three decimals
np.testing.assert_almost_equal(
    result1.params['alpha'],
    desired_alpha,
    decimal=3,
)

In [14]:
# Fit with xmin left free (xmin=None), so that both alpha and xmin are
# estimated; the xmin search uses bounded minimization.
result2 = pyreto.distributions.Pareto.fit(
    weblinks,
    xmin=None,
    quantile=0.9999,
    method='bounded',
)

AssertionError: 
Arrays are not almost equal to 3 decimals
 ACTUAL: 2.3261940228265701
 DESIRED: 2.336

In [None]:
# alpha from the bounded fit must match the reference value to three decimals
np.testing.assert_almost_equal(
    result2.params['alpha'],
    desired_alpha,
    decimal=3,
)

In [17]:
# xmin from the bounded fit is compared with the reference value to one decimal.
# The helper test_scaling_threshold_estimation that used to be called here is
# not defined anywhere in this notebook, so the check is spelled out with numpy,
# exactly as in the forest-fire section.
# NOTE: the run recorded in this notebook shows this check failing (xmin of
# about 3214 instead of 3684).
np.testing.assert_almost_equal(result2.xmin, desired_xmin, decimal=1)

AssertionError: 
Arrays are not almost equal to 1 decimals
 ACTUAL: 3213.9999650697919
 DESIRED: 3684

<h2> Cities </h2>

In [6]:
# Download the city populations (requires network access); `names=` supplies
# the label for the single column, so the file is read as header-less.
cities = pd.read_csv(
    'http://tuvalu.santafe.edu/~aaronc/powerlaws/data/cities.txt',
    names=['population'],
)
# CSN units are in thousands of persons
cities['population'] = cities['population'] / 1e3

In [7]:
# Summary statistics of the rescaled populations (after the division by 1e3).
cities.describe()

Unnamed: 0,population
count,19447.0
mean,9.002051
std,77.825051
min,0.001
25%,0.3695
50%,1.089
75%,4.1355
max,8008.654


In [8]:
# check that I get same estimate for alpha given reported xmin...
desired_alpha, desired_xmin = 2.37, 52.46
result1 = pyreto.distributions.Pareto.fit(cities.population, xmin=desired_xmin)
# The helper test_scaling_exponent_estimation is not defined anywhere in this
# notebook, so the two-decimal alpha check is spelled out with numpy instead.
np.testing.assert_almost_equal(result1.params['alpha'], desired_alpha, decimal=2)

In [9]:
# check that I get the same estimates for both alpha and xmin using brute force minimization
result2 = pyreto.distributions.Pareto.fit(cities.population, xmin=None, quantile=0.99, method='brute')
# The helper test_scaling_exponent_estimation is not defined anywhere in this
# notebook, so the two-decimal alpha check is spelled out with numpy instead.
# NOTE: the run recorded in this notebook reports alpha of about 2.364 against
# the reference 2.37 as a failure of this check.
np.testing.assert_almost_equal(result2.params['alpha'], desired_alpha, decimal=2)

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 2.3639368738287363
 DESIRED: 2.37

In [10]:
# xmin from the brute-force fit is compared with the reference value to two
# decimals. The helper test_scaling_threshold_estimation is not defined anywhere
# in this notebook, so the check is spelled out with numpy instead.
# NOTE: the run recorded in this notebook shows this check failing (xmin of
# about 51.44 instead of 52.46).
np.testing.assert_almost_equal(result2.xmin, desired_xmin, decimal=2)

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 51.442999999999998
 DESIRED: 52.46

In [11]:
# Free-xmin fit of the city populations, with the xmin search done by bounded
# minimization rather than brute force.
result3 = pyreto.distributions.Pareto.fit(
    cities.population,
    xmin=None,
    quantile=0.99,
    method='bounded',
)

In [12]:
# alpha from the bounded fit must match the reference value to two decimals.
# The helper test_scaling_exponent_estimation is not defined anywhere in this
# notebook, so the check is spelled out with numpy instead.
np.testing.assert_almost_equal(result3.params['alpha'], desired_alpha, decimal=2)

In [13]:
# xmin from the bounded fit is compared with the reference value to two
# decimals. The helper test_scaling_threshold_estimation is not defined anywhere
# in this notebook, so the check is spelled out with numpy instead.
# NOTE: the run recorded in this notebook shows this check failing (xmin of
# about 70.08 instead of 52.46).
np.testing.assert_almost_equal(result3.xmin, desired_xmin, decimal=2)

AssertionError: 
Arrays are not almost equal to 2 decimals
 ACTUAL: 70.075391716136934
 DESIRED: 52.46

In [19]:
# using brute force minimization to find xmin makes this test take a while!
# NOTE(review): 42 is presumably a random seed -- confirm against pyreto.
pvalue, Ds = pyreto.distributions.Pareto.test_goodness_of_fit(
    42,
    result2,
    cities.population,
    quantile=0.99,
    method='brute',
)

In [15]:
# pareto distribution should not be rejected...
# NOTE: in the run recorded in this notebook this assertion fails; the p-value
# displayed further down is about 0.027.
assert pvalue > 0.10

AssertionError: 

In [16]:
# Display the current goodness-of-fit p-value.
pvalue

0.026783583288262355

In [17]:
pyreto.distributions.Pareto.test_goodness_of_fit??