public keflavich /plfit

Subversion checkout URL

You can clone with HTTPS or Subversion.

first commit - all files copied from agpy

 ... ... @@ -0,0 +1,4 @@ 1 +#from plfit import discrete_likelihood 2 +from plfit import plfit 3 +from plfit import test_fitter,pl_inv,plexp_inv,plexp,plfit_lsq 4 +from plfit_py import plfit as plfit_py
 ... ... @@ -0,0 +1,423 @@ 1 +# intended to implement a power-law fitting routine as specified in..... 2 +# http://www.santafe.edu/~aaronc/powerlaws/ 3 +# 4 +# The MLE for the power-law alpha is very easy to derive given knowledge 5 +# of the lowest value at which a power law holds, but that point is  6 +# difficult to derive and must be acquired iteratively. 7 + 8 +""" 9 +plfit.py - a python power-law fitter based on code by Aaron Clauset 10 +http://www.santafe.edu/~aaronc/powerlaws/ 11 +http://arxiv.org/abs/0706.1062 "Power-law distributions in empirical data"  12 +Requires pylab (matplotlib), which requires numpy 13 + 14 +example use: 15 +from plfit import plfit 16 + 17 +MyPL = plfit(mydata) 18 +MyPL.plotpdf(log=True) 19 + 20 + 21 +""" 22 + 23 +import numpy  24 +import time 25 +import pylab 26 +try:  27 + import cplfit 28 + cyok=True 29 + print "Cython plfit module loaded successfully." 30 +except ImportError: 31 + print "Cython plfit module could not be loaded." 32 + cyok=False 33 + 34 +import numpy.random as npr 35 +from numpy import log,log10,sum,argmin,argmax,exp,min,max 36 + 37 +class plfit: 38 + """ 39 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 40 + from http://www.santafe.edu/~aaronc/powerlaws/ 41 + 42 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 43 + in empirical data" SIAM Review, to appear (2009). (arXiv:0706.1062) 44 + http://arxiv.org/abs/0706.1062 45 + 46 + The output "alpha" is defined such that p(x) ~ (x/xmin)^-alpha 47 + """ 48 + 49 + def __init__(self,x,**kwargs): 50 + """ 51 + Initializes and fits the power law. Can pass "quiet" to turn off  52 + output (except for warnings; "silent" turns off warnings) 53 + """ 54 + self.data = x 55 + self.plfit(**kwargs) 56 + 57 + 58 + def alpha_(self,x): 59 + def alpha(xmin,x=x): 60 + """ 61 + given a sorted data set and a minimum, returns power law MLE fit 62 + data is passed as a keyword parameter so that it can be vectorized 63 + """ 64 + x = x[x>=xmin] 65 + n = sum(x>=xmin) 66 + a = float(n) / sum(log(x/xmin)) 67 + return a 68 + return alpha 69 + 70 + def kstest_(self,x): 71 + def kstest(xmin,x=x): 72 + """ 73 + given a sorted data set and a minimum, returns power law MLE ks-test w/data 74 + data is passed as a keyword parameter so that it can be vectorized 75 + """ 76 + x = x[x>=xmin] 77 + n = float(len(x)) 78 + a = float(n) / sum(log(x/xmin)) 79 + cx = numpy.arange(n,dtype='float')/float(n) 80 + cf = 1-(xmin/x)**a 81 + ks = max(abs(cf-cx)) 82 + return ks 83 + return kstest 84 +  85 + 86 + # should probably use a decorator here 87 + def plfit(self,nosmall=True,finite=False,quiet=False,silent=False,usefortran=False,usecy=False, 88 + xmin=None): 89 + """ 90 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 91 + from http://www.santafe.edu/~aaronc/powerlaws/ 92 + 93 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 94 + in empirical data" SIAM Review, to appear (2009). (arXiv:0706.1062) 95 + http://arxiv.org/abs/0706.1062 96 + 97 + nosmall is on by default; it rejects low s/n points 98 + can specify xmin to skip xmin estimation 99 + 100 + There are 3 implementations of xmin estimation. The fortran version is fastest, the C (cython) 101 + version is ~10% slower, and the python version is ~3x slower than the fortran version. 102 + Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown 103 + reasons. 104 + """ 105 + x = self.data 106 + z = numpy.sort(x) 107 + t = time.time() 108 + xmins,argxmins = numpy.unique(z,return_index=True)#[:-1] 109 + t = time.time() 110 + if xmin is None: 111 + if usefortran: 112 + dat,av = fplfit.plfit(z,int(nosmall)) 113 + goodvals=dat>0 114 + sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins] 115 + dat = dat[goodvals] 116 + av = av[goodvals] 117 + if not quiet: print "FORTRAN plfit executed in %f seconds" % (time.time()-t) 118 + elif usecy and cyok: 119 + dat,av = cplfit.plfit_loop(z,nosmall=nosmall,zunique=xmins,argunique=argxmins) 120 + goodvals=dat>0 121 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins) 122 + dat = dat[goodvals] 123 + av = av[goodvals] 124 + if not quiet: print "CYTHON plfit executed in %f seconds" % (time.time()-t) 125 + else: 126 + av = numpy.asarray( map(self.alpha_(z),xmins) ,dtype='float') 127 + dat = numpy.asarray( map(self.kstest_(z),xmins),dtype='float') 128 + if nosmall: 129 + # test to make sure the number of data points is high enough 130 + # to provide a reasonable s/n on the computed alpha 131 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1) 132 + goodvals = sigma<0.1 133 + nmax = argmin(goodvals) 134 + dat = dat[:nmax] 135 + av = av[:nmax] 136 + if not quiet: print "PYTHON plfit executed in %f seconds" % (time.time()-t) 137 + self._av = av 138 + self._xmin_kstest = dat 139 + self._sigma = sigma 140 + xmin = xmins[argmin(dat)]  141 + z = z[z>=xmin] 142 + n = len(z) 143 + alpha = 1 + n / sum( log(z/xmin) ) 144 + if finite: 145 + alpha = alpha*(n-1.)/n+1./n 146 + if n < 50 and not finite and not silent: 147 + print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n 148 + ks = max(abs( numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) )) 149 + L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin)) 150 + #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum 151 + self._likelihood = L 152 + self._xmin = xmin 153 + self._xmins = xmins 154 + self._alpha= alpha 155 + self._alphaerr = (alpha-1)/numpy.sqrt(n) 156 + self._ks = ks # this ks statistic may not have the same value as min(dat) because of unique() 157 + self._ngtx = n 158 + 159 + if not quiet: 160 + print "xmin: %g n(>xmin): %i alpha: %g +/- %g Likelihood: %g ks: %g" % (xmin,n,alpha,self._alphaerr,L,ks) 161 + 162 + return xmin,alpha 163 + 164 + def alphavsks(self,autozoom=True,**kwargs): 165 + """ 166 + Plot alpha versus the ks value for derived alpha. This plot can be used 167 + as a diagnostic of whether you have derived the 'best' fit: if there are  168 + multiple local minima, your data set may be well suited to a broken  169 + powerlaw or a different function. 170 + """ 171 +  172 + pylab.plot(self._xmin_kstest,1+self._av,'.') 173 + pylab.errorbar([self._ks],self._alpha,yerr=self._alphaerr,fmt='+') 174 + 175 + ax=pylab.gca() 176 + ax.set_xlim(0.8*(self._ks),3*(self._ks)) 177 + ax.set_ylim((self._alpha)-5*self._alphaerr,(self._alpha)+5*self._alphaerr) 178 + ax.set_xlabel("KS statistic") 179 + ax.set_ylabel(r'$\alpha$') 180 + pylab.draw() 181 + 182 + return ax 183 + 184 + def plotcdf(self,x=None,xmin=None,alpha=None,**kwargs): 185 + """ 186 + Plots CDF and powerlaw 187 + """ 188 + if not(x): x=self.data 189 + if not(xmin): xmin=self._xmin 190 + if not(alpha): alpha=self._alpha 191 + 192 + x=numpy.sort(x) 193 + n=len(x) 194 + xcdf = numpy.arange(n,0,-1,dtype='float')/float(n) 195 + 196 + q = x[x>=xmin] 197 + fcdf = (q/xmin)**(1-alpha) 198 + nc = xcdf[argmax(x>=xmin)] 199 + fcdf_norm = nc*fcdf 200 + 201 + pylab.loglog(x,xcdf,marker='+',color='k',**kwargs) 202 + pylab.loglog(q,fcdf_norm,color='r',**kwargs) 203 + 204 + def plotpdf(self,x=None,xmin=None,alpha=None,nbins=50,dolog=True,dnds=False,**kwargs): 205 + """ 206 + Plots PDF and powerlaw. 207 + """ 208 + if not(x): x=self.data 209 + if not(xmin): xmin=self._xmin 210 + if not(alpha): alpha=self._alpha 211 + 212 + x=numpy.sort(x) 213 + n=len(x) 214 + 215 + pylab.gca().set_xscale('log') 216 + pylab.gca().set_yscale('log') 217 + 218 + if dnds: 219 + hb = pylab.histogram(x,bins=numpy.logspace(log10(min(x)),log10(max(x)),nbins)) 220 + h = hb[0] 221 + b = hb[1] 222 + db = hb[1][1:]-hb[1][:-1] 223 + h = h/db 224 + pylab.plot(b[:-1],h,drawstyle='steps-post',color='k',**kwargs) 225 + #alpha -= 1 226 + elif dolog: 227 + hb = pylab.hist(x,bins=numpy.logspace(log10(min(x)),log10(max(x)),nbins),log=True,fill=False,edgecolor='k',**kwargs) 228 + alpha -= 1 229 + h,b=hb[0],hb[1] 230 + else: 231 + hb = pylab.hist(x,bins=numpy.linspace((min(x)),(max(x)),nbins),fill=False,edgecolor='k',**kwargs) 232 + h,b=hb[0],hb[1] 233 + b = b[1:] 234 + 235 + q = x[x>=xmin] 236 + px = (alpha-1)/xmin * (q/xmin)**(-alpha) 237 + 238 + arg = argmin(abs(b-xmin)) 239 + plotloc = (b>xmin)*(h>0) 240 + norm = numpy.median( h[plotloc] / ((alpha-1)/xmin * (b[plotloc]/xmin)**(-alpha)) ) 241 + px = px*norm 242 + 243 + pylab.loglog(q,px,'r',**kwargs) 244 + pylab.vlines(xmin,0.1,max(px),colors='r',linestyle='dashed') 245 + 246 + pylab.gca().set_xlim(min(x),max(x)) 247 + 248 + def plotppf(self,x=None,xmin=None,alpha=None,dolog=True,**kwargs): 249 + if not(xmin): xmin=self._xmin 250 + if not(alpha): alpha=self._alpha 251 + if not(x): x=numpy.sort(self.data[self.data>xmin]) 252 + else: x=numpy.sort(x[x>xmin]) 253 + 254 + # N = M^(-alpha+1) 255 + # M = N^(1/(-alpha+1)) 256 +  257 + m0 = min(x) 258 + N = (1.0+numpy.arange(len(x)))[::-1] 259 + xmodel = m0 * N**(1/(1-alpha)) / max(N)**(1/(1-alpha)) 260 +  261 + if dolog: 262 + pylab.loglog(x,xmodel,'.',**kwargs) 263 + pylab.gca().set_xlim(min(x),max(x)) 264 + pylab.gca().set_ylim(min(x),max(x)) 265 + else: 266 + pylab.plot(x,xmodel,'.',**kwargs) 267 + pylab.plot([min(x),max(x)],[min(x),max(x)],'k--') 268 + 269 + def test_pl(self,niter=1e3,**kwargs): 270 + """ 271 + Monte-Carlo test to determine whether distribution is consistent with a power law 272 + 273 + Runs through niter iterations of a sample size identical to the input sample size. 274 + 275 + Will randomly select values from the data < xmin. The number of values selected will 276 + be chosen from a uniform random distribution with p(xmin)>100 is required to distinguish a PL from an exponential, 283 + and n(>xmin)>~300 is required to distinguish a log-normal distribution from a PL. 284 + For more details, see figure 4.1 and section 285 + 286 + **WARNING** This can take a very long time to run! Execution time scales as  287 + niter * setsize 288 + 289 + """ 290 + xmin = self._xmin 291 + alpha = self._alpha 292 + niter = int(niter) 293 + 294 + ntail = sum(self.data >= xmin) 295 + ntot = len(self.data) 296 + nnot = ntot-ntail # n(self._ks).sum() / float(niter) 317 + self._pval = p 318 + self._ks_rand = ksv 319 + 320 + print "p(%i) = %0.3f" % (niter,p) 321 + 322 + return p,ksv 323 + 324 +def plfit_lsq(x,y): 325 + """ 326 + Returns A and B in y=Ax^B 327 + http://mathworld.wolfram.com/LeastSquaresFittingPowerLaw.html 328 + """ 329 + n = len(x) 330 + btop = n * (log(x)*log(y)).sum() - (log(x)).sum()*(log(y)).sum() 331 + bbottom = n*(log(x)**2).sum() - (log(x).sum())**2 332 + b = btop / bbottom 333 + a = ( log(y).sum() - b * log(x).sum() ) / n 334 + 335 + A = exp(a) 336 + return A,b 337 + 338 +def plexp(x,xm=1,a=2.5): 339 + """ 340 + CDF(x) for the piecewise distribution exponential x=xmin 341 + This is the CDF version of the distributions drawn in fig 3.4a of Clauset et al. 342 + """ 343 + 344 + C = 1/(-xm/(1 - a) - xm/a + exp(a)*xm/a) 345 + Ppl = lambda(X): 1+C*(xm/(1-a)*(X/xm)**(1-a)) 346 + Pexp = lambda(X): C*xm/a*exp(a)-C*(xm/a)*exp(-a*(X/xm-1)) 347 + d=Ppl(x) 348 + d[x=Pxm] = xm*( (P[P>=Pxm]-1) * (1-a)/(C*xm) )**(1/(1-a)) # powerlaw 361 + x[P
 ... ... @@ -0,0 +1,425 @@ 1 +# intended to implement a power-law fitting routine as specified in..... 2 +# http://www.santafe.edu/~aaronc/powerlaws/ 3 +# 4 +# The MLE for the power-law alpha is very easy to derive given knowledge 5 +# of the lowest value at which a power law holds, but that point is  6 +# difficult to derive and must be acquired iteratively. 7 + 8 +""" 9 +plfit.py - a python power-law fitter based on code by Aaron Clauset 10 +http://www.santafe.edu/~aaronc/powerlaws/ 11 +http://arxiv.org/abs/0706.1062 "Power-law distributions in empirical data"  12 +Requires pylab (matplotlib), which requires numpy 13 + 14 +example use: 15 +from plfit import plfit 16 + 17 +MyPL = plfit(mydata) 18 +MyPL.plotpdf(log=True) 19 + 20 + 21 +""" 22 + 23 +import numpy  24 +import time 25 +import pylab 26 +try: 27 + import fplfit 28 + usefortran=True 29 + print "Fortran plfit module loaded successfully." 30 +except ImportError: 31 + print "Fortran module could not be loaded. plfit will load with the \ 32 + python module instead - it is fully functional but at least 4x \ 33 + slower for large arrays" 34 + usefortran=False 35 + 36 +import numpy.random as npr 37 +from numpy import log,log10,sum,argmin,argmax,exp,min,max 38 + 39 +class plfit: 40 + """ 41 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 42 + from http://www.santafe.edu/~aaronc/powerlaws/ 43 + 44 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 45 + in empirical data" SIAM Review, to appear (2009). (arXiv:0706.1062) 46 + http://arxiv.org/abs/0706.1062 47 + 48 + The output "alpha" is defined such that p(x) ~ (x/xmin)^-alpha 49 + """ 50 + 51 + def __init__(self,x,**kwargs): 52 + """ 53 + Initializes and fits the power law. Can pass "quiet" to turn off  54 + output (except for warnings; "silent" turns off warnings) 55 + """ 56 + self.data = x 57 + self.plfit(**kwargs) 58 + 59 + 60 + def alpha_(self,x): 61 + def alpha(xmin,x=x): 62 + """ 63 + given a sorted data set and a minimum, returns power law MLE fit 64 + data is passed as a keyword parameter so that it can be vectorized 65 + """ 66 + x = x[x>=xmin] 67 + n = sum(x>=xmin) 68 + a = float(n) / sum(log(x/xmin)) 69 + return a 70 + return alpha 71 + 72 + def kstest_(self,x): 73 + def kstest(xmin,x=x): 74 + """ 75 + given a sorted data set and a minimum, returns power law MLE ks-test w/data 76 + data is passed as a keyword parameter so that it can be vectorized 77 + """ 78 + x = x[x>=xmin] 79 + n = float(len(x)) 80 + a = float(n) / sum(log(x/xmin)) 81 + cx = numpy.arange(n,dtype='float')/float(n) 82 + cf = 1-(xmin/x)**a 83 + ks = max(abs(cf-cx)) 84 + return ks 85 + return kstest 86 +  87 + 88 + # should probably use a decorator here 89 + def plfit(self,nosmall=True,finite=False,quiet=False,silent=False,usefortran=usefortran,usecy=False, 90 + xmin=None): 91 + """ 92 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 93 + from http://www.santafe.edu/~aaronc/powerlaws/ 94 + 95 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 96 + in empirical data" SIAM Review, to appear (2009). (arXiv:0706.1062) 97 + http://arxiv.org/abs/0706.1062 98 + 99 + nosmall is on by default; it rejects low s/n points 100 + can specify xmin to skip xmin estimation 101 + 102 + There are 3 implementations of xmin estimation. The fortran version is fastest, the C (cython) 103 + version is ~10% slower, and the python version is ~3x slower than the fortran version. 104 + Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown 105 + reasons. 106 + """ 107 + x = self.data 108 + z = numpy.sort(x) 109 + t = time.time() 110 + xmins,argxmins = numpy.unique(z,return_index=True)#[:-1] 111 + t = time.time() 112 + if xmin is None: 113 + if usefortran: 114 + dat,av = fplfit.plfit(z,int(nosmall)) 115 + goodvals=dat>0 116 + sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins] 117 + dat = dat[goodvals] 118 + av = av[goodvals] 119 + if not quiet: print "FORTRAN plfit executed in %f seconds" % (time.time()-t) 120 + elif usecy and cyok: 121 + dat,av = cplfit.plfit_loop(z,nosmall=nosmall,zunique=xmins,argunique=argxmins) 122 + goodvals=dat>0 123 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins) 124 + dat = dat[goodvals] 125 + av = av[goodvals] 126 + if not quiet: print "CYTHON plfit executed in %f seconds" % (time.time()-t) 127 + else: 128 + av = numpy.asarray( map(self.alpha_(z),xmins) ,dtype='float') 129 + dat = numpy.asarray( map(self.kstest_(z),xmins),dtype='float') 130 + if nosmall: 131 + # test to make sure the number of data points is high enough 132 + # to provide a reasonable s/n on the computed alpha 133 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1) 134 + goodvals = sigma<0.1 135 + nmax = argmin(goodvals) 136 + dat = dat[:nmax] 137 + av = av[:nmax] 138 + if not quiet: print "PYTHON plfit executed in %f seconds" % (time.time()-t) 139 + self._av = av 140 + self._xmin_kstest = dat 141 + self._sigma = sigma 142 + xmin = xmins[argmin(dat)]  143 + z = z[z>=xmin] 144 + n = len(z) 145 + alpha = 1 + n / sum( log(z/xmin) ) 146 + if finite: 147 + alpha = alpha*(n-1.)/n+1./n 148 + if n < 50 and not finite and not silent: 149 + print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n 150 + ks = max(abs( numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) )) 151 + L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin)) 152 + #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum 153 + self._likelihood = L 154 + self._xmin = xmin 155 + self._xmins = xmins 156 + self._alpha= alpha 157 + self._alphaerr = (alpha-1)/numpy.sqrt(n) 158 + self._ks = ks # this ks statistic may not have the same value as min(dat) because of unique() 159 + self._ngtx = n 160 + 161 + if not quiet: 162 + print "xmin: %g n(>xmin): %i alpha: %g +/- %g Likelihood: %g ks: %g" % (xmin,n,alpha,self._alphaerr,L,ks) 163 + 164 + return xmin,alpha 165 + 166 + def alphavsks(self,autozoom=True,**kwargs): 167 + """ 168 + Plot alpha versus the ks value for derived alpha. This plot can be used 169 + as a diagnostic of whether you have derived the 'best' fit: if there are  170 + multiple local minima, your data set may be well suited to a broken  171 + powerlaw or a different function. 172 + """ 173 +  174 + pylab.plot(self._xmin_kstest,1+self._av,'.') 175 + pylab.errorbar([self._ks],self._alpha,yerr=self._alphaerr,fmt='+') 176 + 177 + ax=pylab.gca() 178 + ax.set_xlim(0.8*(self._ks),3*(self._ks)) 179 + ax.set_ylim((self._alpha)-5*self._alphaerr,(self._alpha)+5*self._alphaerr) 180 + ax.set_xlabel("KS statistic") 181 + ax.set_ylabel(r'$\alpha$') 182 + pylab.draw() 183 + 184 + return ax 185 + 186 + def plotcdf(self,x=None,xmin=None,alpha=None,**kwargs): 187 + """ 188 + Plots CDF and powerlaw 189 + """ 190 + if not(x): x=self.data 191 + if not(xmin): xmin=self._xmin 192 + if not(alpha): alpha=self._alpha 193 + 194 + x=numpy.sort(x) 195 + n=len(x) 196 + xcdf = numpy.arange(n,0,-1,dtype='float')/float(n) 197 + 198 + q = x[x>=xmin] 199 + fcdf = (q/xmin)**(1-alpha) 200 + nc = xcdf[argmax(x>=xmin)] 201 + fcdf_norm = nc*fcdf 202 + 203 + pylab.loglog(x,xcdf,marker='+',color='k',**kwargs) 204 + pylab.loglog(q,fcdf_norm,color='r',**kwargs) 205 + 206 + def plotpdf(self,x=None,xmin=None,alpha=None,nbins=50,dolog=True,dnds=False,**kwargs): 207 + """ 208 + Plots PDF and powerlaw. 209 + """ 210 + if not(x): x=self.data 211 + if not(xmin): xmin=self._xmin 212 + if not(alpha): alpha=self._alpha 213 + 214 + x=numpy.sort(x) 215 + n=len(x) 216 + 217 + pylab.gca().set_xscale('log') 218 + pylab.gca().set_yscale('log') 219 + 220 + if dnds: 221 + hb = pylab.histogram(x,bins=numpy.logspace(log10(min(x)),log10(max(x)),nbins)) 222 + h = hb[0] 223 + b = hb[1] 224 + db = hb[1][1:]-hb[1][:-1] 225 + h = h/db 226 + pylab.plot(b[:-1],h,drawstyle='steps-post',color='k',**kwargs) 227 + #alpha -= 1 228 + elif dolog: 229 + hb = pylab.hist(x,bins=numpy.logspace(log10(min(x)),log10(max(x)),nbins),log=True,fill=False,edgecolor='k',**kwargs) 230 + alpha -= 1 231 + h,b=hb[0],hb[1] 232 + else: 233 + hb = pylab.hist(x,bins=numpy.linspace((min(x)),(max(x)),nbins),fill=False,edgecolor='k',**kwargs) 234 + h,b=hb[0],hb[1] 235 + b = b[1:] 236 + 237 + q = x[x>=xmin] 238 + px = (alpha-1)/xmin * (q/xmin)**(-alpha) 239 + 240 + arg = argmin(abs(b-xmin)) 241 + plotloc = (b>xmin)*(h>0) 242 + norm = numpy.median( h[plotloc] / ((alpha-1)/xmin * (b[plotloc]/xmin)**(-alpha)) ) 243 + px = px*norm 244 + 245 + pylab.loglog(q,px,'r',**kwargs) 246 + pylab.vlines(xmin,0.1,max(px),colors='r',linestyle='dashed') 247 + 248 + pylab.gca().set_xlim(min(x),max(x)) 249 + 250 + def plotppf(self,x=None,xmin=None,alpha=None,dolog=True,**kwargs): 251 + if not(xmin): xmin=self._xmin 252 + if not(alpha): alpha=self._alpha 253 + if not(x): x=numpy.sort(self.data[self.data>xmin]) 254 + else: x=numpy.sort(x[x>xmin]) 255 + 256 + # N = M^(-alpha+1) 257 + # M = N^(1/(-alpha+1)) 258 +  259 + m0 = min(x) 260 + N = (1.0+numpy.arange(len(x)))[::-1] 261 + xmodel = m0 * N**(1/(1-alpha)) / max(N)**(1/(1-alpha)) 262 +  263 + if dolog: 264 + pylab.loglog(x,xmodel,'.',**kwargs) 265 + pylab.gca().set_xlim(min(x),max(x)) 266 + pylab.gca().set_ylim(min(x),max(x)) 267 + else: 268 + pylab.plot(x,xmodel,'.',**kwargs) 269 + pylab.plot([min(x),max(x)],[min(x),max(x)],'k--') 270 + 271 + def test_pl(self,niter=1e3,**kwargs): 272 + """ 273 + Monte-Carlo test to determine whether distribution is consistent with a power law 274 + 275 + Runs through niter iterations of a sample size identical to the input sample size. 276 + 277 + Will randomly select values from the data < xmin. The number of values selected will 278 + be chosen from a uniform random distribution with p(xmin)>100 is required to distinguish a PL from an exponential, 285 + and n(>xmin)>~300 is required to distinguish a log-normal distribution from a PL. 286 + For more details, see figure 4.1 and section 287 + 288 + **WARNING** This can take a very long time to run! Execution time scales as  289 + niter * setsize 290 + 291 + """ 292 + xmin = self._xmin 293 + alpha = self._alpha 294 + niter = int(niter) 295 + 296 + ntail = sum(self.data >= xmin) 297 + ntot = len(self.data) 298 + nnot = ntot-ntail # n(self._ks).sum() / float(niter) 319 + self._pval = p 320 + self._ks_rand = ksv 321 + 322 + print "p(%i) = %0.3f" % (niter,p) 323 + 324 + return p,ksv 325 + 326 +def plfit_lsq(x,y): 327 + """ 328 + Returns A and B in y=Ax^B 329 + http://mathworld.wolfram.com/LeastSquaresFittingPowerLaw.html 330 + """ 331 + n = len(x) 332 + btop = n * (log(x)*log(y)).sum() - (log(x)).sum()*(log(y)).sum() 333 + bbottom = n*(log(x)**2).sum() - (log(x).sum())**2 334 + b = btop / bbottom 335 + a = ( log(y).sum() - b * log(x).sum() ) / n 336 + 337 + A = exp(a) 338 + return A,b 339 + 340 +def plexp(x,xm=1,a=2.5): 341 + """ 342 + CDF(x) for the piecewise distribution exponential x=xmin 343 + This is the CDF version of the distributions drawn in fig 3.4a of Clauset et al. 344 + """ 345 + 346 + C = 1/(-xm/(1 - a) - xm/a + exp(a)*xm/a) 347 + Ppl = lambda(X): 1+C*(xm/(1-a)*(X/xm)**(1-a)) 348 + Pexp = lambda(X): C*xm/a*exp(a)-C*(xm/a)*exp(-a*(X/xm-1)) 349 + d=Ppl(x) 350 + d[x=Pxm] = xm*( (P[P>=Pxm]-1) * (1-a)/(C*xm) )**(1/(1-a)) # powerlaw 363 + x[P
 ... ... @@ -0,0 +1,851 @@ 1 +# -*- coding: latin-1 -*- 2 +#  3 +# intended to implement a power-law fitting routine as specified in..... 4 +# http://www.santafe.edu/~aaronc/powerlaws/ 5 +# 6 +# The MLE for the power-law alpha is very easy to derive given knowledge 7 +# of the lowest value at which a power law holds, but that point is  8 +# difficult to derive and must be acquired iteratively. 9 + 10 +""" 11 +plfit.py - a python power-law fitter based on code by Aaron Clauset 12 +http://www.santafe.edu/~aaronc/powerlaws/ 13 +http://arxiv.org/abs/0706.1062 "Power-law distributions in empirical data"  14 +Requires pylab (matplotlib), which requires numpy 15 + 16 +example use: 17 +from plfit import plfit 18 + 19 +MyPL = plfit(mydata) 20 +MyPL.plotpdf(log=True) 21 + 22 + 23 +""" 24 + 25 +import numpy  26 +# this may break downstream items numpy.seterr(all='ignore') # likely to cause failures otherwise 27 +import time 28 +import pylab 29 +try: 30 + import fplfit 31 + fortranOK = True 32 +except: 33 + fortranOK = False 34 +try: 35 + import cplfit 36 + cyOK = True 37 +except: 38 + cyOK = False 39 + 40 +import numpy.random as npr 41 +from numpy import log,log10,sum,argmin,argmax,exp,min,max 42 +try: 43 + import scipy.stats 44 + scipyOK = True 45 +except ImportError: 46 + scipyOK = False 47 + print "scipy didn't import. Can't compute certain basic statistics." 48 + 49 +class plfit: 50 + """ 51 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 52 + from http://www.santafe.edu/~aaronc/powerlaws/ 53 + 54 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 55 + in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062) 56 + http://arxiv.org/abs/0706.1062 57 + 58 + The output "alpha" is defined such that p(x) ~ (x/xmin)^-alpha 59 + """ 60 + 61 + def __init__(self,x,**kwargs): 62 + """ 63 + Initializes and fits the power law. Can pass "quiet" to turn off  64 + output (except for warnings; "silent" turns off warnings) 65 + """ 66 + x = numpy.array(x) # make sure x is an array, otherwise the next step fails  67 + if (x<0).sum() > 0: 68 + print "Removed %i negative points" % ((x<0).sum()) 69 + x = x[x>0] 70 + self.data = x 71 + self.plfit(**kwargs) 72 + 73 + 74 + def alpha_(self,x): 75 + def alpha(xmin,x=x): 76 + """ 77 + given a sorted data set and a minimum, returns power law MLE fit 78 + data is passed as a keyword parameter so that it can be vectorized 79 + 80 + if there is only one element, return alpha=0 81 + """ 82 + gexmin = x>=xmin 83 + n = gexmin.sum() 84 + if n < 2: 85 + return 0 86 + x = x[gexmin] 87 + a = float(n) / sum(log(x/xmin)) 88 + return a 89 + return alpha 90 + 91 + def kstest_(self,x): 92 + def kstest(xmin,x=x): 93 + """ 94 + given a sorted data set and a minimum, returns power law MLE ks-test w/data 95 + data is passed as a keyword parameter so that it can be vectorized 96 + 97 + The returned value is the "D" parameter in the ks test... 98 + """ 99 + x = x[x>=xmin] 100 + n = float(len(x)) 101 + if n == 0: return numpy.inf 102 + a = float(n) / sum(log(x/xmin)) 103 + cx = numpy.arange(n,dtype='float')/float(n) 104 + cf = 1-(xmin/x)**a 105 + ks = max(abs(cf-cx)) 106 + return ks 107 + return kstest 108 +  109 + 110 + def plfit(self, nosmall=True, finite=False, quiet=False, silent=False, 111 + usefortran=False, usecy=False, xmin=None, verbose=False,  112 + discrete=None, discrete_approx=True, discrete_n_alpha=1000): 113 + """ 114 + A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m 115 + from http://www.santafe.edu/~aaronc/powerlaws/ 116 + 117 + See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions 118 + in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062) 119 + http://arxiv.org/abs/0706.1062 120 + 121 + There are 3 implementations of xmin estimation. The fortran version is fastest, the C (cython) 122 + version is ~10% slower, and the python version is ~3x slower than the fortran version. 123 + Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown 124 + reasons. 125 + 126 + There is also a discrete version implemented in python - it is different from the continous version! 127 + *discrete* [ bool | None ] 128 + If *discrete* is None, the code will try to determine whether the 129 + data set is discrete or continous based on the uniqueness of the 130 + data. If *discrete* is True or False, the distcrete or continuous 131 + fitter will be used, respectively. 132 + 133 + *xmin* [ float / int ] 134 + If you specify xmin, the fitter will only determine alpha assuming 135 + the given xmin; the rest of the code (and most of the complexity) 136 + is determining an estimate for xmin and alpha. 137 + 138 + *nosmall* [ bool (True) ] 139 + When on, the code rejects low s/n points 140 + 141 + *finite* [ bool (False) ] 142 + There is a 'finite-size bias' to the estimator. The "alpha" the code measures 143 + is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or α = (1 + ᾶ (n-1)) / n 144 + 145 + *quiet* [ bool (False) ] 146 + If False, delivers messages about what fitter is used and the fit results 147 + 148 + *verbose* [ bool (False) ]  149 + Deliver descriptive messages about the fit parameters (only if *quiet*==False) 150 + 151 + *silent* [ bool (False) ]  152 + If True, will print NO messages 153 + """ 154 + x = self.data 155 + z = numpy.sort(x) 156 + t = time.time() 157 + xmins,argxmins = numpy.unique(z,return_index=True)#[:-1] 158 + self._nunique = len(xmins) 159 +  160 + if self._nunique == len(x) and discrete is None: 161 + if verbose: print "Using CONTINUOUS fitter" 162 + discrete = False 163 + elif self._nunique < len(x) and discrete is None: 164 + if verbose: print "Using DISCRETE fitter" 165 + discrete = True 166 + 167 + t = time.time() 168 + if xmin is None: 169 + if discrete: 170 + self.discrete_best_alpha( approximate=discrete_approx, 171 + n_alpha=discrete_n_alpha, verbose=verbose, finite=finite) 172 + return self._xmin,self._alpha 173 + elif usefortran and fortranOK: 174 + dat,av = fplfit.plfit(z,int(nosmall)) 175 + goodvals=dat>0 176 + sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins] 177 + dat = dat[goodvals] 178 + av = av[goodvals] 179 + if nosmall: 180 + # data, av a;ready treated for this. sigma, xmins not 181 + nmax = argmin(sigma<0.1) 182 + xmins = xmins[:nmax] 183 + sigma = sigma[:nmax] 184 + if not quiet: print "FORTRAN plfit executed in %f seconds" % (time.time()-t) 185 + elif usecy and cyOK: 186 + dat,av = cplfit.plfit_loop(z,nosmall=nosmall,zunique=xmins,argunique=argxmins) 187 + goodvals=dat>0 188 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins) 189 + dat = dat[goodvals] 190 + av = av[goodvals] 191 + if not quiet: print "CYTHON plfit executed in %f seconds" % (time.time()-t) 192 + else: 193 + av = numpy.asarray( map(self.alpha_(z),xmins) ,dtype='float') 194 + dat = numpy.asarray( map(self.kstest_(z),xmins),dtype='float') 195 + sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1) 196 + if nosmall: 197 + # test to make sure the number of data points is high enough 198 + # to provide a reasonable s/n on the computed alpha 199 + goodvals = sigma<0.1 200 + nmax = argmin(goodvals) 201 + if nmax > 0: 202 + dat = dat[:nmax] 203 + xmins = xmins[:nmax] 204 + av = av[:nmax] 205 + sigma = sigma[:nmax] 206 + else: 207 + if not silent:  208 + print "Not enough data left after flagging - using all positive data." 209 + if not quiet:  210 + print "PYTHON plfit executed in %f seconds" % (time.time()-t) 211 + if usefortran: print "fortran fplfit did not load" 212 + if usecy: print "cython cplfit did not load" 213 + self._av = av 214 + self._xmin_kstest = dat 215 + self._sigma = sigma 216 + xmin = xmins[argmin(dat)]  217 + z = z[z>=xmin] 218 + n = len(z) 219 + alpha = 1 + n / sum( log(z/xmin) ) 220 + if finite: 221 + alpha = alpha*(n-1.)/n+1./n 222 + if n < 50 and not finite and not silent: 223 + print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n 224 + ks = max(abs( numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) )) 225 + # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) = (alpha-1)/xmin. Really is Eqn B3 in paper. 226 + L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin)) 227 + #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum 228 + self._likelihood = L 229 + self._xmin = xmin 230 + self._xmins = xmins 231 + self._alpha= alpha 232 + self._alphaerr = (alpha-1)/numpy.sqrt(n) 233 + self._ks = ks # this ks statistic may not have the same value as min(dat) because of unique() 234 + if scipyOK: self._ks_prob = scipy.stats.kstwobign.sf(ks*numpy.sqrt(n)) 235 + self._ngtx = n 236 + if n == 1: 237 + if not silent: 238 + print "Failure: only 1 point kept. Probably not a power-law distribution." 239 + self._alpha = alpha = 0 240 + self._alphaerr = 0 241 + self._likelihood = L = 0 242 + self._ks = 0 243 + self._ks_prob = 0 244 + self._xmin = xmin 245 + return xmin,0 246 + if numpy.isnan(L) or numpy.isnan(xmin) or numpy.isnan(alpha): 247 + raise ValueError("plfit failed; returned a nan") 248 + 249 + if not quiet: 250 + if verbose: print "The lowest value included in the power-law fit, ", 251 + print "xmin: %g" % xmin, 252 + if verbose: print "\nThe number of values above xmin, ", 253 + print "n(>xmin): %i" % n, 254 + if verbose: print "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ", 255 + print "alpha: %g +/- %g " % (alpha,self._alphaerr),  256 + if verbose: print "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ", 257 + print "Log-Likelihood: %g " % L, 258 + if verbose: print "\nThe KS-test statistic between the best-fit power-law and the data, ", 259 + print "ks: %g" % (ks), 260 + if scipyOK: 261 + if verbose: print " occurs with probability ", 262 + print "p(ks): %g" % (self._ks_prob) 263 + else: 264 + print 265 + 266 + return xmin,alpha 267 + 268 + 269 + def discrete_best_alpha(self, alpharangemults=(0.9,1.1), n_alpha=201, approximate=True, verbose=True, finite=True): 270 + """ 271 + Use the maximum L to determine the most likely value of alpha 272 + 273 + *alpharangemults* [ 2-tuple ] 274 + Pair of values indicating multiplicative factors above and below the 275 + approximate alpha from the MLE alpha to use when determining the 276 + "exact" alpha (by directly maximizing the likelihood function) 277 + """ 278 + 279 + data = self.data 280 + self._xmins = xmins = numpy.unique(data) 281 + if approximate: 282 + alpha_of_xmin = [ discrete_alpha_mle(data,xmin) for xmin in xmins ] 283 + else: 284 + alpha_approx = [ discrete_alpha_mle(data,xmin) for xmin in xmins ] 285 + alpharanges = [(0.9*a,1.1*a) for a in alpha_approx] 286 + alpha_of_xmin = [ most_likely_alpha(data,xmin,alpharange=ar,n_alpha=n_alpha) for xmin,ar in zip(xmins,alpharanges) ] 287 + ksvalues = numpy.array([ discrete_ksD(data, xmin, alpha) for xmin,alpha in zip(xmins,alpha_of_xmin) ]) 288 + self._av = numpy.array(alpha_of_xmin) 289 + self._xmin_kstest = ksvalues 290 +  291 + ksvalues[numpy.isnan(ksvalues)] = numpy.inf 292 + 293 + best_index = argmin(ksvalues) 294 + self._alpha = best_alpha = alpha_of_xmin[best_index] 295 + self._xmin = best_xmin = xmins[best_index] 296 + self._ks = best_ks = ksvalues[best_index] 297 + self._likelihood = best_likelihood = discrete_likelihood(data, best_xmin, best_alpha) 298 + 299 + if finite: 300 + self._alpha = self._alpha*(n-1.)/n+1./n 301 + 302 + if verbose: 303 + print "alpha = %f xmin = %f ksD = %f L = %f (n=x) = %i" % ( 304 + best_alpha, best_xmin, best_ks, best_likelihood, 305 + (data=best_xmin).sum()) 306 + 307 + 308 + self._ngtx = n = (self.data>=self._xmin).sum() 309 + self._alphaerr = (self._alpha-1.0)/numpy.sqrt(n) 310 + if scipyOK: self._ks_prob = scipy.stats.kstwobign.sf(self._ks*numpy.sqrt(n)) 311 + 312 + return best_alpha,best_xmin,best_ks,best_likelihood 313 + 314 + def xminvsks(self, **kwargs): 315 + """ 316 + Plot xmin versus the ks value for derived alpha. This plot can be used 317 + as a diagnostic of whether you have derived the 'best' fit: if there are  318 + multiple local minima, your data set may be well suited to a broken  319 + powerlaw or a different function. 320 + """ 321 +  322 + pylab.plot(self._xmins,self._xmin_kstest,'.') 323 + pylab.plot(self._xmin,self._ks,'s') 324 + #pylab.errorbar([self._ks],self._alpha,yerr=self._alphaerr,fmt='+') 325 + 326 + ax=pylab.gca()