diff --git a/localgraphclustering/__init__.py b/localgraphclustering/__init__.py
index 6dc8598..ebbc034 100644
--- a/localgraphclustering/__init__.py
+++ b/localgraphclustering/__init__.py
@@ -3,7 +3,7 @@
 from .approximate_PageRank import approximate_PageRank
 from .approximate_PageRank_weighted import approximate_PageRank_weighted
 from .sweep_cut import sweep_cut
-from .ncp import NCPData
+from .ncp import NCPData, partialfunc
 from .ncpplots import NCPPlots
 from .densest_subgraph import densest_subgraph
 from .multiclass_label_prediction import multiclass_label_prediction
diff --git a/localgraphclustering/ncp.py b/localgraphclustering/ncp.py
index 0f880d4..b2b5207 100755
--- a/localgraphclustering/ncp.py
+++ b/localgraphclustering/ncp.py
@@ -17,6 +17,16 @@
 from .triangleclusters import triangleclusters
 from .cpp import *
 
+class partialfunc(functools.partial):
+    @classmethod
+    def from_partial(cls, f):
+        return cls(f.func, *f.args, **f.keywords)
+    def __eq__(self, f2):
+        if not (isinstance(f2, partialfunc)):
+            return False
+        return all([getattr(self, attr) == getattr(f2, attr) for attr in ['func', 'args', 'keywords']])
+    __hash__ = functools.partial.__hash__
+
 class SimpleLogForLongComputations:
     """ Implement a simple logger that will record messages and then
     replay them if a timer exceeds a threshold."""
@@ -51,6 +61,7 @@ def log(self, message):
         self._log.append((t, message))
 
 def _partial_functions_equal(func1, func2):
+    assert(False) # shouldn't be called now
     if not (isinstance(func1, functools.partial) and isinstance(func2, functools.partial)):
         return False
     are_equal = all([getattr(func1, attr) == getattr(func2, attr) for attr in ['func', 'args', 'keywords']])
@@ -388,7 +399,15 @@
     def as_data_frame(self):
         """ Return the NCP results as a pandas dataframe """
         df = pd.DataFrame.from_records(self.results, columns=self.result_fields)
         # convert to human readable names
-        df["method"] = df["methodfunc"].map(self.method_names)
+        # It's important that this dictionary is converted into a lookup
+        # function so the pandas map function works correctly with our
+        # partial functions that may hash differently but compare as equal.
+        # Ideally, we'd call...
+        # df["method"] = df["methodfunc"].map(self.method_names)
+        df["method"] = df["methodfunc"].map(lambda x: self.method_names[x])
+        # TODO: since this is a bit hacky, it's probably worth storing
+        # the method name in the results itself. That's probably better at
+        # this point.
         return df
 
@@ -501,7 +520,7 @@ def approxPageRank(self,
         if localmins:
             for rho in rholist:
                 self.add_localmin_samples(
-                    method=functools.partial(
+                    method=partialfunc(
                         spectral_clustering,**spectral_args,alpha=alpha,rho=rho*10,method=method),
                     methodname="%s_localmin:rho=%.0e"%(methodname, rho*10),
                     neighborhoods=True,
@@ -516,7 +535,7 @@
             if myratio is not None:
                 kwargs['ratio'] = myratio
             self.add_random_node_samples(
-                method=functools.partial(
+                method=partialfunc(
                     spectral_clustering,**spectral_args,alpha=alpha,rho=rho,method=method),
                 methodname="%s:rho=%.0e"%(methodname, rho),
                 timeout=timeout/(nruns*len(rholist)), **kwargs)
@@ -528,7 +547,7 @@
             if myratio is not None:
                 kwargs['ratio'] = myratio
             self.add_random_neighborhood_samples(
-                method=functools.partial(
+                method=partialfunc(
                     spectral_clustering,**spectral_args,alpha=alpha,rho=rho*10,method=method),
                 methodname="%s_neighborhoods:rho=%.0e"%(methodname, rho*10),
                 timeout=timeout/(len(rholist)), **kwargs)
@@ -570,7 +589,7 @@ def l1reg(self,
              nthreads: int = 4,
              timeout: float = 1000):
         alpha = 1.0-1.0/(1.0+gamma)
-        funcs = {functools.partial(spectral_clustering, alpha=alpha,rho=rho,method="l1reg"):'l1reg;rho=%.0e'%(rho)
+        funcs = {partialfunc(spectral_clustering, alpha=alpha,rho=rho,method="l1reg"):'l1reg;rho=%.0e'%(rho)
                  for rho in rholist}
         for func in funcs.keys():
             self.add_random_node_samples(method=func,methodname=funcs[func],ratio=ratio,nthreads=nthreads,timeout=timeout/len(funcs))
@@ -583,7 +602,7 @@ def crd(self,
            ratio: float = 0.3,
            nthreads: int = 4,
            timeout: float = 1000):
-        func = functools.partial(flow_clustering,w=w, U=U, h=h,method="crd")
+        func = partialfunc(flow_clustering,w=w, U=U, h=h,method="crd")
         self.add_random_neighborhood_samples(method=func,methodname="crd",
                                              ratio=ratio,nthreads=nthreads,timeout=timeout/2)
         self.add_random_node_samples(method=func,methodname="crd",
@@ -594,7 +613,7 @@ def mqi(self,
            ratio: float = 0.3,
            nthreads: int = 4,
            timeout: float = 1000):
-        func = functools.partial(flow_clustering,method="mqi")
+        func = partialfunc(flow_clustering,method="mqi")
         self.add_random_neighborhood_samples(ratio=ratio,nthreads=nthreads,timeout=timeout,
                                              method=func,methodname="mqi")
         return self
@@ -609,12 +628,12 @@ def add_fiedler(self):
         # note that we use functools partial here to create a new function
         # that we name "fiedler" even though the code is just evaluate_set
         return self.add_set_samples(methodname="fiedler",
-                                    method=functools.partial(_evaluate_set), nthreads=1, sets=[S])
+                                    method=partialfunc(_evaluate_set), nthreads=1, sets=[S])
 
     def add_fiedler_mqi(self):
         S = self._fiedler_set()
         return self.add_set_samples(methodname="fiedler-mqi",
-                                    method=functools.partial(flow_clustering,method="mqi"), nthreads=1, sets=[S])
+                                    method=partialfunc(flow_clustering,method="mqi"), nthreads=1, sets=[S])
 
     def add_neighborhoods(self, **kwargs):
         return self.add_random_neighborhood_samples(
diff --git a/localgraphclustering/spectral_clustering.py b/localgraphclustering/spectral_clustering.py
index f143fca..ad427fb 100644
--- a/localgraphclustering/spectral_clustering.py
+++ b/localgraphclustering/spectral_clustering.py
@@ -16,6 +16,7 @@ def spectral_clustering(G, ref_nodes,
                         ys: Sequence[float] = None,
                         vol: float = 100,
                         phi: float = 0.5,
+                        refine = None,
                         method: str = "acl"):
     """
     Provide a simple interface to do spectral based clustering.
@@ -33,6 +34,8 @@
         Which method to use for the nodes embedding.
         Options: "acl", "l1reg", "nibble", "fiedler", "fiedler_local"
 
+    refine: (optional)
+        A function called as refine(G,S) on the output cluster S to refine the "acl" or "l1reg" result, e.g. MQI via flow_clustering.
 
     -------------------------------------------------
@@ -107,4 +110,7 @@
 
     output = sweep_cut(G,p)
 
+    if refine is not None:
+        output = refine(G,list(output[0]))
+
     return output
diff --git a/localgraphclustering/tests/test_ncp.py b/localgraphclustering/tests/test_ncp.py
index c83d3de..472ee1f 100644
--- a/localgraphclustering/tests/test_ncp.py
+++ b/localgraphclustering/tests/test_ncp.py
@@ -1,6 +1,6 @@
 import localgraphclustering as lgc
 import pytest
-from functools import partial
+#from functools import partial
 
 def load_example_graph():
     return lgc.GraphLocal("localgraphclustering/tests/data/dolphins.edges",separator=" ")
@@ -14,7 +14,7 @@ def test_ncp():
     df = ncp.as_data_frame()
     assert len(df) == G._num_vertices
     #func = lambda G,R: lgc.flow_clustering(G,R,method="mqi")[0]
-    func = partial(lgc.flow_clustering, method="mqi")
+    func = lgc.partialfunc(lgc.flow_clustering, method="mqi")
     ncp = lgc.NCPData(G)
     ncp.add_set_samples([[1]],nthreads=1,method=func,methodname="mqi")
     ncp.add_random_neighborhood_samples(ratio=2,nthreads=1,method=func,methodname="mqi")
@@ -83,7 +83,7 @@ def test_ncp_l1reg():
 
 def test_ncp_localmin():
     G = load_example_graph()
     ncp = lgc.NCPData(G)
-    func = partial(lgc.spectral_clustering,alpha=0.01,rho=1.0e-4,method="acl")
+    func = lgc.partialfunc(lgc.spectral_clustering,alpha=0.01,rho=1.0e-4,method="acl")
     ncp.default_method = func
     ncp.add_localmin_samples(ratio=1)
@@ -93,7 +93,7 @@
     G = lgc.GraphLocal()
     G.list_to_gl([0,1],[1,0],[1,1])
     ncp = lgc.NCPData(G)
-    func = partial(lgc.spectral_clustering,alpha=0.01,rho=1.0e-4,method="acl")
+    func = lgc.partialfunc(lgc.spectral_clustering,alpha=0.01,rho=1.0e-4,method="acl")
     ncp.default_method = func
     ncp.add_localmin_samples(ratio=1)
@@ -109,10 +109,16 @@ def test_ncp_sets():
 def test_apr_deep():
     G = load_example_graph()
     df = lgc.NCPData(G).approxPageRank(ratio=1, gamma=0.1, rholist=[1e-2, 1e-3], deep=True)
-    
+
 def test_apr_only_node_samples():
     G = load_example_graph()
-    df = lgc.NCPData(G).approxPageRank(ratio=1, gamma=0.1, rholist=[1e-2, 1e-3], random_neighborhoods=False, localmins=False)
+    df = lgc.NCPData(G).approxPageRank(ratio=1, gamma=0.1, rholist=[1e-2, 1e-3], random_neighborhoods=False, localmins=False)
+
+def test_apr_refine():
+    G = load_example_graph()
+    df = lgc.NCPData(G).approxPageRank(ratio=1, gamma=0.1, rholist=[1e-2, 1e-3],
+                                       random_neighborhoods=False, localmins=False,
+                                       spectral_args={'refine': lgc.partialfunc(lgc.flow_clustering, method="mqi")})
 
 @pytest.mark.long_tests
 def test_ncp_crd_big():