Permalink
Browse files

initial code: hcluster() and friends

  • Loading branch information...
0 parents commit 37e6db93e915d3d7167a5865453a06b1ae07eeb4 @harthur committed Mar 3, 2011
@@ -0,0 +1,76 @@
+/*
+ Turns CommonJS package into a browser file.
+ Minifying requires UglifyJS (http://github.com/mishoo/UglifyJS)
+ to be in the dir above this one.
+
+ uses node-jake http://github.com/mde/node-jake
+ run with 'jake [build|minify|clean]'
+*/
+var fs = require("fs"),
+ path = require("path"),
+ sys = require("sys");
+
// 'build' task: wraps the package's main module (with its relative
// require()s inlined by getCode) into one script that assigns the
// library to a variable.
//   name - variable name to assign to (defaults to the package name)
//   dest - output file path (defaults to "<name>.js")
task('build', [], function (name, dest) {
  sys.puts("building...");

  var manifest = getPackage();
  if (!name) {
    name = manifest.name;
  }
  if (!dest) {
    dest = name + ".js";
  }

  var entryFile = manifest.main + ".js";
  var bundle = "var " + name + " = " + getCode(entryFile, " ");

  fs.writeFileSync(dest, bundle, "utf-8");
  sys.puts("> " + dest);
});
+
// 'minify' task: reads a built script, runs it through minify() and
// writes the result.
//   file - script to minify (defaults to "<package name>.js")
//   dest - output file path (defaults to "<package name>.min.js")
task('minify', [], function (file, dest) {
  var pkgName = getPackage().name;
  var source = file ? file : pkgName + ".js";
  var target = dest ? dest : pkgName + ".min.js";

  var original = fs.readFileSync(source, "utf-8");
  var compact = minify(original);

  fs.writeFileSync(target, compact, "utf-8");
  sys.puts("> " + target);
});
+
// 'clean' task: deletes the files produced by the 'build' and 'minify'
// tasks under their default names ("<package name>.js" and
// "<package name>.min.js").
task('clean', [], function () {
  var name = getPackage().name;
  var targets = [name + ".js", name + ".min.js"];

  targets.forEach(function (target) {
    // Remove synchronously: fs.unlink() without a callback throws on
    // current Node versions, and the task should be done when it returns.
    try {
      fs.unlinkSync(target);
    } catch (err) {
      // Nothing to clean is fine; anything else is a real failure.
      if (err.code !== "ENOENT") {
        throw err;
      }
    }
  });
});
+
+
// Reads and parses package.json from the current working directory.
// Returns the parsed manifest object; throws if the file is missing or
// is not valid JSON.
function getPackage() {
  // Read as text (like every other read in this file) instead of relying
  // on JSON.parse coercing a Buffer to a string.
  var manifest = fs.readFileSync("package.json", "utf-8");
  return JSON.parse(manifest);
}
+
// Returns the source of `file` wrapped in an immediately-invoked function
// that evaluates to its module.exports, with every require("...") call
// replaced inline:
//   - relative modules (name starts with ".") are replaced by their own
//     wrapped source, recursively;
//   - anything else is replaced by `window`, on the assumption that the
//     external dependency will be available as a global.
//
//   file      - path of the module to read
//   indent    - prefix used when logging the file name (grows with depth)
//   ancestors - optional, internal: normalized paths of the modules
//               currently being expanded, used to detect cycles
//
// Throws an Error when a module (directly or indirectly) requires itself,
// instead of recursing until the stack overflows.
function getCode(file, indent, ancestors) {
  sys.puts(indent + file);

  var chain = (ancestors || []).concat(path.normalize(file));
  var code = fs.readFileSync(file, "utf-8");

  // replace all the require("mod")s with their code
  var re = /require\(["'](.+?)["']\)/g;
  function expand(match, mod) {
    if (mod.indexOf(".") !== 0) {
      return "window"; // external dep, assume it will be global
    }
    var dep = path.join(path.dirname(file), mod + ".js");
    if (chain.indexOf(path.normalize(dep)) !== -1) {
      throw new Error("circular require: " + chain.concat(dep).join(" -> "));
    }
    return getCode(dep, indent + " ", chain);
  }
  code = code.replace(re, expand);

  return "(function() {\n" +
    " var module = { exports: {}};\n" +
    " var exports = module.exports;\n" +
    code +
    "\nreturn module.exports;" +
    " })()";
}
+
+function minify(code) {
+ var jsp = require("../UglifyJS/lib/parse-js"),
+ pro = require("../UglifyJS/lib/process");
+
+ var ast = jsp.parse(code);
+ ast = pro.ast_mangle(ast);
+ ast = pro.ast_squeeze(ast);
+ return pro.gen_code(ast);
+}
21 LICENSE
@@ -0,0 +1,21 @@
+Copyright (c) 2011 Heather Arthur <fayearthur@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
@@ -0,0 +1,25 @@
+# clusterfck
+A js [hierarchical clustering](http://en.wikipedia.org/wiki/Hierarchical_clustering) lib.
+
+# install
+    git clone http://github.com/harthur/clusterfck.git
+    cd clusterfck
+    npm install .
+
+# usage
+ var clusterfck = require("clusterfck");
+
+ var colors = [[20, 120, 102],
+ [0, 230, 93],
+ [250, 255, 253],
+ [100, 54, 300]]; // array of vectors
+
+ var threshold = 9; // only combine two clusters if they have distance less than 9
+
+ var clusters = clusterfck.hcluster(colors, clusterfck.EUCLIDEAN_DISTANCE,
+ clusterfck.AVERAGE_LINKAGE, threshold);
+
+
+`clusters` will be an array of clusters. Each cluster that was formed by a merge has `left` and `right` properties holding the two clusters that were merged. If you specified a merge function (instead of e.g. `clusterfck.AVERAGE_LINKAGE`), then its `canonical` property will also be meaningful: it contains the value your merge function returned for the merged clusters.
+
+
@@ -0,0 +1 @@
+module.exports = require("./hcluster");
@@ -0,0 +1,164 @@
+
// Configuration for one hierarchical clustering run.
//   distance  - function(a, b) returning the distance between two values;
//               falls back to clusterfck.EUCLIDEAN_DISTANCE when falsy
//   merge     - linkage constant or function(a, b) combining two canonical
//               values; falls back to clusterfck.AVERAGE_LINKAGE when falsy
//   threshold - clusters are only merged while their distance is below
//               this value; null/undefined means Infinity
var HierarchicalClustering = function(distance, merge, threshold) {
  this.distance = distance ? distance : clusterfck.EUCLIDEAN_DISTANCE;
  this.merge = merge ? merge : clusterfck.AVERAGE_LINKAGE;

  if (threshold == undefined) {
    this.threshold = Infinity;
  } else {
    this.threshold = threshold;
  }
};
+
// Agglomerative clustering. Every cluster object carries:
//   canonical - the value used to represent the cluster
//   key       - position of its first item in the input; stays fixed and is
//               the row/column of the cluster in `dists` and `mins`
//   index     - current position of the cluster in the `clusters` array
//   size      - number of input items it contains
//   left/right (merged clusters only) - the two clusters that were merged
HierarchicalClustering.prototype = {
  // Repeatedly merges the two closest clusters until no pair is closer
  // than this.threshold.
  //   items            - array of values to cluster
  //   snapshot         - call snapshotCallback every `snapshot` merges
  //   snapshotCallback - optional function(clusters), given the live array
  // Returns the array of remaining top-level clusters.
  cluster : function(items, snapshot, snapshotCallback) {
    var clusters = [];
    var dists = [];  // distances between each pair of clusters, by key
    var mins = [];   // key of the closest known cluster for each cluster
    var index = [];  // current cluster for each key
    var i, j;

    for (i = 0; i < items.length; i++) {
      var cluster = { canonical: items[i], key: i, index: i, size: 1 };
      clusters[i] = cluster;
      index[i] = cluster;
      dists[i] = [];
      mins[i] = 0;
    }

    // initialize distance matrix and cached mins; a cluster is at
    // distance Infinity from itself so it is never its own closest
    for (i = 0; i < clusters.length; i++) {
      for (j = 0; j <= i; j++) {
        var dist = (i === j) ? Infinity :
            this.distance(clusters[i].canonical, clusters[j].canonical);
        dists[i][j] = dist;
        dists[j][i] = dist;

        if (dist < dists[i][mins[i]]) {
          mins[i] = j;
        }
      }
    }

    var step = 0;
    var toMerge = this.closestClusters(clusters, dists, mins);
    while (toMerge) {
      if (snapshotCallback && (step % snapshot) === 0) {
        snapshotCallback(clusters);
      }

      var c1 = index[toMerge[0]],
          c2 = index[toMerge[1]];

      this.mergeClusters(clusters, dists, mins, index, c1, c2);
      toMerge = this.closestClusters(clusters, dists, mins);
      step++;
    }
    return clusters;
  },

  // Replaces c1 and c2 by one merged cluster (which keeps c1's key and
  // position) and brings `clusters`, `dists`, `mins` and `index` up to date.
  mergeClusters : function(clusters, dists, mins, index, c1, c2) {
    var merged = { canonical: this.merge(c1.canonical, c2.canonical),
                   left: c1,
                   right: c2,
                   key: c1.key,
                   size: c1.size + c2.size };

    clusters[c1.index] = merged;
    clusters.splice(c2.index, 1);
    index[c1.key] = merged;

    // update distances with new merged cluster
    for (var i = 0; i < clusters.length; i++) {
      var ci = clusters[i];
      var dist;
      if (c1.key === ci.key) {
        dist = Infinity;
      }
      else if (this.merge === clusterfck.SINGLE_LINKAGE) {
        // closest of the two old distances
        dist = Math.min(dists[c1.key][ci.key], dists[c2.key][ci.key]);
      }
      else if (this.merge === clusterfck.COMPLETE_LINKAGE) {
        // farthest of the two old distances
        dist = Math.max(dists[c1.key][ci.key], dists[c2.key][ci.key]);
      }
      else if (this.merge === clusterfck.AVERAGE_LINKAGE) {
        // size-weighted average of the two old distances
        dist = (dists[c1.key][ci.key] * c1.size
             + dists[c2.key][ci.key] * c2.size) / (c1.size + c2.size);
      }
      else {
        // custom merge function: measure against the merged value it
        // produced, not against c1's old value
        dist = this.distance(ci.canonical, merged.canonical);
      }

      dists[c1.key][ci.key] = dists[ci.key][c1.key] = dist;
    }

    // update cached mins: only clusters whose closest neighbour was one
    // of the two merged clusters need a rescan
    for (var k = 0; k < clusters.length; k++) {
      var key1 = clusters[k].key;
      if (mins[key1] === c1.key || mins[key1] === c2.key) {
        var min = key1;
        for (var j = 0; j < clusters.length; j++) {
          var key2 = clusters[j].key;
          if (dists[key1][key2] < dists[key1][min]) {
            min = key2;
          }
        }
        mins[key1] = min;
      }
      // positions shifted after the splice above
      clusters[k].index = k;
    }
  },

  // Returns [key, keyOfClosest] for the closest pair of clusters, or
  // undefined when no pair is closer than this.threshold.
  closestClusters : function(clusters, dists, mins) {
    var minKey = 0, min = Infinity;
    for (var i = 0; i < clusters.length; i++) {
      var key = clusters[i].key;
      if (dists[key][mins[key]] < min) {
        minKey = key;
        min = dists[key][mins[key]];
      }
    }
    if (min < this.threshold) {
      return [minKey, mins[minKey]];
    }
  }
};
+
// Linkage strategies. mergeClusters recognises them by identity to pick the
// distance-update rule, so they must stay three distinct function objects.
// When called as a merge function each one simply keeps the first
// cluster's canonical value.
var SINGLE_LINKAGE = function(first, second) {
  return first;
};
var COMPLETE_LINKAGE = function(first, second) {
  return first;
};
var AVERAGE_LINKAGE = function(first, second) {
  return first;
};
+
// Euclidean distance: square root of the sum of squared per-component
// differences. Iterates over the components of v1.
var EUCLIDEAN_DISTANCE = function(v1, v2) {
  var sumOfSquares = 0;
  var d = 0;
  while (d < v1.length) {
    var delta = v2[d] - v1[d];
    sumOfSquares += Math.pow(delta, 2);
    d++;
  }
  return Math.sqrt(sumOfSquares);
};
+
// Manhattan distance: sum of the absolute per-component differences.
// Iterates over the components of v1.
var MANHATTAN_DISTANCE = function(v1, v2) {
  var sum = 0;
  var d = 0;
  while (d < v1.length) {
    var delta = v2[d] - v1[d];
    sum += Math.abs(delta);
    d++;
  }
  return sum;
};
+
// Maximum (Chebyshev) distance: the largest absolute per-component
// difference, or 0 for empty vectors. Iterates over the components of v1.
var MAX_DISTANCE = function(v1, v2) {
  var largest = 0;
  var d = 0;
  while (d < v1.length) {
    var delta = Math.abs(v2[d] - v1[d]);
    largest = Math.max(largest, delta);
    d++;
  }
  return largest;
};
+
// Convenience entry point: builds a HierarchicalClustering from
// (distance, merge, threshold) and clusters `items` with it, passing the
// snapshot arguments through. Returns whatever cluster() returns.
var hcluster = function(items, distance, merge, threshold, snapshot, snapshotCallback) {
  var clusterer = new HierarchicalClustering(distance, merge, threshold);
  return clusterer.cluster(items, snapshot, snapshotCallback);
};
+
+clusterfck = {
+ hcluster: hcluster,
+ SINGLE_LINKAGE: SINGLE_LINKAGE,
+ COMPLETE_LINKAGE: COMPLETE_LINKAGE,
+ AVERAGE_LINKAGE: AVERAGE_LINKAGE,
+ EUCLIDEAN_DISTANCE: EUCLIDEAN_DISTANCE,
+ MANHATTAN_DISTANCE: MANHATTAN_DISTANCE,
+ MAX_DISTANCE: MAX_DISTANCE
+};
+
+module.exports = clusterfck;
@@ -0,0 +1,23 @@
+{
+ "name": "clusterfck",
+ "description": "hierarchical clustering",
+ "version": "0.1",
+ "author": "Heather Arthur <fayearthur@gmail.com>",
+ "repository": {
+ "type": "git",
+ "url": "http://github.com/harthur/clusterfck.git"
+ },
+ "directories": {
+ "lib": "./lib"
+ },
+ "dependencies" : {
+ },
+ "devDependencies": {
+ "uglify" : "*",
+ "underscore": "*",
+ "nomnom" : "*",
+ "cradle" : "*",
+ "connect": "*"
+ },
+ "main": "./lib/clusterfck"
+}
@@ -0,0 +1,30 @@
+{
+ "hcluster" : [
+ { "db": "http://localhost:5984/colors",
+ "options" : {
+ "distance": "clusterfck.EUCLIDEAN_DISTANCE",
+ "merge" : "clusterfck.AVERAGE_LINKAGE",
+ "threshold" : 7,
+ "length": 1000
+ }
+ },
+
+ { "db": "http://localhost:5984/photo",
+ "options" : {
+ "distance": "clusterfck.EUCLIDEAN_DISTANCE",
+ "merge" : "clusterfck.AVERAGE_LINKAGE",
+ "threshold" : 7,
+ "length" : 1000
+ }
+ },
+
+ { "db": "http://localhost:5984/photo",
+ "options" : {
+ "distance": "clusterfck.EUCLIDEAN_DISTANCE",
+ "merge" : "clusterfck.AVERAGE_LINKAGE",
+ "threshold" : 7,
+ "length" : 2000
+ }
+ }
+ ]
+}
Oops, something went wrong.

0 comments on commit 37e6db9

Please sign in to comment.