Permalink
Browse files

refactor training and collection scripts

  • Loading branch information...
1 parent e1723d2 commit 56a4ce04e01d87784227b8abe0fd8dcf1929e5e5 @harthur committed Oct 15, 2012
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -0,0 +1,60 @@
+# Training
+
+The goal of training is to create a classifier (in this case a neural network) that can be used to classify cat head images.
+
+After a final round of training you should have the JSON state of a neural network in the file "network.json", which can be imported and used by kittydar.
+
+## collection
+
+First you need to collect positive and negative images to train the network with. See the `collection` directory for more information.
+
+## train the classifier
+
+You can train a network with:
+
+```
+node train-network.js POSITIVES NEGATIVES
+```
+
+where POSITIVES is the directory of positive images (cat head crops), and NEGATIVES is a directory of negative images (non-cat images).
+
+This will write the network to "network.json".
+
+## test the classifier
+
+After training the network you can test the network on a set of test positive and negative images (different from the ones that trained it):
+
+```
+node test-network.js POSITIVES_TEST NEGATIVES_TEST --network ./network.json
+```
+
+This will report the neural network error, as well as binary classification statistics like precision and recall.
+
+## optional: finding optimal parameters
+
+Find the best parameters for the feature extraction and classifier with cross-validation. Edit the `combos` object to add a combination and run with:
+
+```
+node cross-validate.js POSITIVES NEGATIVES
+```
+
+This will cross-validate on each combination of parameters and report statistics on each combination, including the precision, recall, accuracy, and error of the test set.
+
+## optional: mining hard negatives
+
+After you've trained a classifier, you can test the classifier on a different set of negative images and save any false positives as "hard negatives". You can take the hard negatives and the positives and train a new (more precise) classifier.
+
+```
+node mine-negatives.js NEGATIVES_EXTRA HARD --samples 1 --network ./network.json
+```
+
+where `HARD` is a new directory to hold the mined negatives. The `threshold` param determines when a negative is classified as hard. It's a number from 0.5 to 1.0 (from "leaning positive" to very false positive).
+
+`samples` is the number of times to sample each negative image. It can take a lot of images to find a few hard negatives if your classifier is good enough, so specifying a higher value will mine more hard negatives in the end.
+
+You can then train a new classifier with:
+
+```
+node train-network.js POSITIVES HARD --sample=false
+```
+
View
@@ -0,0 +1,93 @@
+var fs = require("fs"),
+ path = require("path"),
+ Canvas = require("canvas"),
+ utils = require("../utils");
+
+exports.collectImages = collectImages;
+exports.getDir = getDir;
+
+/*
+ * Collect the canvas representations of the images in the positive and
+ * negative directories and return
+ * an array of objects that look like:
+ * {
+ * canvas: <Canvas object>,
+ * file: 'test.jpg',
+ * iscat: true
+ * }
+ */
+function collectImages(posDir, negDir, samples, limit) {
+ // number of samples to extract from each negative, 0 for whole image
+ samples = samples || 0;
+
+ // max number of images to collect per directory
+ limit = limit || 1000;
+
+ var pos = getDir(posDir, true, 0, limit);
+ var neg = getDir(negDir, false, samples, limit);
+ return pos.concat(neg);
+}
+
+function getDir(dir, isCat, samples, limit) {
+ var files = fs.readdirSync(dir);
+
+ var images = files.filter(function(file) {
+ return (path.extname(file) == ".png"
+ || path.extname(file) == ".jpg");
+ });
+
+ images = images.slice(0, limit);
+
+ var data = [];
+ for (var i = 0; i < images.length; i++) {
+ var file = dir + "/" + images[i];
+ try {
+ var canvas = utils.drawImgToCanvasSync(file);
+ }
+ catch(e) {
+ console.log(e, file);
+ continue;
+ }
+
+ var canvases = extractSamples(canvas, samples);
+
+ for (var j = 0; j < canvases.length; j++) {
+ data.push({
+ canvas: canvases[j],
+ file: file,
+ isCat: isCat ? 1 : 0
+ });
+ }
+ }
+
+ return data;
+}
+
+
+function extractSamples(canvas, num) {
+ if (num == 0) {
+ // 0 means "don't sample"
+ return [canvas];
+ }
+
+ var min = 48;
+ var max = Math.min(canvas.width, canvas.height);
+
+ var canvases = [];
+ for (var i = 0; i < num; i++) {
+ var length = Math.max(min, Math.ceil(Math.random() * max));
+
+ var x = Math.floor(Math.random() * (max - length));
+ var y = Math.floor(Math.random() * (max - length));
+
+ canvases.push(cropCanvas(canvas, x, y, length, length));
+ }
+ return canvases;
+}
+
+function cropCanvas(canvas, x, y, width, height) {
+ var cropCanvas = new Canvas(width, height);
+ var context = cropCanvas.getContext("2d");
+ context.drawImage(canvas, x, y, width, height, 0, 0, width, height);
+ return cropCanvas;
+}
@@ -0,0 +1,22 @@
+## collection
+
+The goal of collection is to get a folder of positive (cat head) images and a folder of negative (non-cat) images to train the classifier with.
+
+### creating the positives
+
+To get the positives, first download this [dataset of cat pictures](http://137.189.35.203/WebUI/CatDatabase/catData.html). There should be folders called CAT_00, CAT_01, etc. Take the images from all of these and combine into one directory. Also remove the file "00000003_019.jpg.cat" and add [00000003_015.jpg.cat](http://137.189.35.203/WebUI/CatDatabase/Data/00000003_015.jpg.cat).
+
+Run the script to rotate and then crop out the cat head from each image. If you put the cat dataset in a folder called "CATS" and you want to put the cropped images in a folder called "POSITIVES":
+
+`node make-positives.js CATS POSITIVES`
+
+### creating the negatives
+
+If you don't already have a bunch of non-cat pictures you can fetch recent images from Flickr and save them in a folder called "NEGATIVES" by running:
+
+`ruby fetch-negatives.rb NEGATIVES`
+
+You'll need at least 10,000 images.
+
+If you're getting images from Flickr, some will contain cats for sure, so you'll need to weed those out by taking a close look at your hard negatives (see `training` directory above).
+
@@ -5,34 +5,31 @@
FlickRaw.api_key="0cc11cffc8a238efef4dfa6dca255a44"
FlickRaw.shared_secret="5f76a97053f99673"
-$count = 0
-
$fetched = Hash.new
+$dir = ARGV[0]
+
def getPage(page)
list = flickr.photos.getRecent :per_page => 500, :page => page
list.each do |photo|
- url = "http://farm#{photo.farm}.staticflickr.com/#{photo.server}/#{photo.id}_#{photo.secret}_c.jpg"
+ url = "http://farm#{photo.farm}.staticflickr.com/#{photo.server}/#{photo.id}_#{photo.secret}.jpg"
if $fetched[url] != 1
$fetched[url] = 1
name = rand(100000000000)
- file = "NEGS_FLICKR/#{name}.jpg"
-
- puts file
+ file = "#{$dir}/#{name}.jpg"
open(file, 'wb') do |file|
file << open(url).read
end
- puts "saved to #{file}"
- $count+=1
end
end
end
+# gets 120 x 500 = 60,000 images
120.times do |i|
getPage(i)
end
@@ -1,61 +1,65 @@
-var http = require("http"),
- url = require("url"),
- fs = require("fs"),
+var fs = require("fs"),
path = require("path"),
+ nomnom = require("nomnom"),
Canvas = require("canvas"),
- _ = require("underscore"),
utils = require("../../utils");
-var dir = __dirname + "/NEGS_FLICKR/";
-var outdir = __dirname + "/NEGS_SAMPLED3/";
+var opts = nomnom.options({
+ indir: {
+ position: 0,
+ default: __dirname + "/NEGS_FLICKR/",
+ help: "Directory of full-sizes negative images"
+ },
+ outdir: {
+ position: 1,
+ default: __dirname + "/NEGATIVES/",
+ help: "Directory to save cropped image sections"
+ }
+}).colors().parse();
-var part = parseInt(process.argv[2]);
-var perFile = 1;
-
-fs.readdir(dir, function(err, files) {
+fs.readdir(opts.indir, function(err, files) {
if (err) throw err;
var images = files.filter(function(file) {
return path.extname(file) == ".jpg";
});
- images = images.slice(9500 * part, 9500 * (part + 1));
- console.log(images.length);
+ console.log(images.length, "images to process");
images.forEach(function(image) {
+ var file = opts.indir + "/" + image;
try {
- var canvas = utils.drawImgToCanvasSync(dir + image);
+ var canvas = utils.drawImgToCanvasSync(file);
}
catch(e) {
- console.log(e, dir + image);
+ console.log(e, file);
return;
}
- var canvases = generateFromRaw(canvas);
+ var canvases = extractSamples(canvas);
canvases.forEach(function(canvas) {
var name = Math.floor(Math.random() * 10000000000);
- var file = outdir + name + ".jpg";
+ var file = opts.outdir + "/" + name + ".jpg";
- utils.writeCanvasToFile(canvas, file, function() {
- console.log("wrote to", file)
- });
+ utils.writeCanvasToFileSync(canvas, file);
});
});
})
-function generateFromRaw(canvas) {
+function extractSamples(canvas, num) {
var min = 48;
var max = Math.min(canvas.width, canvas.height);
- var canvases = _.range(0, perFile).map(function() {
+ var canvases = [];
+ for (var i = 0; i < num; i++) {
var length = Math.max(48, Math.ceil(Math.random() * max));
var x = Math.floor(Math.random() * (max - length));
var y = Math.floor(Math.random() * (max - length));
- return cropCanvas(canvas, x, y, length, length);
- })
+ canvases.push(cropCanvas(canvas, x, y, length, length));
+ }
return canvases;
}
@@ -1,9 +1,7 @@
var fs = require("fs"),
path = require("path"),
- async = require("async"),
nomnom = require("nomnom"),
Canvas = require("canvas"),
- cropper = require("./cropper"),
utils = require("../../utils");
var opts = nomnom.options({
Oops, something went wrong.

0 comments on commit 56a4ce0

Please sign in to comment.