diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3c3629e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+node_modules
diff --git a/Cakefile b/Cakefile
new file mode 100644
index 0000000..c4b3504
--- /dev/null
+++ b/Cakefile
@@ -0,0 +1,236 @@
+task 'build:apps', 'Build apps script JS file', -> # registers a cake task that spawns the coffee executable
+ if_coffee -> # NOTE(review): if_coffee is not defined anywhere in this Cakefile — confirm where it should come from
+ ps = spawn("coffee", ["--output", JAVASCRIPTS_PATH,"-- compile",COFFEESCRIPTS_PATH]) # NOTE(review): JAVASCRIPTS_PATH and COFFEESCRIPTS_PATH are not defined in this Cakefile; "-- compile" is passed as a single argument containing a space — presumably "--compile" was intended, confirm
+ ps.stdout.on('data', log) # log is called with the data chunk only, so its color argument is undefined here
+ ps.stderr.on('data', log)
+ ps.on 'exit', (code)->
+ if code != 0
+ console.log 'failed'# ** Cakefile Template ** is a Template for a common Cakefile that you may use in a coffeescript nodejs project.
+#
+# It comes baked in with 5 tasks:
+#
+# * build - compiles your src directory to your lib directory
+# * watch - watches any changes in your src directory and automatically compiles to the lib directory
+# * test - runs mocha test framework, you can edit this task to use your favorite test framework
+# * docs - generates annotated documentation using docco
+# * clean - clean generated .js files
+files = [
+ 'lib'
+ 'src'
+]
+
+fs = require 'fs'
+{print} = require 'util'
+{spawn, exec} = require 'child_process'
+
+try
+ which = require('which').sync # optional dependency; launch uses it to resolve a command name when it is available
+catch err
+ if process.platform.match(/^win/)? # only warn on platforms whose name starts with "win"
+ console.log 'WARNING: the which module is required for windows\ntry: npm install which'
+ which = null # launch tests this value and spawns the bare command name when it is null
+
+# ANSI Terminal Colors
+bold = '\x1b[0;1m'
+green = '\x1b[0;32m'
+reset = '\x1b[0m'
+red = '\x1b[0;31m'
+
+# Cakefile Tasks
+#
+# ## *docs*
+#
+# Generates annotated documentation by running docco over the files found under src
+#
+# Usage
+#
+# ```
+# cake docs
+# ```
+task 'docs', 'generate documentation', -> docco()
+
+# ## *build*
+#
+# Compiles the source with coffee and logs a green ":)" when the compiler exits with status 0
+#
+# Usage
+#
+# ```
+# cake build
+# ```
+task 'build', 'compile source', -> build -> log ":)", green
+
+# ## *watch*
+#
+# Compiles the source with the watch flag set, so it is rebuilt whenever it changes
+#
+# Usage
+#
+# ```
+# cake watch
+# ```
+task 'watch', 'compile and watch', -> build true, -> log ":-)", green
+
+# ## *test*
+#
+# Builds the source and, once the build succeeds, runs the mocha test suite.
+#
+# Usage
+#
+# ```
+# cake test
+# ```
+task 'test', 'run tests', -> build -> mocha -> log ":)", green
+
+# ## *clean*
+#
+# Removes the .js counterpart of every .coffee file found under the entries of `files`
+#
+# Usage
+#
+# ```
+# cake clean
+# ```
+task 'clean', 'clean generated files', -> clean -> log ";)", green
+
+
+# Internal Functions
+#
+# ## *walk*
+#
+# **given** string as dir which represents a directory in relation to local directory
+# **and** callback as done in the form of (err, results)
+# **then** recurse through directory returning an array of files
+#
+# Examples
+#
+# ``` coffeescript
+# walk 'src', (err, results) -> console.log results
+# ```
+walk = (dir, done) ->
+ results = [] # accumulates the paths of every non-directory entry found below dir
+ fs.readdir dir, (err, list) ->
+ return done(err, []) if err # a failed readdir reports the error together with an empty list
+ pending = list.length # number of entries that still have to report before done may fire
+ return done(null, results) unless pending
+ for name in list
+ file = "#{dir}/#{name}"
+ try
+ stat = fs.statSync file
+ catch err
+ stat = null # entries that cannot be stat'ed fall through to the plain-file branch below
+ if stat?.isDirectory()
+ walk file, (err, res) -> # NOTE(review): err from the nested walk is ignored; res is [] in that case
+ results.push name for name in res
+ done(null, results) unless --pending # fires once the last outstanding entry has reported
+ else
+ results.push file
+ done(null, results) unless --pending
+
+# ## *log*
+#
+# **given** string as a message
+# **and** string as a color (one of the ANSI escape sequences defined above, e.g. green or red)
+# **and** optional string as an explanation
+# **then** prints color + message + reset, a space and the explanation (or '') via console.log.
+#
+log = (message, color, explanation) -> console.log color + message + reset + ' ' + (explanation or '')
+
+# ## *launch*
+#
+# **given** string as a cmd
+# **and** optional array of option flags
+# **and** optional callback
+# **then** spawn cmd with options
+# **and** pipe to process stdout and stderr respectively
+# **and** on child process exit emit callback if set and status is 0
+launch = (cmd, options=[], callback) ->
+  executable = if which then which(cmd) else cmd  # resolve through `which` only when that module was loaded
+  child = spawn executable, options
+  child.stdout.pipe process.stdout
+  child.stderr.pipe process.stderr
+  child.on 'exit', (status) -> callback?() if status is 0  # the callback fires only for a zero exit status
+
+# ## *build*
+#
+# **given** optional boolean as watch
+# **and** optional function as callback
+# **then** invoke launch passing coffee command
+# **and** defaulted options to compile src to lib
+build = (watch, callback) ->
+  # Short form build(callback): the single argument is the callback and
+  # watching stays off.
+  [watch, callback] = [false, watch] if typeof watch is 'function'
+
+  # coffee flags -c -b -o followed by the entries of `files`; -w is placed
+  # in front of them when watching was requested.
+  flags = (if watch then ['-w'] else []).concat ['-c', '-b', '-o'], files
+  launch 'coffee', flags, callback
+
+# ## *unlinkIfCoffeeFile*
+#
+# **given** string as file
+# **and** file ends in '.coffee'
+# **then** convert '.coffee' to '.js'
+# **and** remove the result
+unlinkIfCoffeeFile = (file) ->
+  return false unless file.match /\.coffee$/  # not a .coffee file: nothing is removed
+  # fs.unlink is given a callback because current Node versions throw without one; a missing .js file is not an error.
+  fs.unlink file.replace(/\.coffee$/, '.js'), (err) -> log err.message, red if err and err.code isnt 'ENOENT'
+  true  # the name ended in .coffee and removal of its .js counterpart was requested
+
+# ## *clean*
+#
+# **given** optional function as callback
+# **then** loop through files variable
+# **and** call unlinkIfCoffeeFile on each
+clean = (callback) ->
+ try
+ for file in files
+ unless unlinkIfCoffeeFile file # entries that are not .coffee files themselves are walked as directories
+ walk file, (err, results) -> # NOTE(review): err is ignored; walk passes [] as results when readdir fails
+ for f in results
+ unlinkIfCoffeeFile f
+
+ callback?() # NOTE(review): runs right after the loop, i.e. before the asynchronous walk callbacks have removed anything
+ catch err # NOTE(review): anything thrown synchronously above is silently swallowed
+
+# ## *moduleExists*
+#
+# **given** name for module
+# **when** trying to require module
+# **and** not found
+# **then** print not found message with install helper in red
+# **and** return false if not found
+moduleExists = (name) ->
+  try
+    return require name  # success: hand back the loaded module itself
+  catch missing
+    log "#{name} required: npm install #{name}", red
+  false
+
+
+# ## *mocha*
+#
+# **given** optional array of option flags
+# **and** optional function as callback
+# **then** invoke launch passing mocha command
+mocha = (options, callback) ->
+  # Short form mocha(callback): the only argument is the callback.
+  if typeof options is 'function'
+    callback = options
+    options = null
+  # Build a fresh argument list: a missing options array no longer crashes
+  # on push, and the caller's array is left unmodified.
+  args = (options or []).concat ['--compilers', 'coffee:coffee-script']
+
+  launch 'mocha', args, callback
+
+# ## *docco*
+#
+# **given** optional function as callback
+# **then** invoke launch passing docco command
+docco = (callback) ->
+  # Collect every file below src and pass the whole list to the docco executable.
+  walk 'src', (err, sources) -> launch 'docco', sources, callback
+
diff --git a/README.md b/README.md
index e0bad47..260db21 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,12 @@
-# Tesseract for node.js
+# Tesseract for Node.js
-A simple wrapper for the Tesseract OCR package for node.js
+A simple wrapper for the Tesseract OCR package for Node.js
## Installation
-npm install nodecr
+`npm install nodecr`
## Versions
+* **0.0.5**: Added the ability to override the logger (e.g. to quiet nodecr; see the example below)
* **0.0.4**: Changed name to nodecr and published node module (formerly node-tesseract)
* **0.0.3**: Added support for custom preprocessors, OTB Preprocessor using ImageMagick 'convert'
* **0.0.2**: Refactored to support tesseract 3.01, added language parameter, config parameter, documentation
@@ -37,7 +38,10 @@ nodecr.process(__dirname + '/path/to/image.jpg',function(err, text) {
// Recognise text of any language in any format but preprocess the image
// with ImageMagick 'convert' (This requires ImageMagick to be installed)
-// You can write and use your own preprocessors easily, just have a look at lib/nodecr.js
+// uncomment this to quiet nodecr
+//nodecr.log = function() {};
+
+// You can write and use your own preprocessors easily, just have a look at src/nodecr.coffee
nodecr.process(__dirname + '/path/to/image.jpg',function(err, text) {
if(err) {
console.error(err);
@@ -46,4 +50,7 @@ nodecr.process(__dirname + '/path/to/image.jpg',function(err, text) {
}
console.log(text);
}, null, null, null, nodecr.preprocessors.convert);
-```
\ No newline at end of file
+```
+
+## License
+MIT
\ No newline at end of file
diff --git a/lib/nodecr.js b/lib/nodecr.js
index b00497a..3657406 100644
--- a/lib/nodecr.js
+++ b/lib/nodecr.js
@@ -1,169 +1,172 @@
-var exec = require('child_process').exec,
- fs = require('fs'),
- tmp = require('tmp');
+// Generated by CoffeeScript 1.4.0
+var Tesseract, exec, fs, tesseract, tmp;
-/**
-* Attention: Tesseract 3.01 or higher is needed for this to work
+exec = require('child_process').exec;
+
+fs = require('fs');
+
+tmp = require('tmp');
+
+/*
+Attention: Tesseract 3.01 or higher is needed for this to work
*/
-var tesseract = {
-
- /**
- *
- * @param image Can be any format that your installed Leptonica library can process
- * (additional libraries might be required by Leptonica)
- *
- * @param callback A function pointer
- * this function is called after the recognition has taken place
- * with a possible error as first and the resulting recognized text as second parameter
- *
- * @param languageCode (Optional) a language code for the language to recognise
- * see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz)
- * any language you pass as an argument here must be unzipped into the tessdata directory beforehand
- *
- * @param pageSegMode (Optional) The page segmentation mode.
- * As of March 4, 2012 tesseract supports the following options:
- *
- * 0 = Orientation and script detection (OSD) only.
- * 1 = Automatic page segmentation with OSD.
- * 2 = Automatic page segmentation, but no OSD, or OCR
- * 3 = Fully automatic page segmentation, but no OSD. (Default)
- * 4 = Assume a single column of text of variable sizes.
- * 5 = Assume a single uniform block of vertically aligned text.
- * 6 = Assume a single uniform block of text.
- * 7 = Treat the image as a single text line.
- * 8 = Treat the image as a single word.
- * 9 = Treat the image as a single word in a circle.
- * 10 = Treat the image as a single character.
- *
- * See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options
- *
- * @param config (Optional) A config file name
+
+
+Tesseract = (function() {
+ var ConvertPreprocessor;
+
+ function Tesseract() {}
+
+ /*
+ @param image Can be any format that your installed Leptonica library can process
+ (additional libraries might be required by Leptonica)
+
+ @param callback A function pointer
+ this function is called after the recognition has taken place
+ with a possible error as first and the resulting recognized text as second parameter
+
+ @param languageCode (Optional) a language code for the language to recognise
+ see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz)
+ any language you pass as an argument here must be unzipped into the tessdata directory beforehand
+
+ @param pageSegMode (Optional) The page segmentation mode.
+ As of March 4, 2012 tesseract supports the following options:
+
+ 0 = Orientation and script detection (OSD) only.
+ 1 = Automatic page segmentation with OSD.
+ 2 = Automatic page segmentation, but no OSD, or OCR
+ 3 = Fully automatic page segmentation, but no OSD. (Default)
+ 4 = Assume a single column of text of variable sizes.
+ 5 = Assume a single uniform block of vertically aligned text.
+ 6 = Assume a single uniform block of text.
+ 7 = Treat the image as a single text line.
+ 8 = Treat the image as a single word.
+ 9 = Treat the image as a single word in a circle.
+ 10 = Treat the image as a single character.
+
+ See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options
+
+ @param config (Optional) A config file name
*/
- process: function process(image, callback, languageCode, pageSegMode, config, preprocessor) {
- (preprocessor || tesseract.preprocessor)(image, function(err, processedImage, cleanup) {
- if(err) {
- // error in preprocessor
+
+
+ Tesseract.prototype.process = function(image, callback, languageCode, pageSegMode, config, preprocessor) {
+ var _this = this;
+ return (preprocessor || this.preprocessor)(image, function(err, processedImage, cleanup) {
+ var f;
+ if (err) {
callback(err, null);
return;
}
- tesseract._runTesseract(processedImage, function(err, text) {
- if(typeof cleanup == 'function') {
- console.log("node-tesseract: Preprocessor cleanup");
+ f = function(err, text) {
+ if (cleanup != null) {
+ _this.log("node-tesseract: Preprocessor cleanup");
cleanup();
}
callback(err, text);
- }, languageCode, pageSegMode, config);
+ };
+ _this._runTesseract(processedImage, f, languageCode, pageSegMode, config);
});
- },
+ };
- _runTesseract: function(image, callback, languageCode, pageSegMode, config) {
- // generate output file name
+ Tesseract.prototype._runTesseract = function(image, callback, languageCode, pageSegMode, config) {
+ var _this = this;
tmp.tmpName(function(err, output) {
- if(err) {
- // Something went wrong when generating the temporary filename
+ var command;
+ if (err) {
callback(err, null);
return;
}
-
- // assemble tesseract command
- var command = [tesseract.binary, image, output];
-
- if(languageCode) {
+ command = [_this.binary, image, output];
+ if (languageCode) {
command.push('-l');
command.push(languageCode);
}
- if(typeof pageSegMode != 'undefined' && pageSegMode !== null) {
+ if (typeof pageSegMode !== 'undefined' && pageSegMode !== null) {
command.push('-psm');
command.push(pageSegMode);
}
- if(config) {
+ if (config) {
command.push(config);
}
-
command = command.join(' ');
-
- // Run the tesseract command
- console.log("node-tesseract: Running '" + command + "'");
- exec(command, function(err, stdout, stderr){
- if(err) {
- // Something went wrong executing the assembled command
+ _this.log("node-tesseract: Running '" + command + "'");
+ exec(command, function(err, stdout, stderr) {
+ var outputFile;
+ if (err) {
callback(err, null);
return;
}
-
- var outputFile = output + '.txt';
+ outputFile = output + '.txt';
fs.readFile(outputFile, function(err, data) {
- if(!err) {
- // There was no error, so get the text
- data = data.toString(tesseract.outputEncoding);
+ if (!err) {
+ data = data.toString(_this.outputEncoding);
}
- console.log("node-tesseract: Deleting '"+outputFile+"'");
- fs.unlink(outputFile, function (err) {
- // ignore any errors here as it just means we have a temporary file left somewehere
- });
-
- // We got the result (or an error)
+ _this.log("node-tesseract: Deleting '" + outputFile + "'");
+ fs.unlink(outputFile, function(err) {});
callback(err, data);
- }); // end reaFile
-
- }); // end exec
-
- }); // end output filename
- },
-
- /**
- * A no-op preprocessor
- *
- * @param inputFile The file to process
- * @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file))
- **/
- preprocessor: function(inputFile, callback) {
- // the default preprocessor does nothing...
- var error = null,
- outputFile = inputFile,
- cleanup = function() {
- // clean up here
- // this gets called after the preprocessed image has been used
- };
- callback(error,outputFile,cleanup);
- },
- binary: 'tesseract',
- outputEncoding: 'UTF-8'
-}
-
-// OTB preprocessors
-
-var ConvertPreprocessor = function(inputFile, callback) {
- console.log("node-tesseract: preprocessor: convert: Processing '"+inputFile+"'");
- tmp.tmpName({postfix: '.tif'}, function(err, outputFile) {
- if(err) {
- // Something went wrong when generating the temporary filename
- callback(err, null);
- return;
+ });
+ });
+ });
+ };
+
+ /*
+ A no-op preprocessor
+
+ @param inputFile The file to process
+ @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file))
+ */
+
+
+ Tesseract.prototype.preprocessor = function(inputFile, callback) {
+ var cleanup, error, outputFile;
+ error = null;
+ outputFile = inputFile;
+ cleanup = function() {};
+ callback(error, outputFile, cleanup);
+ };
+
+ Tesseract.prototype.log = function() {
+ console.log.apply(console, arguments);
+ };
+
+ Tesseract.prototype.binary = 'tesseract';
+
+ Tesseract.prototype.outputEncoding = 'UTF-8';
+
+ Tesseract.prototype.preprocessors = {
+ convert: ConvertPreprocessor = function(inputFile, callback) {
+ tesseract.log("node-tesseract: preprocessor: convert: Processing '" + inputFile + "'");
+ tmp.tmpName({
+ postfix: '.tif'
+ }, function(err, outputFile) {
+ var command;
+ if (err) {
+ callback(err, null);
+ return;
+ }
+ command = ['convert', '-type', 'Grayscale', '-resize', '200%', '-sharpen', '10', inputFile, outputFile].join(' ');
+ tesseract.log("node-tesseract: preprocessor: convert: Running '" + command + "'");
+ exec(command, function(err, stdout, stderr) {
+ var cleanup;
+ if (err) {
+ callback(err, null);
+ } else {
+ cleanup = function() {
+ tesseract.log("node-tesseract: preprocessor: convert: Deleting '" + outputFile + "'");
+ fs.unlink(outputFile, function(err) {});
+ };
+ callback(null, outputFile, cleanup);
+ }
+ });
+ });
}
-
- var command = ['convert', '-type','Grayscale', '-resize','200%', '-sharpen','10', inputFile, outputFile].join(' ');
- console.log("node-tesseract: preprocessor: convert: Running '"+command+"'");
- exec(command, function(err, stdout, stderr){
- if(err) {
- // Something went wrong executing the convert command
- callback(err, null);
- } else {
- var cleanup = function() {
- console.log("node-tesseract: preprocessor: convert: Deleting '"+outputFile+"'");
- fs.unlink(outputFile, function (err) {
- // ignore any errors here as it just means we have a temporary file left somewehere
- });
- };
- callback(null, outputFile, cleanup);
- }
- }); // end exec
- }); // end output filename generation
-};
+ };
+
+ return Tesseract;
+
+})();
-// Exports
+tesseract = new Tesseract;
-module.exports.process = tesseract.process;
-module.exports.preprocessors = {
- convert: ConvertPreprocessor
-};
\ No newline at end of file
+module.exports = tesseract;
diff --git a/package.json b/package.json
index 7ffd65b..d9cb439 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "nodecr",
- "version": "0.0.4",
+ "version": "0.0.5",
"author": "Joscha Feth ",
"description": "A simple wrapper for the Tesseract OCR package",
"main": "./lib/nodecr.js",
diff --git a/src/nodecr.coffee b/src/nodecr.coffee
new file mode 100644
index 0000000..0f1ec33
--- /dev/null
+++ b/src/nodecr.coffee
@@ -0,0 +1,173 @@
+exec = require('child_process').exec
+fs = require 'fs'
+tmp = require 'tmp'
+
+###
+Attention: Tesseract 3.01 or higher is needed for this to work
+###
+class Tesseract
+ # Wraps the tesseract command-line binary; this module exports a single instance of the class.
+ ###
+ @param image Can be any format that your installed Leptonica library can process
+ (additional libraries might be required by Leptonica)
+
+ @param callback A function pointer
+ this function is called after the recognition has taken place
+ with a possible error as first and the resulting recognized text as second parameter
+
+ @param languageCode (Optional) a language code for the language to recognise
+ see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz)
+ any language you pass as an argument here must be unzipped into the tessdata directory beforehand
+
+ @param pageSegMode (Optional) The page segmentation mode.
+ As of March 4, 2012 tesseract supports the following options:
+
+ 0 = Orientation and script detection (OSD) only.
+ 1 = Automatic page segmentation with OSD.
+ 2 = Automatic page segmentation, but no OSD, or OCR
+ 3 = Fully automatic page segmentation, but no OSD. (Default)
+ 4 = Assume a single column of text of variable sizes.
+ 5 = Assume a single uniform block of vertically aligned text.
+ 6 = Assume a single uniform block of text.
+ 7 = Treat the image as a single text line.
+ 8 = Treat the image as a single word.
+ 9 = Treat the image as a single word in a circle.
+ 10 = Treat the image as a single character.
+
+ See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options
+
+ @param config (Optional) A config file name
+ ###
+ process: (image, callback, languageCode, pageSegMode, config, preprocessor) -> # preprocessor (optional) replaces @preprocessor for this call
+ (preprocessor or @preprocessor) image, (err, processedImage, cleanup) =>
+ if err
+ # error in preprocessor
+ callback err, null
+ return
+
+ f = (err, text) => # handed to _runTesseract, which calls it with the OCR result
+ if cleanup? # only rules out null/undefined — a cleanup value that is not a function would throw on the call below
+ @log "node-tesseract: Preprocessor cleanup"
+ cleanup()
+ callback err, text
+ return
+
+ @_runTesseract processedImage, f, languageCode, pageSegMode, config
+ return
+
+ _runTesseract: (image, callback, languageCode, pageSegMode, config) ->
+ # Builds and executes the tesseract command line, then reads the result file and passes (err, text) to callback.
+ # generate output file name
+ tmp.tmpName (err, output) =>
+ if err
+ # Something went wrong when generating the temporary filename
+ callback err, null
+ return
+
+ # assemble tesseract command
+ command = [
+ @binary
+ image
+ output
+ ]
+ if languageCode
+ command.push '-l'
+ command.push languageCode
+
+ if typeof pageSegMode isnt 'undefined' and pageSegMode isnt null
+ command.push '-psm'
+ command.push pageSegMode
+
+ command.push config if config
+
+ command = command.join ' ' # NOTE(review): arguments are joined unquoted and handed to exec, which runs them through a shell — paths with spaces or shell metacharacters will break; confirm inputs are trusted
+
+ # Run the tesseract command
+ @log "node-tesseract: Running '#{command}'"
+ exec command, (err, stdout, stderr) =>
+ if err
+
+ # Something went wrong executing the assembled command
+ callback err, null
+ return
+ outputFile = output + '.txt' # the result is read from the temporary name plus '.txt'
+ fs.readFile outputFile, (err, data) =>
+
+ # There was no error, so get the text
+ data = data.toString @outputEncoding unless err
+ @log "node-tesseract: Deleting '#{outputFile}'"
+ fs.unlink outputFile, (err) ->
+ # ignore any errors here as it just means we have a temporary file left somewhere
+
+ # We got the result (or an error)
+ callback err, data
+ return
+ return
+ return
+ return
+
+ ###
+ A no-op preprocessor
+
+ @param inputFile The file to process
+ @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file), 3rd a cleanup function)
+ ###
+ preprocessor: (inputFile, callback) ->
+ # the default preprocessor does nothing...
+ error = null
+ outputFile = inputFile
+ cleanup = ->
+ # clean up here
+ return
+
+ # cleanup is invoked by process once the preprocessed image has been used
+ callback error, outputFile, cleanup
+ return
+
+ log: -> # forwards all arguments to console.log; assign another function to the exported instance to silence or redirect output
+ console.log.apply console, arguments
+ return
+
+ binary: 'tesseract' # first element of the command line assembled in _runTesseract
+ outputEncoding: 'UTF-8' # passed to toString when the result file is decoded
+ preprocessors:
+ convert: ConvertPreprocessor = (inputFile, callback) -> # logs through the module-level tesseract instance, not through this
+ tesseract.log "node-tesseract: preprocessor: convert: Processing '#{inputFile}'"
+ tmp.tmpName postfix: '.tif', (err, outputFile) ->
+ if err
+ # Something went wrong when generating the temporary filename
+ callback err, null
+ return
+
+ command = [
+ 'convert'
+ '-type'
+ 'Grayscale'
+ '-resize'
+ '200%'
+ '-sharpen'
+ '10'
+ inputFile
+ outputFile
+ ].join ' '
+ tesseract.log "node-tesseract: preprocessor: convert: Running '#{command}'"
+ exec command, (err, stdout, stderr) ->
+ if err
+ # Something went wrong executing the convert command
+ callback err, null
+ else
+ cleanup = ->
+ tesseract.log "node-tesseract: preprocessor: convert: Deleting '#{outputFile}'"
+ fs.unlink outputFile, (err) ->
+ # ignore any errors here as it just means we have a temporary file left somewhere
+ return
+ return
+
+ callback null, outputFile, cleanup
+ return
+ return
+ return
+
+tesseract = new Tesseract
+# Exports
+module.exports = tesseract
\ No newline at end of file
diff --git a/test/image.png b/test/image.png
new file mode 100644
index 0000000..8d30d29
Binary files /dev/null and b/test/image.png differ
diff --git a/test/test.js b/test/test.js
new file mode 100644
index 0000000..bdaacbe
--- /dev/null
+++ b/test/test.js
@@ -0,0 +1,13 @@
+var nodecr = require('../lib/nodecr');
+
+// uncomment this to quiet nodecr
+//nodecr.log = function() {};
+
+nodecr.process(__dirname + '/image.png',function(err, text) {
+ if(err) {
+ console.error(err);
+ } else {
+ console.log("Result:");
+ console.log(text);
+ }
+}, null, null, null, nodecr.preprocessors.convert);
\ No newline at end of file