diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c3629e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules diff --git a/Cakefile b/Cakefile new file mode 100644 index 0000000..c4b3504 --- /dev/null +++ b/Cakefile @@ -0,0 +1,236 @@
+# ## *build:apps*
+#
+# **given** the coffee compiler
+# **then** compile COFFEESCRIPTS_PATH into JAVASCRIPTS_PATH
+# **and** print 'failed' when the compiler exits with a non-zero status
+#
+# NOTE(review): `if_coffee`, `JAVASCRIPTS_PATH` and `COFFEESCRIPTS_PATH` are not
+# defined anywhere in this Cakefile - confirm where they are meant to come from.
+task 'build:apps', 'Build apps script JS file', ->
+  if_coffee ->
+    # '--compile' has to be one flag; '-- compile' is not an option coffee understands
+    ps = spawn 'coffee', ['--output', JAVASCRIPTS_PATH, '--compile', COFFEESCRIPTS_PATH]
+    # forward the compiler output the same way `launch` does; `log` expects
+    # (message, color) and would print an 'undefined' prefix for raw data chunks
+    ps.stdout.pipe process.stdout
+    ps.stderr.pipe process.stderr
+    ps.on 'exit', (code) ->
+      console.log 'failed' if code isnt 0
+
+# ** Cakefile Template ** is a Template for a common Cakefile that you may use in a coffeescript nodejs project.
+#
+# It comes baked in with 5 tasks:
+#
+# * build - compiles your src directory to your lib directory
+# * watch - watches any changes in your src directory and automatically compiles to the lib directory
+# * test - runs mocha test framework, you can edit this task to use your favorite test framework
+# * docs - generates annotated documentation using docco
+# * clean - clean generated .js files
+files = [
+  'lib'
+  'src'
+]
+
+fs = require 'fs'
+{print} = require 'util'
+{spawn, exec} = require 'child_process'
+
+try
+  which = require('which').sync
+catch err
+  if process.platform.match(/^win/)?
+    console.log 'WARNING: the which module is required for windows\ntry: npm install which'
+  which = null
+
+# ANSI Terminal Colors
+bold = '\x1b[0;1m'
+green = '\x1b[0;32m'
+reset = '\x1b[0m'
+red = '\x1b[0;31m'
+
+# Cakefile Tasks
+#
+# ## *docs*
+#
+# Generate Annotated Documentation
+#
+# Usage
+#
+# ```
+# cake docs
+# ```
+task 'docs', 'generate documentation', -> docco()
+
+# ## *build*
+#
+# Builds Source
+#
+# Usage
+#
+# ```
+# cake build
+# ```
+task 'build', 'compile source', -> build -> log ":)", green
+
+# ## *watch*
+#
+# Builds your source whenever it changes
+#
+# Usage
+#
+# ```
+# cake watch
+# ```
+task 'watch', 'compile and watch', -> build true, -> log ":-)", green
+
+# ## *test*
+#
+# Runs your test suite.
+#
+# Usage
+#
+# ```
+# cake test
+# ```
+task 'test', 'run tests', -> build -> mocha -> log ":)", green
+
+# ## *clean*
+#
+# Cleans up generated js files
+#
+# Usage
+#
+# ```
+# cake clean
+# ```
+task 'clean', 'clean generated files', -> clean -> log ";)", green
+
+
+# Internal Functions
+#
+# ## *walk*
+#
+# **given** string as dir, a directory path relative to the local directory
+# **and** callback as done in the form of (err, results)
+# **then** list dir recursively
+# **and** call done once with the array of file paths found below it
+#
+# Examples
+#
+# ``` coffeescript
+# walk 'src', (err, results) -> console.log results
+# ```
+walk = (dir, done) ->
+  collected = []
+  fs.readdir dir, (err, entries) ->
+    return done(err, []) if err
+    outstanding = entries.length
+    return done(null, collected) if outstanding is 0
+
+    # every entry reports here exactly once; the last one hands back the list
+    settle = ->
+      outstanding -= 1
+      done(null, collected) if outstanding is 0
+
+    entries.forEach (entry) ->
+      fullPath = "#{dir}/#{entry}"
+      # an entry that cannot be stat'ed is treated like a plain file
+      isDirectory = false
+      try
+        isDirectory = fs.statSync(fullPath).isDirectory()
+      catch statError
+        isDirectory = false
+
+      unless isDirectory
+        collected.push fullPath
+        return settle()
+
+      # a sub-directory that cannot be read contributes an empty list
+      walk fullPath, (subErr, nested) ->
+        collected.push item for item in nested
+        settle()
+
+# ## *log*
+#
+# **given** string as a message
+# **and** string as a color
+# **and** optional string as an explanation
+# **then** builds a statement and logs to console.
+#
+log = (message, color, explanation) -> console.log color + message + reset + ' ' + (explanation or '')
+
+# ## *launch*
+#
+# **given** string as a cmd
+# **and** optional array and option flags
+# **and** optional callback
+# **then** spawn cmd with options
+# **and** pipe to process stdout and stderr respectively
+# **and** on child process exit emit callback if set and status is 0
+launch = (cmd, options=[], callback) ->
+  cmd = which(cmd) if which
+  app = spawn cmd, options
+  app.stdout.pipe(process.stdout)
+  app.stderr.pipe(process.stderr)
+  app.on 'exit', (status) -> callback?() if status is 0
+
+# ## *build*
+#
+# **given** optional boolean as watch
+# **and** optional function as callback
+# **then** invoke launch passing coffee command
+# **and** defaulted options to compile src to lib
+build = (watch, callback) ->
+  if typeof watch is 'function'
+    callback = watch
+    watch = false
+
+  options = ['-c', '-b', '-o' ]
+  options = options.concat files
+  options.unshift '-w' if watch
+  launch 'coffee', options, callback
+
+# ## *unlinkIfCoffeeFile*
+#
+# **given** string as file
+# **and** optional function as done, called once the removal has been attempted
+# **and** file ends in '.coffee'
+# **then** convert '.coffee' to '.js'
+# **and** remove the result
+# **and** return true, or false (without calling done) for any other file
+unlinkIfCoffeeFile = (file, done) ->
+  return false unless /\.coffee$/.test file
+  # fs.unlink is always given a callback; a missing .js file only means
+  # there is nothing to clean, every other failure is reported
+  fs.unlink file.replace(/\.coffee$/, '.js'), (err) ->
+    if err and err.code isnt 'ENOENT'
+      log "could not remove generated file for #{file}: #{err.message}", red
+    done?()
+  true
+
+# ## *clean*
+#
+# **given** optional function as callback
+# **then** loop through files variable
+# **and** call unlinkIfCoffeeFile on each, walking directories recursively
+# **and** invoke callback only after every removal has been attempted
+clean = (callback) ->
+  remaining = files.length
+  return callback?() if remaining is 0
+
+  # one call per entry of `files`; the last one reports completion
+  rootDone = ->
+    remaining -= 1
+    callback?() if remaining is 0
+
+  files.forEach (root) ->
+    return if unlinkIfCoffeeFile root, rootDone
+    walk root, (err, results) ->
+      # walk yields an empty list for an unreadable or missing directory
+      return rootDone() if err or results.length is 0
+      left = results.length
+      fileDone = ->
+        left -= 1
+        rootDone() if left is 0
+      for f in results
+        # files that are not .coffee sources count as done immediately
+        fileDone() unless unlinkIfCoffeeFile f, fileDone
+      return
+
+# ## *moduleExists*
+#
+# **given** name for module
+# **when** trying to require module
+# **and** not found
+# **then** print not found message with install helper in red
+# **and** return false if not found
+moduleExists = (name) ->
+  try
+    require name
+  catch err
+    log "#{name} required: npm install #{name}", red
+    false
+
+
+# ## 
*mocha*
+#
+# **given** optional array of option flags
+# **and** optional function as callback
+# **then** append the coffee-script compiler directive to a copy of the flags
+# **and** invoke launch passing mocha command
+mocha = (options, callback) ->
+  #if moduleExists('mocha')
+  if typeof options is 'function'
+    callback = options
+    options = []
+  # add coffee directive; `mocha()` without flags must not fail on
+  # `undefined.push`, and concat leaves the caller's array unmodified
+  options = (options or []).concat ['--compilers', 'coffee:coffee-script']
+
+  launch 'mocha', options, callback
+
+# ## *docco*
+#
+# **given** optional function as callback
+# **then** collect the files below src
+# **and** invoke launch passing docco command
+# **and** report in red when src cannot be read
+docco = (callback) ->
+  #if moduleExists('docco')
+  walk 'src', (err, sources) ->
+    # without a readable src directory there is nothing to document
+    return log("docs: #{err.message}", red) if err
+    launch 'docco', sources, callback
+
diff --git a/README.md b/README.md index e0bad47..260db21 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ -# Tesseract for node.js +# Tesseract for Node.js -A simple wrapper for the Tesseract OCR package for node.js +A simple wrapper for the Tesseract OCR package for Node.js ## Installation -npm install nodecr +`npm install nodecr` ## Versions +* **0.0.5**: Add possibility to override logger (quiet nodecr, see example below) * **0.0.4**: Changed name to nodecr and published node module (formerly node-tesseract) * **0.0.3**: Added support for custom preprocessors, OTB Preprocessor using ImageMagick 'convert' * **0.0.2**: Refactored to support tesseract 3.01, added language parameter, config parameter, documentation @@ -37,7 +38,10 @@ nodecr.process(__dirname + '/path/to/image.jpg',function(err, text) { // Recognise text of any language in any format but preprocess the image // with ImageMagick 'convert' (This requires ImageMagick to be installed) -// You can write and use your own preprocessors easily, just have a look at lib/nodecr.js +// uncomment this to quiet nodecr +//nodecr.log = function() {}; + +// You can write and use your own preprocessors easily, just have a look at src/nodecr.coffee nodecr.process(__dirname + '/path/to/image.jpg',function(err, text) { if(err) { console.error(err); @@ -46,4 +50,7 @@ nodecr.process(__dirname + 
'/path/to/image.jpg',function(err, text) { } console.log(text); }, null, null, null, nodecr.preprocessors.convert); -``` \ No newline at end of file +``` + +## License +MIT \ No newline at end of file diff --git a/lib/nodecr.js b/lib/nodecr.js index b00497a..3657406 100644 --- a/lib/nodecr.js +++ b/lib/nodecr.js @@ -1,169 +1,172 @@ -var exec = require('child_process').exec, - fs = require('fs'), - tmp = require('tmp'); +// Generated by CoffeeScript 1.4.0 +var Tesseract, exec, fs, tesseract, tmp; -/** -* Attention: Tesseract 3.01 or higher is needed for this to work +exec = require('child_process').exec; + +fs = require('fs'); + +tmp = require('tmp'); + +/* +Attention: Tesseract 3.01 or higher is needed for this to work */ -var tesseract = { - - /** - * - * @param image Can be any format that your installed Leptonica library can process - * (additional libraries might be required by Leptonica) - * - * @param callback A function pointer - * this function is called after the recognition has taken place - * with a possible error as first and the resulting recognized text as second parameter - * - * @param languageCode (Optional) a language code for the language to recognise - * see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz) - * any language you pass as an argument here must be unzipped into the tessdata directory beforehand - * - * @param pageSegMode (Optional) The page segmentation mode. - * As of March 4, 2012 tesseract supports the following options: - * - * 0 = Orientation and script detection (OSD) only. - * 1 = Automatic page segmentation with OSD. - * 2 = Automatic page segmentation, but no OSD, or OCR - * 3 = Fully automatic page segmentation, but no OSD. (Default) - * 4 = Assume a single column of text of variable sizes. - * 5 = Assume a single uniform block of vertically aligned text. - * 6 = Assume a single uniform block of text. - * 7 = Treat the image as a single text line. 
- * 8 = Treat the image as a single word. - * 9 = Treat the image as a single word in a circle. - * 10 = Treat the image as a single character. - * - * See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options - * - * @param config (Optional) A config file name + + +Tesseract = (function() { + var ConvertPreprocessor; + + function Tesseract() {} + + /* + @param image Can be any format that your installed Leptonica library can process + (additional libraries might be required by Leptonica) + + @param callback A function pointer + this function is called after the recognition has taken place + with a possible error as first and the resulting recognized text as second parameter + + @param languageCode (Optional) a language code for the language to recognise + see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz) + any language you pass as an argument here must be unzipped into the tessdata directory beforehand + + @param pageSegMode (Optional) The page segmentation mode. + As of March 4, 2012 tesseract supports the following options: + + 0 = Orientation and script detection (OSD) only. + 1 = Automatic page segmentation with OSD. + 2 = Automatic page segmentation, but no OSD, or OCR + 3 = Fully automatic page segmentation, but no OSD. (Default) + 4 = Assume a single column of text of variable sizes. + 5 = Assume a single uniform block of vertically aligned text. + 6 = Assume a single uniform block of text. + 7 = Treat the image as a single text line. + 8 = Treat the image as a single word. + 9 = Treat the image as a single word in a circle. + 10 = Treat the image as a single character. 
+ + See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options + + @param config (Optional) A config file name */ - process: function process(image, callback, languageCode, pageSegMode, config, preprocessor) { - (preprocessor || tesseract.preprocessor)(image, function(err, processedImage, cleanup) { - if(err) { - // error in preprocessor + + + Tesseract.prototype.process = function(image, callback, languageCode, pageSegMode, config, preprocessor) { + var _this = this; + return (preprocessor || this.preprocessor)(image, function(err, processedImage, cleanup) { + var f; + if (err) { callback(err, null); return; } - tesseract._runTesseract(processedImage, function(err, text) { - if(typeof cleanup == 'function') { - console.log("node-tesseract: Preprocessor cleanup"); + f = function(err, text) { + if (cleanup != null) { + _this.log("node-tesseract: Preprocessor cleanup"); cleanup(); } callback(err, text); - }, languageCode, pageSegMode, config); + }; + _this._runTesseract(processedImage, f, languageCode, pageSegMode, config); }); - }, + }; - _runTesseract: function(image, callback, languageCode, pageSegMode, config) { - // generate output file name + Tesseract.prototype._runTesseract = function(image, callback, languageCode, pageSegMode, config) { + var _this = this; tmp.tmpName(function(err, output) { - if(err) { - // Something went wrong when generating the temporary filename + var command; + if (err) { callback(err, null); return; } - - // assemble tesseract command - var command = [tesseract.binary, image, output]; - - if(languageCode) { + command = [_this.binary, image, output]; + if (languageCode) { command.push('-l'); command.push(languageCode); } - if(typeof pageSegMode != 'undefined' && pageSegMode !== null) { + if (typeof pageSegMode !== 'undefined' && pageSegMode !== null) { command.push('-psm'); command.push(pageSegMode); } - if(config) { + if (config) { command.push(config); } - command = 
command.join(' '); - - // Run the tesseract command - console.log("node-tesseract: Running '" + command + "'"); - exec(command, function(err, stdout, stderr){ - if(err) { - // Something went wrong executing the assembled command + _this.log("node-tesseract: Running '" + command + "'"); + exec(command, function(err, stdout, stderr) { + var outputFile; + if (err) { callback(err, null); return; } - - var outputFile = output + '.txt'; + outputFile = output + '.txt'; fs.readFile(outputFile, function(err, data) { - if(!err) { - // There was no error, so get the text - data = data.toString(tesseract.outputEncoding); + if (!err) { + data = data.toString(_this.outputEncoding); } - console.log("node-tesseract: Deleting '"+outputFile+"'"); - fs.unlink(outputFile, function (err) { - // ignore any errors here as it just means we have a temporary file left somewehere - }); - - // We got the result (or an error) + _this.log("node-tesseract: Deleting '" + outputFile + "'"); + fs.unlink(outputFile, function(err) {}); callback(err, data); - }); // end reaFile - - }); // end exec - - }); // end output filename - }, - - /** - * A no-op preprocessor - * - * @param inputFile The file to process - * @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file)) - **/ - preprocessor: function(inputFile, callback) { - // the default preprocessor does nothing... 
- var error = null, - outputFile = inputFile, - cleanup = function() { - // clean up here - // this gets called after the preprocessed image has been used - }; - callback(error,outputFile,cleanup); - }, - binary: 'tesseract', - outputEncoding: 'UTF-8' -} - -// OTB preprocessors - -var ConvertPreprocessor = function(inputFile, callback) { - console.log("node-tesseract: preprocessor: convert: Processing '"+inputFile+"'"); - tmp.tmpName({postfix: '.tif'}, function(err, outputFile) { - if(err) { - // Something went wrong when generating the temporary filename - callback(err, null); - return; + }); + }); + }); + }; + + /* + A no-op preprocessor + + @param inputFile The file to process + @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file)) + */ + + + Tesseract.prototype.preprocessor = function(inputFile, callback) { + var cleanup, error, outputFile; + error = null; + outputFile = inputFile; + cleanup = function() {}; + callback(error, outputFile, cleanup); + }; + + Tesseract.prototype.log = function() { + console.log.apply(console, arguments); + }; + + Tesseract.prototype.binary = 'tesseract'; + + Tesseract.prototype.outputEncoding = 'UTF-8'; + + Tesseract.prototype.preprocessors = { + convert: ConvertPreprocessor = function(inputFile, callback) { + tesseract.log("node-tesseract: preprocessor: convert: Processing '" + inputFile + "'"); + tmp.tmpName({ + postfix: '.tif' + }, function(err, outputFile) { + var command; + if (err) { + callback(err, null); + return; + } + command = ['convert', '-type', 'Grayscale', '-resize', '200%', '-sharpen', '10', inputFile, outputFile].join(' '); + tesseract.log("node-tesseract: preprocessor: convert: Running '" + command + "'"); + exec(command, function(err, stdout, stderr) { + var cleanup; + if (err) { + callback(err, null); + } else { + cleanup = function() { + tesseract.log("node-tesseract: preprocessor: convert: Deleting '" + outputFile + "'"); + 
fs.unlink(outputFile, function(err) {}); + }; + callback(null, outputFile, cleanup); + } + }); + }); } - - var command = ['convert', '-type','Grayscale', '-resize','200%', '-sharpen','10', inputFile, outputFile].join(' '); - console.log("node-tesseract: preprocessor: convert: Running '"+command+"'"); - exec(command, function(err, stdout, stderr){ - if(err) { - // Something went wrong executing the convert command - callback(err, null); - } else { - var cleanup = function() { - console.log("node-tesseract: preprocessor: convert: Deleting '"+outputFile+"'"); - fs.unlink(outputFile, function (err) { - // ignore any errors here as it just means we have a temporary file left somewehere - }); - }; - callback(null, outputFile, cleanup); - } - }); // end exec - }); // end output filename generation -}; + }; + + return Tesseract; + +})(); -// Exports +tesseract = new Tesseract; -module.exports.process = tesseract.process; -module.exports.preprocessors = { - convert: ConvertPreprocessor -}; \ No newline at end of file +module.exports = tesseract; diff --git a/package.json b/package.json index 7ffd65b..d9cb439 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "nodecr", - "version": "0.0.4", + "version": "0.0.5", "author": "Joscha Feth ", "description": "A simple wrapper for the Tesseract OCR package", "main": "./lib/nodecr.js", diff --git a/src/nodecr.coffee b/src/nodecr.coffee new file mode 100644 index 0000000..0f1ec33 --- /dev/null +++ b/src/nodecr.coffee @@ -0,0 +1,173 @@ +exec = require('child_process').exec +fs = require 'fs' +tmp = require 'tmp' + +### +Attention: Tesseract 3.01 or higher is needed for this to work +### +class Tesseract + + ### + @param image Can be any format that your installed Leptonica library can process + (additional libraries might be required by Leptonica) + + @param callback A function pointer + this function is called after the recognition has taken place + with a possible error as first and the resulting recognized text 
as second parameter
+
+  @param languageCode (Optional) a language code for the language to recognise
+    see http://code.google.com/p/tesseract-ocr/downloads/list for available languages (xxx.traineddata.gz)
+    any language you pass as an argument here must be unzipped into the tessdata directory beforehand
+
+  @param pageSegMode (Optional) The page segmentation mode.
+    As of March 4, 2012 tesseract supports the following options:
+
+    0 = Orientation and script detection (OSD) only.
+    1 = Automatic page segmentation with OSD.
+    2 = Automatic page segmentation, but no OSD, or OCR
+    3 = Fully automatic page segmentation, but no OSD. (Default)
+    4 = Assume a single column of text of variable sizes.
+    5 = Assume a single uniform block of vertically aligned text.
+    6 = Assume a single uniform block of text.
+    7 = Treat the image as a single text line.
+    8 = Treat the image as a single word.
+    9 = Treat the image as a single word in a circle.
+    10 = Treat the image as a single character.
+
+    See http://code.google.com/p/tesseract-ocr/source/browse/trunk/api/tesseractmain.cpp#95 for current state of options
+
+  @param config (Optional) A config file name
+
+  @param preprocessor (Optional) A function (inputFile, callback) that is run before recognition
+    and calls back with (err, processedImage, cleanup); defaults to the no-op preprocessor below
+  ###
+  process: (image, callback, languageCode, pageSegMode, config, preprocessor) ->
+    (preprocessor or @preprocessor) image, (err, processedImage, cleanup) =>
+      if err
+        # error in preprocessor
+        callback err, null
+        return
+
+      # called once recognition has finished, successfully or not
+      done = (err, text) =>
+        # cleanup is optional and supplied by the preprocessor; only call it
+        # when it really is a function instead of whenever it is non-null
+        if typeof cleanup is 'function'
+          @log "node-tesseract: Preprocessor cleanup"
+          cleanup()
+        callback err, text
+        return
+
+      @_runTesseract processedImage, done, languageCode, pageSegMode, config
+      return
+
+  ###
+  Runs the tesseract binary on an (already preprocessed) image
+
+  @param image The image file handed to tesseract
+  @param callback Called with (err, text) once the output file has been read, or with (err, null) on failure
+  @param languageCode (Optional) passed to tesseract as '-l'
+  @param pageSegMode (Optional) passed to tesseract as '-psm'
+  @param config (Optional) appended to the command as the last argument
+  ###
+  _runTesseract: (image, callback, languageCode, pageSegMode, config) ->
+
+    # generate output file name
+    tmp.tmpName (err, output) =>
+      if err
+        # Something went wrong when generating the temporary filename
+        callback err, null
+        return
+
+      # assemble tesseract command
+      # NOTE(review): the parts are joined with spaces and executed through a shell,
+      # so an image path containing spaces or shell metacharacters is not handled here
+      command = [
+        @binary
+        image
+        output
+      ]
+      if languageCode
+        command.push '-l'
+        command.push languageCode
+
+      # 0 is a valid mode, so test for existence rather than truthiness
+      if pageSegMode?
+        command.push '-psm'
+        command.push pageSegMode
+
+      command.push config if config
+
+      command = command.join ' '
+
+      # Run the tesseract command
+      @log "node-tesseract: Running '#{command}'"
+      exec command, (err, stdout, stderr) =>
+        if err
+          # Something went wrong executing the assembled command
+          callback err, null
+          return
+
+        # the recognised text is read from the output name plus '.txt'
+        outputFile = output + '.txt'
+        fs.readFile outputFile, (err, data) =>
+          # There was no error, so get the text
+          data = data.toString @outputEncoding unless err
+          @log "node-tesseract: Deleting '#{outputFile}'"
+          fs.unlink outputFile, (unlinkErr) ->
+            # ignore any errors here as it just means we have a temporary file left somewhere
+            return
+
+          # We got the result (or an error)
+          callback err, data
+          return
+        return
+      return
+    return
+
+  ###
+  A no-op preprocessor
+
+  @param inputFile The file to process
+  @param callback The callback to call when the processing is done (1st argument error, 2nd the outputfile (the processed input file))
+  ###
+  preprocessor: (inputFile, callback) ->
+    # the default preprocessor does nothing...
+    error = null
+    outputFile = inputFile
+    cleanup = ->
+      # clean up here
+      return
+
+    # cleanup gets called after the preprocessed image has been used
+    callback error, outputFile, cleanup
+    return
+
+  # Forwards all arguments to console.log; replace this property to redirect or silence the output
+  log: ->
+    console.log.apply console, arguments
+    return
+
+  binary: 'tesseract'
+  outputEncoding: 'UTF-8'
+
+  # Ready-made preprocessors. They are called without a receiver by `process`,
+  # which is why they log through the exported `tesseract` instance instead of `@`
+  preprocessors:
+    # Writes a grayscale, 200% resized and sharpened '.tif' copy of the input
+    # with ImageMagick 'convert'; the cleanup function it hands back deletes that copy
+    convert: (inputFile, callback) ->
+      tesseract.log "node-tesseract: preprocessor: convert: Processing '#{inputFile}'"
+      tmp.tmpName postfix: '.tif', (err, outputFile) ->
+        if err
+          # Something went wrong when generating the temporary filename
+          callback err, null
+          return
+
+        command = [
+          'convert'
+          '-type'
+          'Grayscale'
+          '-resize'
+          '200%'
+          '-sharpen'
+          '10'
+          inputFile
+          outputFile
+        ].join ' '
+        tesseract.log "node-tesseract: preprocessor: convert: Running '#{command}'"
+        exec command, (err, stdout, stderr) ->
+          if err
+            # Something went wrong executing the convert command
+            callback err, null
+          else
+            cleanup = ->
+              tesseract.log "node-tesseract: preprocessor: convert: Deleting '#{outputFile}'"
+              fs.unlink outputFile, (unlinkErr) ->
+                # ignore any errors here as it just means we have a temporary file left somewhere
+                return
+              return
+
+            callback null, outputFile, cleanup
+          return
+        return
+      return
+
+tesseract = new Tesseract
+# Exports
+module.exports = tesseract
\ No newline at end of file diff --git a/test/image.png b/test/image.png new file mode 100644 index 0000000..8d30d29 Binary files /dev/null and b/test/image.png differ diff --git a/test/test.js b/test/test.js new file mode 100644 index 0000000..bdaacbe --- /dev/null +++ b/test/test.js @@ -0,0 +1,13 @@ +var nodecr = require('../lib/nodecr'); + +// uncomment this to quiet nodecr +//nodecr.log = function() {}; + +nodecr.process(__dirname + '/image.png',function(err, text) { + if(err) { + console.error(err); + } else { + console.log("Result:"); + console.log(text); + } +}, null, null, null, nodecr.preprocessors.convert); \ No newline at end of file