In [14]:
var nlp = require("nlp_compromise");

'use strict'

In [6]:
var wordIndices = (originalString, words) => {
    var curIndex = 0;
    return words.map((word) => {
        var wordIndex = originalString.indexOf(word, curIndex);
        curIndex = wordIndex + word.length;
        return wordIndex
    })
};

'use strict'

In [7]:
// `replacements` should be an array of {replaceWith, start, end} objects, ordered by `start`
var multiReplace = (string, replacements) => {
    var toReturn = "";
    var stringIndex = 0;
    var replacementIndex = 0;
    var count = 0;
    while (stringIndex < string.length && count < 10) {
        count++;
        // we're not done with the original string!
        
        if (replacementIndex < replacements.length) {
            // there are more replacements to make!
            var curReplacement = replacements[replacementIndex];

            if (curReplacement.start == stringIndex) {
                toReturn += curReplacement.replaceWith;
                stringIndex = curReplacement.end;
                replacementIndex += 1;
            } else {
                toReturn += string.slice(stringIndex, curReplacement.start);
                stringIndex = curReplacement.start;
            }
            
        } else {
            toReturn += string.slice(stringIndex, string.length);
            stringIndex = string.length;
        }
    }
    return toReturn;
};

multiReplace("Here's a test of the multi-replace system!", [
    {replaceWith: 'fun test', start: 9, end: 13},
    {replaceWith: '', start: 21, end: 27}
])

'Here\'s a fun test of the replace system!'

In [13]:
var text = "Always good for a laugh.  Montreal's mayor, angry at home delivery of mail being cut and replaced by community mailboxes, heard about a slab that was just poured at the edge of a park.  So he took a jackhammer to the situation, and did not mince words.";

var _ = require('underscore');
var pos = require('pos');
var words = new pos.Lexer().lex(text);
var tagger = new pos.Tagger();
var taggedWords = tagger.tag(words).map((arr) => ({word: arr[0], tag: arr[1]}));
// for (var i in taggedWords) {
//     var taggedWord = taggedWords[i];
//     var word = taggedWord[0];
//     var tag = taggedWord[1];
//     console.log(word + " / " + tag);
// }

var taggedWordIndices = wordIndices(text, taggedWords.map(taggedWord => taggedWord.word))
taggedWords.forEach((word, i) => word.index = taggedWordIndices[i])

var tagProbabilities = {
    NN: 0.3,
    NNS: 0.3,
    NNP: 0.5,
    VBD: 0.2,
};

var tagDescriptions = {
    NN: 'noun (singular)',
    NN: 'noun (plural)',
    NNP: 'proper noun',
    VBD: 'verb (past)',
};

taggedWords.forEach((word) => {
    if (Math.random() < tagProbabilities[word.tag]) {
        word.selected = true;
    }
});

// taggedWords

var replacements = _.where(taggedWords, {selected: true}).map((taggedWord) => ({
    replaceWith: '{' + tagDescriptions[taggedWord.tag] + '}',
    start: taggedWord.index,
    end: taggedWord.index + taggedWord.word.length
}));
multiReplace(text, replacements);


// var taggedWordsByTag = _.groupBy(taggedWords, 'tag');

// taggedWordsByTag

'Always good for a laugh.  Montreal\'s {noun (plural)}, angry at home {noun (plural)} of mail being cut and replaced by community mailboxes, heard about a slab that was just poured at the edge of a {noun (plural)}.  So he took a {noun (plural)} to the situation, and did not mince words.'

In [52]:
var terms = nlp.text(text).terms()
var termIndices = wordIndices(text, _.pluck(terms, 'text'))

var termSelectionProbability = (term) => {
    if (!term.text) {
        return 0;
    }

    if (term.pos.Prounoun || term.pos.Copula || term.pos.Infinitive || term.pos.Gerund) {
        return 0;
    } else if (term.pos.Verb) {
        return 0.15;
    } else if (term.pos.Noun) {
        return 0.3;
    } else if (term.pos.Adjective) {
        return 0.2;
    } else if (term.pos.Adverb) {
        return 0.1;
    } else {
        return 0;
    }
}

var posText = (pos) => {
    if (pos.Adjective) {
        return "adjective";
    }
    
    if (pos.Noun) {
        if (pos.Plural) {
            return "noun (plural)";
        } else {
            return "noun";
        }
    }
    
    if (pos.Verb) {
        if (pos.PastTense) {
            return "verb (past)";
        } else {
            return "verb";
        }
    }
}

terms.forEach((term, i) => {
    term.index = termIndices[i];
    term.prob = termSelectionProbability(term);
    if (Math.random() < term.prob) {
        term.selected = true;
    }
})

var replacements = _.where(terms, {selected: true}).map((term) => ({
    replaceWith: '{' + posText(term.pos) + '}', //JSON.stringify(term.pos) + " (" + posText(term.pos) + ")",
    start: term.index,
    end: term.index + term.text.length
}));
multiReplace(text, replacements);

'Always {adjective} for a laugh.  Montreal\'s mayor, angry at home delivery of mail being cut and replaced by community mailboxes, heard about a slab that was just poured at the {noun} of a park.  So {noun} took a jackhammer to the situation, and did not mince words.'

In [45]:
terms

[ Adverb {
    whitespace: { preceding: '', trailing: ' ' },
    text: 'Always',
    normal: 'always',
    expansion: null,
    reason: 'lexicon_pass',
    pos: { Adverb: true },
    tag: 'Adverb',
    index: 0,
    prob: 0.1 },
  Adjective {
    whitespace: { preceding: '', trailing: ' ' },
    text: 'good',
    normal: 'good',
    expansion: null,
    reason: 'lexicon_pass',
    pos: { Adjective: true },
    tag: 'Adjective',
    index: 7,
    prob: 0.2 },
  Term {
    whitespace: { preceding: '', trailing: ' ' },
    text: 'for',
    normal: 'for',
    expansion: null,
    reason: 'lexicon_pass',
    pos: { Conjunction: true },
    tag: 'Conjunction',
    index: 12,
    prob: 0 },
  Term {
    whitespace: { preceding: '', trailing: ' ' },
    text: 'a',
    normal: 'a',
    expansion: null,
    reason: 'lexicon_pass',
    pos: { Determiner: true },
    tag: 'Determiner',
    index: 16,
    prob: 0 },
  Verb {
    whitespace: { preceding: '', trailing: '  ' },
    text: 'laugh.',
   

In [18]:
var pos = require('pos');
var words = new pos.Lexer().lex('This is some sample text. This text can contain mucho suerte.');
var tagger = new pos.Tagger();
tagger.extendLexicon({'mucho suerte': ['NNP']});
var taggedWords = tagger.tag(words);
for (var i in taggedWords) {
    var taggedWord = taggedWords[i];
    var word = taggedWord[0];
    var tag = taggedWord[1];
//     console.log(word + " /" + tag);
    console.log(JSON.stringify(taggedWord));
}

// // extend the lexicon
// tagger.extendLexicon({'Obama': ['NNP']});
// tagger.tag(['Mr', 'Obama']);

["This","DT"]
["is","VBZ"]
["some","DT"]
["sample","NN"]
["text","NN"]
[".","."]
["This","DT"]
["text","NN"]
["can","MD"]
["contain","VB"]
["mucho","JJ"]
["suerte","NN"]
[".","."]


undefined

In [7]:
var phantomjs = require('phantomjs-prebuilt')
phantomjs.path

'/usr/local/bin/phantomjs'

In [5]:
var util = require('util');

var myLog = (obj) => console.log(util.inspect(obj, false, null));

'use strict'

In [10]:
var ParseEnglish = require('parse-english'),
    english = new ParseEnglish();

/**
 * parse-latin would fail helplessly at the full-stop preceding the
 * capital `H`, and would erroneously parse the following as two
 * sentences.
 */
myLog(english.parse(
  'A hapless but friendly City of London worker is here.'
));

ReferenceError: myLog is not defined

In [12]:
"testing!".replace(/e(.)/, "ABC$1")

'tABCsting!'

In [14]:
"testing!".replace(/t(.)/g, "")

'sng!'