## Named Entity Recognition
- Using regular expressions to find entities
- Using chunks with regular expressions to identify entities
- Using OpenNLP to find entities in text
- Isolating multiple entity types
- Using a CRF model to find entities in a document
- Using a chunker to find entities
- Training a specialized NER model

### Using Regex

In [1]:
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Find every e-mail address in a sample sentence and print it with its character offsets.
String sampleText = "I can normally be reached at nlp@nlpworks.com. " + "If not you can email me at mrnlp@nlpworks.org";

// Local part, "@", one or more dot-terminated domain labels, then an alphabetic TLD.
// The TLD uses {2,} rather than {2,4}: with an upper bound of four, find() reports a
// truncated address for longer TLDs (e.g. "x@y.museum" would be returned as "x@y.muse").
String emailRegularExpression = "[a-zA-Z0-9'._%+-]+@" + "(?:[a-zA-Z0-9-]+\\.)+" + "[a-zA-Z]{2,}";
Pattern pattern = Pattern.compile(emailRegularExpression);
Matcher matcher = pattern.matcher(sampleText);

// Each successful find() moves to the next match; the line printed is: text [start:end].
while (matcher.find()) {
    System.out.println(matcher.group() + " [" + matcher.start() + ":" + matcher.end() + "]");
}

nlp@nlpworks.com [29:45]
mrnlp@nlpworks.org [74:92]


In [2]:
// Recognize phone numbers, ZIP codes and e-mail addresses in a single pass by
// OR-ing the three expressions together. emailRegularExpression, pattern,
// sampleText and matcher are not declared here; they are reused and reassigned.
String phoneNumberRegularExpression = "\\d{3}-\\d{3}-\\d{4}";
String zipCodeRegularExpression = "[0-9]{5}(\\-?[0-9]{4})?";
pattern = Pattern.compile(String.join("|",
        phoneNumberRegularExpression, zipCodeRegularExpression, emailRegularExpression));

sampleText = "Her phone number is 888-555-1111. You may also need her ZIP code: 55555-4444";
matcher = pattern.matcher(sampleText);

// Print every hit as: matched text [start:end]
while (matcher.find()) {
    String hit = matcher.group();
    int from = matcher.start();
    int to = matcher.end();
    System.out.println(hit + " [" + from + ":" + to + "]");
}

888-555-1111 [20:32]
55555-4444 [66:76]


### Using LingPipe's Chunking

In [3]:
%%loadFromPOM
<dependency>
    <groupId>de.julielab</groupId>
    <artifactId>aliasi-lingpipe</artifactId>
    <version>4.1.0</version>
</dependency>

In [4]:
import java.util.Set;
import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.RegExChunker;

// Tag e-mail addresses in a sentence as EMAIL chunks using a regular-expression chunker.
String sampleText = "His email address is hisemail@somecompany.com.";
String emailRegularExpression =
    "[A-Za-z0-9](([_\\.\\-]?[a-zA-Z0-9]+)*)@(" +
    "[A-Za-z0-9]+)(([\\.\\-]?[a-zA-Z0-9]+)*)\\.([A-Za-z]{2,})";

// Constructor arguments: the pattern, the chunk type label, and a numeric score (1.0).
Chunker chunker = new RegExChunker(emailRegularExpression, "EMAIL", 1.0);
Chunking chunking = chunker.chunk(sampleText);
Set<Chunk> chunkSet = chunking.chunkSet();

// start()/end() are used as character offsets into the chunked text.
for (Chunk found : chunkSet) {
    int from = found.start();
    int to = found.end();
    String entityText = sampleText.substring(from, to);
    System.out.println("Entity: " + entityText + "\tType: " + found.type());
}

Entity: hisemail@somecompany.com	Type: EMAIL


### Using OpenNLP 

In [5]:
%%loadFromPOM
<dependency>
    <groupId>org.apache.opennlp</groupId>
    <artifactId>opennlp-tools</artifactId>
    <version>1.9.0</version>
</dependency>

In [6]:
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

In [17]:
try (InputStream tokenStream = new FileInputStream(new File("../models/en-token.bin"));
        InputStream entityModelInputStream = new FileInputStream(new File("../models/en-ner-date.bin"));) {
    TokenizerModel tokenizerModel = new TokenizerModel(tokenStream);
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);
    TokenNameFinderModel tokenNameFinderModel = new TokenNameFinderModel(entityModelInputStream);
    // set class instance
    NameFinderME nameFinderME = new NameFinderME(tokenNameFinderModel);

    String text = "The city was founded in the 1850s and its first mayor was born March 3, 1832.";
    String tokens[] = tokenizer.tokenize(text);
    Span dateSpans[] = nameFinderME.find(tokens);

    for (int i = 0; i < dateSpans.length; i++) {
        System.out.print("Entity: [" + tokens[dateSpans[i].getStart()]);
        System.out.print("] was a " + dateSpans[i].getType() + " entity found starting at " + dateSpans[i].getStart());
        System.out.println(" and ending at " + dateSpans[i].getEnd());
        
        // to get actual spans
        String date = "";
        for(int j=dateSpans[i].getStart(); j< dateSpans[i].getEnd(); j++) {
            date += tokens[j] + " "; 
        }
        // To get probabilities
        double[] spanProbs = nameFinderME.probs(dateSpans);
        System.out.println("Date: " + date + " Probability: " + spanProbs[i]);
    }
    
} catch (Exception ex) {
 // Handle exception
    System.out.println("Could not find model files");
}

Entity: [1850s] was a date entity found starting at 6 and ending at 7
Date: 1850s  Probability: 0.878211895731101
Entity: [March] was a date entity found starting at 13 and ending at 15
Date: March 3  Probability: 0.9937399307548391


### Identifying multiple entities

In [18]:
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

In [19]:
String sentences[] = { 
    "Sam and Mary left on Friday, November 12. ",
    "They stopped in Boston at an ATM to get $300 for expenses. ",
    "While they were there Sam bumped into an old friend who was on his way to work at ATT. ",
    "They decided to leave together and departed for Maine" };

In [24]:
// Run several OpenNLP name-finder models over every sentence in `sentences` and print
// the entities each model finds, grouped by sentence.
try (InputStream tokenStream = new FileInputStream(new File("../models/en-token.bin"))) {
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(tokenStream));
    String[] modelNames = {
        "../models/en-ner-person.bin", "../models/en-ner-location.bin",
        "../models/en-ner-organization.bin", "../models/en-ner-money.bin",
        "../models/en-ner-time.bin"
    };

    // Read each model from disk exactly once (not once per sentence) and close its
    // stream as soon as the model has been built.
    TokenNameFinderModel[] entityModels = new TokenNameFinderModel[modelNames.length];
    for (int m = 0; m < modelNames.length; m++) {
        try (InputStream modelStream = new FileInputStream(new File(modelNames[m]))) {
            entityModels[m] = new TokenNameFinderModel(modelStream);
        }
    }

    for (int i = 0; i < sentences.length; i++) {
        System.out.println("Sentence " + (i + 1));
        // The tokens are the same for every model, so tokenize once per sentence.
        String[] tokens = tokenizer.tokenize(sentences[i]);

        for (TokenNameFinderModel entityModel : entityModels) {
            // A new finder per sentence and model, as before, so nothing a finder may
            // remember from one find() call can influence the next sentence.
            NameFinderME nameFinderME = new NameFinderME(entityModel);
            Span[] spans = nameFinderME.find(tokens);

            for (Span span : spans) {
                // Join the tokens of the span (end index excluded) with single spaces so
                // multi-token entities are not printed glued together.
                StringBuilder entity = new StringBuilder();
                for (int j = span.getStart(); j < span.getEnd(); j++) {
                    if (entity.length() > 0) {
                        entity.append(' ');
                    }
                    entity.append(tokens[j]);
                }
                System.out.println("\tEntity: " + entity + " - Entity Type: " + span.getType());
            }
        }
    }
} catch (Exception ex) {
    // Any failure lands here (missing file, corrupt model, processing error), so report
    // the actual exception rather than assuming the model files are missing.
    System.out.println("Could not load models or find entities: " + ex);
}

Sentence 1
	Entity: Sam - Entity Type: person
	Entity: Mary - Entity Type: person
Sentence 2
	Entity: Boston - Entity Type: location
	Entity: $300 - Entity Type: money
Sentence 3
Sentence 4
	Entity: Maine - Entity Type: location
