## Named Entity Recognizer (NER)

Named Entity Recognition is a very useful information extraction technique for identifying and classifying
named entities in text. These entities fall into pre-defined categories such as person names, organizations,
locations, time expressions, financial elements, etc.

##### NER using NLTK

In [8]:
import nltk
#nltk.download('words')
#nltk.download('abc')
from nltk.tokenize import word_tokenize
from nltk import pos_tag,ne_chunk

In [7]:
# Sample paragraph used as input for the NLTK and regex-based examples below.
document = '''Andrew Yan-Tak Ng is a Chinese American computer scientist.
He is the former chief scientist at Baidu, where he led the company's
Artificial Intelligence Group. He is an adjunct professor (formerly 
associate professor) at Stanford University. Ng is also the co-founder
and chairman at Coursera, an online education platform. Andrew was born
in the UK in 1976. His parents were both from Hong Kong.'''


In [29]:
def nltk_ner(document):
    '''
    Extract the named entities of a text with NLTK.

    The text is tokenized, POS-tagged and passed through ne_chunk; every
    element of the resulting tree that exposes a ``label`` attribute is
    reported as one entity.

    Returns a list of (entity_text, entity_type) tuples in document order.
    '''
    tagged_tokens = pos_tag(word_tokenize(document))
    chunk_tree = ne_chunk(tagged_tokens)

    # Plain (word, POS) tuples have no ``label`` attribute and are skipped.
    # Each leaf of a labelled subtree is a (word, POS) pair; the words are
    # joined with single spaces to form the entity text.
    return [
        (' '.join(leaf[0] for leaf in node.leaves()), node.label())
        for node in chunk_tree
        if hasattr(node, 'label')
    ]
        
# Run the NLTK-based extractor on the sample paragraph and show the
# (entity_text, entity_type) pairs it returns.
named_entities= nltk_ner(document)
print(named_entities)

[('Andrew', 'PERSON'), ('Chinese', 'GPE'), ('American', 'GPE'), ('Baidu', 'ORGANIZATION'), ("company's Artificial Intelligence Group", 'ORGANIZATION'), ('Stanford University', 'ORGANIZATION'), ('Coursera', 'ORGANIZATION'), ('Andrew', 'PERSON'), ('Hong Kong', 'GPE')]


##### Classical approaches: Rule-Based NER (using Regex)

In [43]:
# Code for tagging temporal expressions in text
# For details of the TIMEX format, see http://timex2.mitre.org/

import re
import string
import os
import sys

# Requires eGenix.com mx Base Distribution
# http://www.egenix.com/products/python/mxBase/
try:
    from mx.DateTime import *
except ImportError:
    print("""
Requires eGenix.com mx Base Distribution
http://www.egenix.com/products/python/mxBase/""")

# Predefined strings.
# Regex fragments for the temporal-expression tagger.
#
# The patterns are assembled from adjacent raw-string literals. A
# backslash-newline inside a single string literal would keep the next
# line's leading spaces in the pattern, so words placed after the line
# break (eleven, eighteen, ninety, october) could never match.
#
# Number words: longer words come before the shorter words they start with
# (e.g. "sixteen" before "six") so that an unanchored alternation picks up
# the whole word.
numbers = (
    r"(^a(?=\s)|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
    r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
    r"ninety|hundred|thousand|one|two|three|four|five|six|seven|eight|nine|"
    r"ten)"
)
day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (
    r"(january|february|march|april|may|june|july|august|september|"
    r"october|november|december)"
)
dmy = r"(year|day|week|month)"
rel_day = r"(today|yesterday|tomorrow|tonight|tonite)"
exp1 = r"(before|after|earlier|later|ago)"
exp2 = r"(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"

# "<count> <unit>(s) <before|after|...>", where <count> is digits or one or
# more number words, e.g. "3 days ago", "twenty five years later".
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + r"s? " + exp1 + r")"
# "<this|next|last> <unit | weekday | month>", e.g. "next week".
regxp2 = r"(" + exp2 + r" (" + dmy + r"|" + week_day + r"|" + month + r"))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)

def tag(text):
    """Wrap every temporal expression found in ``text`` in <TIMEX2> tags.

    The module-level patterns reg1..reg5 are applied in turn (counted
    offsets such as "3 days ago", "this/next/last ..." phrases, relative
    days, ISO-style timestamps and four-digit years). Each distinct matched
    string is then replaced everywhere in the text by
    ``<TIMEX2>match</TIMEX2>``.

    Parameters:
        text: the raw text to annotate.

    Returns:
        The text with the matched expressions tagged.
    """
    timex_found = []

    # reg1 and reg2 contain several groups, so findall() yields tuples whose
    # first element is the complete expression.
    # Captures expressions such as '<number> days ago'.
    timex_found.extend(
        groups[0] for groups in reg1.findall(text) if len(groups) > 1)

    # Variations of this thursday, next year, etc.
    timex_found.extend(
        groups[0] for groups in reg2.findall(text) if len(groups) > 1)

    # today, tomorrow, etc. / ISO timestamps / years: findall() yields the
    # matched strings directly for these patterns.
    for pattern in (reg3, reg4, reg5):
        timex_found.extend(pattern.findall(text))

    # Tag only temporal expressions which haven't been tagged: the negative
    # lookahead skips occurrences already followed by a closing tag.
    # dict.fromkeys drops repeated strings while keeping first-seen order;
    # a repeated pass would find nothing left to tag.
    for timex in dict.fromkeys(timex_found):
        replacement = '<TIMEX2>' + timex + '</TIMEX2>'
        # re.escape: the match is literal text, not a pattern (an ISO
        # timestamp contains '.', which would otherwise match any character).
        # A callable replacement is inserted verbatim, without backslash
        # processing.
        text = re.sub(re.escape(timex) + '(?!</TIMEX2>)',
                      lambda _match, replacement=replacement: replacement,
                      text)

    return text

# Hash function for week days to simplify the grounding task.
# [Mon..Sun] -> [0..6]
# Built by enumerating the capitalised day names in week order, so
# Monday maps to 0 and Sunday to 6.
hashweekdays = {
    day_name: position
    for position, day_name in enumerate((
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ))
}

# Hash function for months to simplify the grounding task.
# [Jan..Dec] -> [1..12]
# Built by enumerating the capitalised month names in calendar order,
# starting at 1, so January maps to 1 and December to 12.
hashmonths = {
    month_name: number
    for number, month_name in enumerate((
        'January',
        'February',
        'March',
        'April',
        'May',
        'June',
        'July',
        'August',
        'September',
        'October',
        'November',
        'December',
    ), start=1)
}

# Hash number in words into the corresponding integer value
# Number words understood by hashnum(), mapped to their integer values.
_NUMBER_WORDS = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40,
    'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
    'ninety': 90, 'hundred': 100, 'thousand': 1000,
}

# Longest words first: a prefix match must not read "sixteen" or "sixty" as
# "six", "seventeen" as "seven", "fourteen" as "four", and so on.
# The final alternative accepts the article "a" as a stand-alone word.
_NUMBER_WORD_RE = re.compile(
    '|'.join(sorted(_NUMBER_WORDS, key=len, reverse=True)) + r'|a\b',
    re.IGNORECASE)


def hashnum(number):
    """Return the integer value of the number word that ``number`` starts with.

    Matching is case-insensitive and anchored at the start of the string.
    The article "a" (as a whole word) counts as 1.

    Parameters:
        number: a string such as 'sixteen', 'Twenty' or 'a'.

    Returns:
        The integer value, or None when the string does not start with a
        known number word.
    """
    found = _NUMBER_WORD_RE.match(number)
    if found is None:
        return None
    # Anything matched that is not a table key can only be the article "a".
    return _NUMBER_WORDS.get(found.group(0).lower(), 1)

# Given a timex_tagged_text and a Date object set to base_date,
# returns timex_grounded_text
def ground(tagged_text, base_date):
    """Attach a grounded ``val`` attribute to each <TIMEX2> tag.

    Every ``<TIMEX2>...</TIMEX2>`` span in ``tagged_text`` is rewritten as
    ``<TIMEX2 val="...">...</TIMEX2>``. ``val`` is ``UNKNOWN`` when none of
    the rules below recognises the expression.

    Parameters:
        tagged_text: text containing <TIMEX2> tags.
        base_date: reference date for relative expressions. The code reads
            its ``year``, ``month`` and ``iso_week`` attributes and adds
            ``RelativeDateTime`` offsets to it -- presumably an mx.DateTime
            date object; confirm against the caller.

    Returns:
        The text with a ``val`` attribute added to the tags.

    Notes:
        * The day- and week-based rules need ``RelativeDateTime`` from the
          ``mx.DateTime`` star import at the top of the file.
        * Local results use their own names (``month_num``, ``iso_year``,
          ...). Assigning to ``month`` inside this function would make the
          name local and break the ``'last ' + month`` patterns, which need
          the module-level regex fragment.
        * Number words are summed, so "twenty five" becomes 25.
    """
    # Find all identified timex and strip the surrounding tags.
    timex_regex = re.compile(r'<TIMEX2>.*?</TIMEX2>', re.DOTALL)
    timex_found = [re.sub(r'</?TIMEX2.*?>', '', tagged)
                   for tagged in timex_regex.findall(tagged_text)]

    # Calculate the new date accordingly
    for timex in timex_found:
        timex_val = 'UNKNOWN'  # Default value

        timex_ori = timex  # Backup original timex for later substitution

        # If numbers are given in words, hash them into corresponding numbers.
        # eg. twenty five days ago --> 25 days ago
        if re.search(numbers, timex, re.IGNORECASE):
            # flags must be passed by keyword: the third positional argument
            # of re.split is maxsplit, not flags.
            split_timex = re.split(r'\s(?=days?|months?|years?|weeks?)',
                                   timex, flags=re.IGNORECASE)
            # Only rewrite when a "<count> <unit ...>" shape was found.
            if len(split_timex) > 1:
                value = split_timex[0]
                unit = split_timex[1]
                # hashnum() returns None for text it does not know; count
                # that as 0 instead of failing inside sum().
                num_list = [hashnum(word) or 0 for word in
                            re.findall(numbers + '+', value, re.IGNORECASE)]
                timex = str(sum(num_list)) + ' ' + unit

        # If timex matches ISO format, remove 'time' and reorder 'date'
        if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex):
            date_parts = re.split(r'/|-', re.split(r'\s', timex)[0])
            timex_val = (str(date_parts[2]) + '-' + str(date_parts[1]) + '-'
                         + str(date_parts[0]))

        # Specific dates
        elif re.match(r'\d{4}', timex):
            timex_val = str(timex)

        # Relative dates
        elif re.match(r'tonight|tonite|today', timex, re.IGNORECASE):
            timex_val = str(base_date)
        elif re.match(r'yesterday', timex, re.IGNORECASE):
            timex_val = str(base_date + RelativeDateTime(days=-1))
        elif re.match(r'tomorrow', timex, re.IGNORECASE):
            timex_val = str(base_date + RelativeDateTime(days=+1))

        # Weekday in the previous week.
        # The lookup tables use capitalised keys while matching ignores
        # case, hence the capitalize() on the second word.
        elif re.match(r'last ' + week_day, timex, re.IGNORECASE):
            weekday_num = hashweekdays[timex.split()[1].capitalize()]
            timex_val = str(base_date + RelativeDateTime(
                weeks=-1, weekday=(weekday_num, 0)))

        # Weekday in the current week.
        elif re.match(r'this ' + week_day, timex, re.IGNORECASE):
            weekday_num = hashweekdays[timex.split()[1].capitalize()]
            timex_val = str(base_date + RelativeDateTime(
                weeks=0, weekday=(weekday_num, 0)))

        # Weekday in the following week.
        elif re.match(r'next ' + week_day, timex, re.IGNORECASE):
            weekday_num = hashweekdays[timex.split()[1].capitalize()]
            timex_val = str(base_date + RelativeDateTime(
                weeks=+1, weekday=(weekday_num, 0)))

        # Last, this, next week.
        elif re.match(r'last week', timex, re.IGNORECASE):
            shifted = base_date + RelativeDateTime(weeks=-1)
            iso_year = shifted.year

            # iso_week is indexed as a (year, week, day) triple; keep only
            # the week value.
            iso_week_num = shifted.iso_week[1]
            timex_val = str(iso_year) + 'W' + str(iso_week_num)
        elif re.match(r'this week', timex, re.IGNORECASE):
            shifted = base_date + RelativeDateTime(weeks=0)
            iso_year = shifted.year
            iso_week_num = shifted.iso_week[1]
            timex_val = str(iso_year) + 'W' + str(iso_week_num)
        elif re.match(r'next week', timex, re.IGNORECASE):
            shifted = base_date + RelativeDateTime(weeks=+1)
            iso_year = shifted.year
            iso_week_num = shifted.iso_week[1]
            timex_val = str(iso_year) + 'W' + str(iso_week_num)

        # Month in the previous year.
        elif re.match(r'last ' + month, timex, re.IGNORECASE):
            month_num = hashmonths[timex.split()[1].capitalize()]
            timex_val = str(base_date.year - 1) + '-' + str(month_num)

        # Month in the current year.
        elif re.match(r'this ' + month, timex, re.IGNORECASE):
            month_num = hashmonths[timex.split()[1].capitalize()]
            timex_val = str(base_date.year) + '-' + str(month_num)

        # Month in the following year.
        elif re.match(r'next ' + month, timex, re.IGNORECASE):
            month_num = hashmonths[timex.split()[1].capitalize()]
            timex_val = str(base_date.year + 1) + '-' + str(month_num)
        elif re.match(r'last month', timex, re.IGNORECASE):

            # Handles the year boundary.
            if base_date.month == 1:
                timex_val = str(base_date.year - 1) + '-' + '12'
            else:
                timex_val = (str(base_date.year) + '-'
                             + str(base_date.month - 1))
        elif re.match(r'this month', timex, re.IGNORECASE):
            timex_val = str(base_date.year) + '-' + str(base_date.month)
        elif re.match(r'next month', timex, re.IGNORECASE):

            # Handles the year boundary.
            if base_date.month == 12:
                timex_val = str(base_date.year + 1) + '-' + '1'
            else:
                timex_val = (str(base_date.year) + '-'
                             + str(base_date.month + 1))
        elif re.match(r'last year', timex, re.IGNORECASE):
            timex_val = str(base_date.year - 1)
        elif re.match(r'this year', timex, re.IGNORECASE):
            timex_val = str(base_date.year)
        elif re.match(r'next year', timex, re.IGNORECASE):
            timex_val = str(base_date.year + 1)
        elif re.match(r'\d+ days? (ago|earlier|before)', timex, re.IGNORECASE):

            # Calculate the offset by taking '\d+' part from the timex.
            offset = int(re.split(r'\s', timex)[0])
            timex_val = str(base_date + RelativeDateTime(days=-offset))
        elif re.match(r'\d+ days? (later|after)', timex, re.IGNORECASE):
            offset = int(re.split(r'\s', timex)[0])
            timex_val = str(base_date + RelativeDateTime(days=+offset))
        elif re.match(r'\d+ weeks? (ago|earlier|before)', timex, re.IGNORECASE):
            offset = int(re.split(r'\s', timex)[0])
            shifted = base_date + RelativeDateTime(weeks=-offset)
            iso_year = shifted.year
            iso_week_num = shifted.iso_week[1]
            timex_val = str(iso_year) + 'W' + str(iso_week_num)
        elif re.match(r'\d+ weeks? (later|after)', timex, re.IGNORECASE):
            offset = int(re.split(r'\s', timex)[0])
            shifted = base_date + RelativeDateTime(weeks=+offset)
            iso_year = shifted.year
            iso_week_num = shifted.iso_week[1]
            timex_val = str(iso_year) + 'W' + str(iso_week_num)
        elif re.match(r'\d+ months? (ago|earlier|before)', timex, re.IGNORECASE):
            extra = 0
            offset = int(re.split(r'\s', timex)[0])

            # Checks if subtracting the remainder of (offset / 12) from the
            # base month crosses the year boundary.
            if (base_date.month - offset % 12) < 1:
                extra = 1

            # Calculate new values for the year and the month.
            year_str = str(base_date.year - offset // 12 - extra)
            month_str = str((base_date.month - offset % 12) % 12)

            # Fix for the special case: a remainder of 0 means December.
            if month_str == '0':
                month_str = '12'
            timex_val = year_str + '-' + month_str
        elif re.match(r'\d+ months? (later|after)', timex, re.IGNORECASE):
            extra = 0
            offset = int(re.split(r'\s', timex)[0])
            if (base_date.month + offset % 12) > 12:
                extra = 1
            year_str = str(base_date.year + offset // 12 + extra)
            month_str = str((base_date.month + offset % 12) % 12)
            if month_str == '0':
                month_str = '12'
            timex_val = year_str + '-' + month_str
        elif re.match(r'\d+ years? (ago|earlier|before)', timex, re.IGNORECASE):
            offset = int(re.split(r'\s', timex)[0])
            timex_val = str(base_date.year - offset)
        elif re.match(r'\d+ years? (later|after)', timex, re.IGNORECASE):
            offset = int(re.split(r'\s', timex)[0])
            timex_val = str(base_date.year + offset)

        # Remove 'time' from timex_val.
        # For example, If timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.
        # A plain string replacement is used because the original timex is
        # literal text, not a regular expression.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

    return tagged_text

####

def demo(text):
    '''
    Run the temporal-expression tagger on ``text`` and print the result.
    '''
    tagged_text = tag(text)
    print(tagged_text)

## Pass the text to the function
##if __name__ == '__main__':
# Tag the first 10000 characters of 'rural.txt' from NLTK's 'abc' corpus.
# The corpus must already be available locally (see the commented-out
# nltk.download('abc') call near the top of the notebook).
text = nltk.corpus.abc.raw('rural.txt')[:10000]
demo(text)


Requires eGenix.com mx Base Distribution
http://www.egenix.com/products/python/mxBase/
PM denies knowledge of AWB kickbacks
The Prime Minister has denied he knew AWB was paying kickbacks to Iraq despite writing to the wheat exporter asking to be kept fully informed on Iraq wheat sales.
Letters from John Howard and Deputy Prime Minister Mark Vaile to AWB have been released by the Cole inquiry into the oil for food program.
In one of the letters Mr Howard asks AWB managing director Andrew Lindberg to remain in close contact with the Government on Iraq wheat sales.
The Opposition's Gavan O'Connor says the letter was sent in <TIMEX2>2002</TIMEX2>, the same time AWB was paying kickbacks to Iraq though a Jordanian trucking company.
He says the Government can longer wipe its hands of the illicit payments, which totalled $290 million.
"The responsibility for this must lay may squarely at the feet of Coalition ministers in trade, agriculture and the Prime Minister," he said.
But the Prime Mini

In [44]:
# Tag the temporal expressions in the sample paragraph.
demo(document)

Andrew Yan-Tak Ng is a Chinese American computer scientist.
He is the former chief scientist at Baidu, where he led the company's
Artificial Intelligence Group. He is an adjunct professor (formerly 
associate professor) at Stanford University. Ng is also the co-founder
and chairman at Coursera, an online education platform. Andrew was born
in the UK in <TIMEX2>1976</TIMEX2>. His parents were both from Hong Kong.


In [45]:
Sent: ['Linux', 'is', 'the', 'best', 'OS']
Labels: ['OS','IR','IR','IR','IR']
Sent: ['Ubuntu', 'is', 'my', 'favorite', 'OS']
Labels: ['OS','IR','IR','IR','IR']

In [46]:
## Here the set of class labels is {OS, IR}
## OS means Operating System
## IR means Irrelevant


In [47]:
# Toy training set: each item is a (tokens, labels) pair with one label per
# token.
data = [(['Linux', 'is', 'the', 'best', 'OS'], ['OS','IR','IR','IR','IR']),
(['Ubuntu', 'is', 'my', 'favourite', 'OS'], ['OS','IR','IR','IR','IR'])]

In [49]:
def get_corpus(data):
    '''
    Build the corpus: one list of (word, tag) tuples per document.

    ``data`` is an iterable of (words, tags) pairs. Words and tags are
    paired positionally with zip, so pairing stops at the shorter of the
    two sequences.
    '''
    return [list(zip(words, labels)) for words, labels in data]


# Pair every word with its label and show the resulting corpus.
corpus = get_corpus(data)
print(corpus)

[[('Linux', 'OS'), ('is', 'IR'), ('the', 'IR'), ('best', 'IR'), ('OS', 'IR')], [('Ubuntu', 'OS'), ('is', 'IR'), ('my', 'IR'), ('favourite', 'IR'), ('OS', 'IR')]]


In [66]:
def doc_2_features(doc, i):
    '''
    Build the feature dictionary for the word at position ``i`` of ``doc``.

    ``doc`` is one document of the corpus: a sequence whose items carry the
    word at index 0 (for corpus items, index 1 is the entity tag).

    Features: the current word, up to two words on each side, and the
    markers ``BOS`` / ``EOS`` when there is no previous / next word.
    '''
    features = {'current_word': doc[i][0]}

    # Left context.
    if i <= 0:
        features['BOS'] = True
    else:
        features['previous_word'] = doc[i - 1][0]
        if i >= 2:
            features['2nd_previous_word'] = doc[i - 2][0]

    # Right context: number of words that follow position i.
    words_after = len(doc) - 1 - i
    if words_after <= 0:
        features['EOS'] = True
    else:
        features['next_word'] = doc[i + 1][0]
        if words_after >= 2:
            features['2nd_next_word'] = doc[i + 2][0]

    return features

def extract_features(doc):
    '''
    Return one feature dictionary per position of ``doc``, in order.
    '''
    per_word_features = []
    for position in range(len(doc)):
        per_word_features.append(doc_2_features(doc, position))
    return per_word_features

# Feature sequences: one list of feature dicts per document of the corpus.
X = [extract_features(doc) for doc in corpus]
print(X)

[[{'current_word': 'Linux', 'BOS': True, 'next_word': 'is', '2nd_next_word': 'the'}, {'current_word': 'is', 'previous_word': 'Linux', 'next_word': 'the', '2nd_next_word': 'best'}, {'current_word': 'the', 'previous_word': 'is', '2nd_previous_word': 'Linux', 'next_word': 'best', '2nd_next_word': 'OS'}, {'current_word': 'best', 'previous_word': 'the', '2nd_previous_word': 'is', 'next_word': 'OS'}, {'current_word': 'OS', 'previous_word': 'best', '2nd_previous_word': 'the', 'EOS': True}], [{'current_word': 'Ubuntu', 'BOS': True, 'next_word': 'is', '2nd_next_word': 'my'}, {'current_word': 'is', 'previous_word': 'Ubuntu', 'next_word': 'my', '2nd_next_word': 'favourite'}, {'current_word': 'my', 'previous_word': 'is', '2nd_previous_word': 'Ubuntu', 'next_word': 'favourite', '2nd_next_word': 'OS'}, {'current_word': 'favourite', 'previous_word': 'my', '2nd_previous_word': 'is', 'next_word': 'OS'}, {'current_word': 'OS', 'previous_word': 'favourite', '2nd_previous_word': 'my', 'EOS': True}]]


In [69]:
def get_labels(doc):
    '''
    Return the labels of one document, in order.

    ``doc`` is a sequence of (word, label) pairs.
    '''
    labels = []
    for _word, label in doc:
        labels.append(label)
    return labels

# Label sequences: one list of labels per document, aligned with X.
y = [get_labels(doc) for doc in corpus]
print(y)

[['OS', 'IR', 'IR', 'IR', 'IR'], ['OS', 'IR', 'IR', 'IR', 'IR']]


In [73]:
import sklearn_crfsuite
##%%time
# Configure a CRF sequence tagger (L-BFGS training, at most 100 iterations)
# and fit it on the feature sequences X and label sequences y built above.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [101]:
test = [['CentOS','happens' , 'to','be', 'my', 'favourite', 'OS']]
## doc_2_features reads the word as doc[i][0]. The test sentence holds plain
## strings, so indexing them directly would yield only the first character
## of each word; wrap every word in a 1-tuple to match the corpus layout.
X_test = [extract_features([(word,) for word in doc]) for doc in test]
print(crf.predict(X_test))
print('********************************************')
## predict_single takes the features of a single sentence, so pass the
## first (and only) element of X_test instead of the whole list.
print(crf.predict_single(X_test[0]))


[['OS', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR']]
********************************************
['OS', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR']


##### spaCy for NER

In [103]:
import pandas as pd
# Load the NER dataset and forward-fill missing cells so every row carries
# the most recent non-missing value of each column.
# DataFrame.ffill() replaces fillna(method="ffill"), whose ``method``
# argument is deprecated in recent pandas releases.
data = pd.read_csv("dataset/ner_dataset.csv", encoding="latin1")
data = data.ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


### Start from here