In [1]:
%autosave 300

Autosaving every 300 seconds



Named Entity Recognition and Classification (NERC) is a process of recognizing information units like names, including person, organization and location names, and numeric expressions including time, date, money and percent expressions from unstructured text. The goal is to develop practical and domain-independent techniques in order to detect named entities with high accuracy automatically.


<table>
  <tr><td>
    <img src="https://miro.medium.com/max/1400/1*qQggIPMugLcy-ndJ8X_aAA.png"
         alt="Fashion MNIST sprite"  width="600">
  </td></tr>

<table>
  <tr><td>
    <img src="https://miro.medium.com/max/1400/1*bP_mN9GaZ-6J1ssmpzdzzQ.png"
         alt="Fashion MNIST sprite"  width="600">
  </td></tr>


# import package and read data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/ner_dataset.csv', encoding = "ISO-8859-1")


In [8]:
df.tail(5)

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O
1048574,,attack,NN,O


In [8]:
df.POS.unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

In [10]:
df.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

# data exlpore

In [11]:
df.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [3]:
df = df.fillna(method='ffill')
df.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [14]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(47959, 35178, 17)

In [17]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,402
1,B-eve,308
2,B-geo,37644
3,B-gpe,15870
4,B-nat,201
5,B-org,20143
6,B-per,16990
7,B-tim,20333
8,I-art,297
9,I-eve,253


# training

The following code transform the text date to vector using DictVectorizer and then split to train and test sets.

In [4]:
# df1 = df.sample(frac=0.2, replace=True, random_state=1)
# df1.shape
df1 = df[:200000]
df1.shape

(200000, 4)

In [5]:
df1.isnull().sum()

Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [6]:
X = df1.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
# iob tags
y = df1.Tag.values


In [7]:
classes = np.unique(y)
classes = classes.tolist()
print(classes)


['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
print(X_train.shape, y_train.shape)

(150000, 24724) (150000,)


## sgd - 0.65

In [9]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [10]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier()

In [14]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       1.00      0.03      0.06        31
       B-eve       0.05      0.09      0.06        23
       B-geo       0.73      0.80      0.76      1755
       B-gpe       0.95      0.65      0.77       826
       B-nat       0.25      0.12      0.16        17
       B-org       0.71      0.42      0.53       896
       B-per       0.77      0.55      0.64       813
       B-tim       0.84      0.65      0.73       934
       I-art       0.60      0.17      0.26        18
       I-eve       0.60      0.12      0.20        25
       I-geo       0.78      0.49      0.61       359
       I-gpe       0.00      0.00      0.00        17
       I-nat       0.00      0.00      0.00         7
       I-org       0.70      0.54      0.61       767
       I-per       0.69      0.62      0.65       852
       I-tim       0.43      0.04      0.07       272

   micro avg       0.75      0.59      0.66      7612
   macro avg       0.57   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## naive bayes - 0.57

In [15]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01)

In [16]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.11      0.26      0.15        31
       B-eve       0.18      0.35      0.24        23
       B-geo       0.73      0.66      0.69      1755
       B-gpe       0.76      0.87      0.81       826
       B-nat       0.30      0.76      0.43        17
       B-org       0.44      0.43      0.44       896
       B-per       0.44      0.47      0.45       813
       B-tim       0.60      0.62      0.61       934
       I-art       0.23      0.28      0.25        18
       I-eve       0.32      0.52      0.39        25
       I-geo       0.45      0.51      0.48       359
       I-gpe       0.19      0.18      0.18        17
       I-nat       0.00      0.00      0.00         7
       I-org       0.51      0.53      0.52       767
       I-per       0.51      0.52      0.51       852
       I-tim       0.19      0.30      0.23       272

   micro avg       0.55      0.58      0.56      7612
   macro avg       0.37   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
