In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# From Wikipedia 
# https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/ja.wikipedia.conll
# wiki = pd.read_csv('data/ja.wikipedia.conll', header=None, sep='\t', engine='python')
wiki = pd.read_csv('data/ja.wikipedia.conll', header=None, delimiter=r"\s+", engine='python')
wiki.head()

Unnamed: 0,0,1
0,1960,B-DATE
1,年代,I-DATE
2,と,O
3,1970,B-DATE
4,年代,I-DATE


In [3]:
wiki.shape

(142400, 2)

In [4]:
labels = wiki[1].unique()
labels

array(['B-DATE', 'I-DATE', 'O', 'B-PERSON', 'I-PERSON', None,
       'B-ARTIFACT', 'B-LOCATION', 'B-NUMBER', 'I-NUMBER', 'B-PERCENT',
       'I-PERCENT', 'B-TIME', 'I-TIME', 'I-ARTIFACT', 'B-EVENT',
       'I-EVENT', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-LOCATION',
       'B-OTHER', 'I-OTHER', 'B-MONEY', 'I-MONEY'], dtype=object)

In [5]:
len(labels)

24

In [6]:
wiki[1].value_counts()

O                 121564
B-LOCATION          2853
I-DATE              2346
I-ORGANIZATION      1704
B-ORGANIZATION      1576
B-DATE              1489
I-NUMBER            1374
B-PERSON            1370
I-ARTIFACT          1236
I-LOCATION          1136
B-NUMBER            1027
I-PERSON             978
B-ARTIFACT           725
I-EVENT              591
I-OTHER              428
B-OTHER              380
B-EVENT              260
I-PERCENT            259
B-PERCENT            157
I-MONEY              154
B-MONEY               67
I-TIME                38
B-TIME                17
Name: 1, dtype: int64

In [7]:
# Read data
# Annotated Corpus for Named Entity Recognition
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
ner = pd.read_csv("data/entity-annotated-corpus/ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)
ner.head()

b'Skipping line 281837: expected 25 fields, saw 34\n'


Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [8]:
ner.head(20)

Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O
5,5,through,london,to,TO,lowercase,to,NNP,capitalized,London,...,have,VBP,lowercase,have,lowercase,marched,1.0,lowercase,through,O
6,6,london,to,protest,VB,lowercase,protest,TO,lowercase,to,...,march,VBN,lowercase,marched,lowercase,through,1.0,capitalized,London,B-geo
7,7,to,protest,the,DT,lowercase,the,VB,lowercase,protest,...,through,IN,lowercase,through,capitalized,London,1.0,lowercase,to,O
8,8,protest,the,war,NN,lowercase,war,DT,lowercase,the,...,london,NNP,capitalized,London,lowercase,to,1.0,lowercase,protest,O
9,9,the,war,in,IN,lowercase,in,NN,lowercase,war,...,to,TO,lowercase,to,lowercase,protest,1.0,lowercase,the,O


In [9]:
ner = ner[['lemma', 'tag']]
ner.head(50)

Unnamed: 0,lemma,tag
0,thousand,O
1,of,O
2,demonstr,O
3,have,O
4,march,O
5,through,O
6,london,B-geo
7,to,O
8,protest,O
9,the,O


In [10]:
ner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050795 entries, 0 to 1050794
Data columns (total 2 columns):
lemma    1050795 non-null object
tag      1050794 non-null object
dtypes: object(2)
memory usage: 16.0+ MB


In [11]:
ner.tag.value_counts()

O        889973
B-geo     37525
B-tim     20193
B-org     20184
I-per     17382
B-per     17011
I-org     16537
B-gpe     16392
I-geo      7409
I-tim      6298
B-art       434
B-eve       348
I-eve       297
I-art       280
I-gpe       229
B-nat       226
I-nat        76
Name: tag, dtype: int64

In [12]:
len(ner.tag.value_counts())

17

In [13]:
ner.isna().sum()

lemma    0
tag      1
dtype: int64

In [14]:
ner[ner.tag.isna()]

Unnamed: 0,lemma,tag
689433,domin,


In [15]:
ner['tag'] = ner['tag'].fillna('O')
ner.iloc[689433]

lemma    domin
tag          O
Name: 689433, dtype: object

In [16]:
ner.isna().sum()

lemma    0
tag      0
dtype: int64

In [17]:
ner.iloc[689423:689443]

Unnamed: 0,lemma,tag
689423,altern,O
689424,energi,O
689425,sourc,O
689426,to,O
689427,avoid,O
689428,be,O
689429,at,O
689430,the,O
689431,merci,O
689432,of,O


In [18]:
ner.shape

(1050795, 2)

In [19]:
ner[ner['tag'] == 'B-per']['lemma'].unique()[:20]

array(['bush', 'presid', 'thoma', 'prophet', 'omar', 'khayam', 'malik',
       'abdul', 'khan', 'nancy-amelia', 'tim', 'harcourt', 'foreign',
       'mr.', 'cholili', 'azahari', 'pope', 'sister', 'byzantin', 'ahm'],
      dtype=object)

In [20]:
ner[ner['tag'] == 'B-org']['lemma'].unique()[:20]

array(['labor', 'intern', 'iaea', 'european', 'u.n.', 'bilfing',
       'royal-dutch', 'shell', 'al', 'sunni', 'muslim', 'brotherhood',
       'mutahida', 'taleban', 'islam', 'british', 'home', 'associ', 'ap',
       'al-qaida'], dtype=object)

In [21]:
ner[ner['tag'] == 'B-geo']['lemma'].unique()[:20]

array(['london', 'iraq', 'hyde', 'britain', 'brighton', 'rome', 'pari',
       'madrid', 'vienna', 'isfahan', 'nigeria', 'delta', 'bayelsa',
       'niger', 'somalia', 'mogadishu', 'mosul', 'sunni', 'kurdish',
       'anbar'], dtype=object)

In [22]:
ner[ner['tag'] == 'B-gpe']['lemma'].unique()[:20]

array(['british', 'english', 'britain', 'iran', 'iranian', 'u.s.',
       'tehran', 'european', 'german', 'nigerian', 'nigeria', 'somali',
       'iraqi', 'american', 'egyptian', 'bedfordshir', 'pakistani',
       'pakistan', 'unit', 'libya'], dtype=object)

In [23]:
ner[ner['tag'] == 'B-tim']['lemma'].unique()[:100]

array(['wednesday', 'tuesday', 'sunday', 'saturday', 'friday', 'thursday',
       '2001', '2004', 'septemb', '9', '1995', 'april', 'octob', '2002',
       '2003', 'decemb', 'novemb', 'august', '2000', 'june', '3', '2009',
       '2008', 'monday', 'juli', '1970s', '1980s', 'januari', 'march',
       'midnight', '1990', '2012', '2005', 'neolith', 'new', 'may',
       '1976', 'a.d.', '1940', '1991', '1994', '1990s', '2010', '1968',
       '2006', '2007', '19th', '1834', '1845', '1917', '1910', '1962',
       '2017', '1954', 'tombston', '27', 'each', '120', 'from',
       'three-day', 'first', 'two', 'end', 'of', 'recent', 'midday',
       'fourth', '1657', '70th', 'few', 'one', '1992', 'almost',
       'between', "'s", 'later', '13', 'in', 'kamdesh', 'more', 'u.s.-l',
       '80th', 'second', 'februari', '1974', 'today', 'releas', '1998',
       '13th', 'six', '1961', '2030', 'three', '11th', 'cosmonaut',
       '1960s', 'sinc', 'past', '20', 'four-year'], dtype=object)

In [24]:
ner.to_csv('data/entity-annotated-corpus/ner_labeled_data.tsv', sep='\t', header=False, index=False)

In [48]:
# Cleaned data
ner_cleaned = pd.read_csv("data/entity-annotated-corpus/ner_dataset.csv", encoding = "ISO-8859-1", error_bad_lines=False)
ner_cleaned.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [49]:
ner_cleaned.head(55)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [50]:
ner_cleaned.shape

(1048575, 4)

In [51]:
ner_cleaned.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [52]:
len(ner_cleaned.Tag.value_counts())

17

In [53]:
ner_cleaned.isna().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [54]:
ner_cleaned[ner_cleaned['Tag'] == 'B-per']['Word'].unique()[:20]

array(['Bush', 'President', 'Thomas', 'Prophet', 'Omar', 'Khayam',
       'Malik', 'Abdul', 'Khan', 'Nancy-Amelia', 'Tim', 'Harcourt',
       'Foreign', 'Mr.', 'Cholily', 'Azahari', 'Pope', 'Sister',
       'Byzantine', 'Ahmed'], dtype=object)

In [55]:
ner_cleaned[ner_cleaned['Tag'] == 'B-org']['Word'].unique()[:20]

array(['Labor', 'International', 'IAEA', 'European', 'U.N.', 'Bilfinger',
       'Royal-Dutch', 'Shell', 'al', 'Sunni', 'Muslim', 'Brotherhood',
       'Mutahida', 'Taleban', 'Islam', 'British', 'Home', 'Associated',
       'AP', 'al-Qaida'], dtype=object)

In [56]:
ner_cleaned[ner_cleaned['Tag'] == 'B-geo']['Word'].unique()[:20]

array(['London', 'Iraq', 'Hyde', 'Britain', 'Brighton', 'Rome', 'Paris',
       'Madrid', 'Vienna', 'Isfahan', 'Nigeria', 'Delta', 'Bayelsa',
       'Niger', 'Somalia', 'Mogadishu', 'Mosul', 'Sunni', 'Kurdish',
       'Anbar'], dtype=object)

In [57]:
ner_cleaned[ner_cleaned['Tag'] == 'B-gpe']['Word'].unique()[:20]

array(['British', 'English', 'Britain', 'Iran', 'Iranian', 'U.S.',
       'Tehran', 'European', 'Germans', 'Nigerian', 'German', 'Nigeria',
       'Somali', 'Iraqi', 'American', 'Egyptian', 'Bedfordshire',
       'Pakistani', 'Pakistan', 'United'], dtype=object)

In [58]:
ner_cleaned[ner_cleaned['Tag'] == 'B-tim']['Word'].unique()[:20]

array(['Wednesday', 'Tuesday', 'Sunday', 'Saturday', 'Friday', 'Thursday',
       '2001', '2004', 'September', '9', '1995', 'April', 'October',
       '2002', '2003', 'December', 'November', 'August', '2000', 'June'],
      dtype=object)

In [59]:
ner_cleaned[ner_cleaned['Tag'] == 'B-art']['Word'].unique()[:20]

array(['Nuclear', 'Saltillo', 'Pentastar', 'Chrysler', 'Dodge', 'Jeep',
       'Ram', 'Vioxx', 'The', 'Good', '20/20', 'Web', 'english', 'Daily',
       'alHurra', 'Charles', 'Baghdad', 'Association', 'Huygens',
       'Cassini'], dtype=object)

In [60]:
ner_cleaned[ner_cleaned['Tag'] == 'B-eve']['Word'].unique()[:20]

array(['2012', 'Games', '2008', 'Operation', 'Gulf', 'Australian',
       'Kooyong', 'World', 'Ashura', 'Christmas', 'Olympic', 'Delray',
       'II', 'Hurricane', 'Korean', 'Beijing', 'I', 'New', 'Eve', 'Grand'],
      dtype=object)

In [61]:
ner_cleaned[ner_cleaned['Tag'] == 'B-nat']['Word'].unique()[:20]

array(['H5N1', 'Jing', 'SARS', 'Severe', 'HIV', 'AIDS', 'Hurricane',
       'Katrina', 'Ills', 'H5N2', 'XDR-TB', 'Marburg', 'Tropical', 'H1N1',
       'Bird', 'heart', 'Type', 'Rita', 'Hurricanes', 'Amur'],
      dtype=object)

In [62]:
ner_cleaned.dropna().head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
24,Sentence: 2,Families,NNS,O
54,Sentence: 3,They,PRP,O
68,Sentence: 4,Police,NNS,O
83,Sentence: 5,The,DT,O


In [63]:
indices = ner_cleaned.dropna().index
indices[:20]

Int64Index([  0,  24,  54,  68,  83, 108, 132, 153, 181, 196, 221, 233, 267,
            296, 322, 362, 372, 394, 428, 448],
           dtype='int64')

In [64]:
delimiters = ner_cleaned.iloc[indices - 1]
delimiters.head()

Unnamed: 0,Sentence #,Word,POS,Tag
1048574,,attack,NN,O
23,,.,.,O
53,,"""",``,O
67,,.,.,O
82,,.,.,O


In [65]:
delimiters['Word'].value_counts()

.                 47068
"                   684
:                    30
U.S.                 21
'                    13
?                     8
...                   8
!                     7
D.C.                  5
Baghdad               4
)                     3
,                     3
country               3
that                  2
enter                 2
Garang                2
-                     2
reporters             2
Bush                  2
                     2
years                 2
Monday                2
..                    1
involvement           1
LRA..                 1
Akayeva               1
Iraq                  1
Base                  1
2010                  1
Party                 1
                  ...  
M.C.                  1
both                  1
door                  1
Grill                 1
20th                  1
prayer                1
million               1
Florida               1
Gnassingbe            1
Swaziland             1
The             

In [66]:
ner_out = ner_cleaned[['Sentence #', 'Word', 'Tag']]
ner_out.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,,of,O
2,,demonstrators,O
3,,have,O
4,,marched,O


In [67]:
ner_out['Sentence #'] = ner_out['Sentence #'].fillna('c')
ner_out.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,c,of,O
2,c,demonstrators,O
3,c,have,O
4,c,marched,O


In [68]:
def my_map(x):
    if x != 'c':
        _, n = x.split()
        return n
    else:
        return 'c'

ner_out['Sentence #'] = ner_out['Sentence #'].map(my_map)
ner_out.head(55)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Sentence #,Word,Tag
0,1,Thousands,O
1,c,of,O
2,c,demonstrators,O
3,c,have,O
4,c,marched,O
5,c,through,O
6,c,London,B-geo
7,c,to,O
8,c,protest,O
9,c,the,O


In [74]:
output_file = 'data/test_empty_line_inserted.tsv'

with open(output_file, mode='w') as f:
    for i, row in ner_out.iterrows():
        row_list = row.tolist()
        if i != 0 and row_list[0] != 'c':
             f.write('\n')
        f.write(row_list[1] + '\t' + row_list[2] + '\n')

In [69]:
# ner_out.to_csv('data/entity-annotated-corpus/ner_labeled_data.tsv', sep='\t', header=False, index=False)