# Deliverable 2

- Deliverable 2 will be a NER (Named entity recognition system).


## Overview of the data

url = https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

In [3]:
# Path to the dataset
path = "data/"

In [4]:
data = pd.read_csv(path+"ner_dataset.csv",
                   encoding="latin1")

In [5]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


 Fill with "Sentence: k" for each k

In [6]:
sentences = list(set(data["Sentence #"]))
sentences[0] = "nan"
sentences.sort()

len(sentences)

47960

In [7]:
sentences[0:3]

['Sentence: 1', 'Sentence: 10', 'Sentence: 100']

In [8]:
set(data["Tag"])

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [9]:
for tag in set(data["Tag"]):
    print("\nTAG:",tag)
    print(data[data["Tag"] == tag]["Word"][0:10])


TAG: I-per
271         Mahmoud
272     Ahmadinejad
332         Horbach
444       Abdullahi
445           Yusuf
446           Ahmad
966        Muhammad
974          Khayam
1106     Faridullah
1107           Khan
Name: Word, dtype: object

TAG: I-eve
4854      Summer
4855    Olympics
5036     Olympic
5171      Medusa
5764         War
6730        Open
6756     Classic
6834        Open
9990         War
9991          II
Name: Word, dtype: object

TAG: B-tim
167    Wednesday
211    Wednesday
274      Tuesday
341    Wednesday
493    Wednesday
654       Sunday
679     Saturday
684       Friday
740     Saturday
848     Thursday
Name: Word, dtype: object

TAG: I-nat
5045            Jing
5074            Jing
12509          Acute
12510    Respiratory
12511       Syndrome
22948        Katrina
23055        Katrina
29719        Katrina
34813        Katrina
68389        Katrina
Name: Word, dtype: object

TAG: B-geo
6        London
12         Iraq
65         Hyde
94      Britain
106    Brighton
118   

## Indexing Sentences

In [10]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(0) in sentences

False

In [11]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(1) in sentences

True

In [12]:
i = 1

sentence_id      = sentence_formatter.format(i)
sentence_id_next = sentence_formatter.format(i+1)
sentence_id, sentence_id_next

('Sentence: 1', 'Sentence: 2')

In [13]:
print(data.index[data["Sentence #"] == sentence_id])
print(data.index[data["Sentence #"] == sentence_id_next])

Int64Index([0], dtype='int64')
Int64Index([24], dtype='int64')


In [14]:
start = data.index[data["Sentence #"] == sentence_id][0]
end   =  data.index[data["Sentence #"] == sentence_id_next][0]
start, end

(0, 24)

In [15]:
data["Sentence #"][start:end] = sentence_id

In [16]:
data["Sentence #"][start:end]

0     Sentence: 1
1     Sentence: 1
2     Sentence: 1
3     Sentence: 1
4     Sentence: 1
5     Sentence: 1
6     Sentence: 1
7     Sentence: 1
8     Sentence: 1
9     Sentence: 1
10    Sentence: 1
11    Sentence: 1
12    Sentence: 1
13    Sentence: 1
14    Sentence: 1
15    Sentence: 1
16    Sentence: 1
17    Sentence: 1
18    Sentence: 1
19    Sentence: 1
20    Sentence: 1
21    Sentence: 1
22    Sentence: 1
23    Sentence: 1
Name: Sentence #, dtype: object

## Selecting a subset and writting an identifier

In [17]:
data = pd.read_csv(path+"ner_dataset.csv",
                   encoding="latin1")

last_n = 2000
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]

In [18]:
data = data[0:end]

In [19]:
n_sentences = len(list(set(data["Sentence #"])))
first_n = 1
last_n = last_n -1
print(n_sentences)

2000


In [20]:
# %%time 
sentence_formatter = "Sentence: {}"

for s_id in tqdm(range(first_n, last_n)):
#     print("current {}/{}".format(s_id,last_n), end="\r")
    sentence_id = sentence_formatter.format(s_id)
    sentence_id_next = sentence_formatter.format(s_id + 1)
    start = data.index[data["Sentence #"] == sentence_id][0]
    end   = data.index[data["Sentence #"] == sentence_id_next][0]
    data["Sentence #"][start:end] = sentence_id
    
sentence_id = sentence_formatter.format(last_n)
start = data.index[data["Sentence #"] == sentence_id][0]
end   = data.shape[0]
data["Sentence #"][start:end] = sentence_id


HBox(children=(FloatProgress(value=0.0, max=1998.0), HTML(value='')))


CPU times: user 13.2 s, sys: 280 ms, total: 13.4 s
Wall time: 13 s


data = pd.read_csv(path+"ner_dataset.csv",
                   encoding="latin1").fillna(method="ffill")

## Building X and Y

In [21]:
n_sentences

2000

In [22]:
X_txt = []
Y_txt = []

sentence_formatter = "Sentence: {}"

for i in range(1,n_sentences):
    s = sentence_formatter.format(i)
    X_txt.append(list(data[data["Sentence #"]==s]["Word"].values))
    Y_txt.append(list(data[data["Sentence #"]==s]["Tag"].values))

In [23]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X_txt[i],Y_txt[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

In [24]:
def build_word_to_pos(X):

    word_to_pos = {}
    i = 0
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i +=1
                
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i +=1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}

    return tag_to_pos, pos_to_tag

In [25]:
word_to_pos, pos_to_word = build_word_to_pos(X_txt)
tag_to_pos, pos_to_tag  = build_tag_to_pos(Y_txt)

len(word_to_pos), len(tag_to_pos)

(7047, 17)

In [26]:
tag_to_pos

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [27]:
X = [[word_to_pos[w] for w in s] for s in X_txt]
Y = [[tag_to_pos[t] for t in s] for s in Y_txt]

## HMM 

In [28]:
import scipy
import numpy as np

import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

import skseq

In [29]:
import skseq
import skseq.sequences
import skseq.readers

from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [30]:
# Label dictionary containing all the words 
x_dict = label_dictionary.LabelDictionary(label_names=list(word_to_pos.keys()))

# Label dictionary containing all the tags
y_dict = label_dictionary.LabelDictionary(label_names=list(tag_to_pos.keys()))

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [33]:
# List of sequences
train_seq = sequence_list.SequenceList(x_dict,y_dict)
train_seq.seq_list = [sequence.Sequence(x,y) for x,y in zip(X_train,Y_train)]

test_seq = sequence_list.SequenceList(x_dict,y_dict)
test_seq.seq_list = [sequence.Sequence(x,y) for x,y in zip(X_test,Y_test)]

In [34]:
# Decodification of train sentence 0
print(train_seq[0],'\n\n',train_seq[0].to_words(sequence_list=train_seq))

61/0 5672/0 2194/3 509/0 352/0 13/0 2554/2 1189/0 19/0 9/0 5673/0 369/0 279/0 994/0 107/7 53/0 882/0 1070/0 11/0 9/0 813/0 832/0 1/0 5674/1 21/0  

 The/O 85-year-old/O Obama/B-per told/O police/O and/O Kenyan/B-gpe media/O that/O the/O break-in/O attempt/O occurred/O early/O Wednesday/B-tim at/O her/O home/O in/O the/O western/O village/O of/O Kogelo/B-geo ./O 


In [35]:
# Decodification of train sentence 0
print(test_seq[0], '\n\n', test_seq[0].to_words(sequence_list=test_seq))

367/0 1171/0 18/0 9/0 1384/0 1569/0 172/0 9/0 1566/0 68/0 1570/0 1571/0 1069/0 11/0 247/0 1572/0 21/0  

 A/O statement/O from/O the/O health/O ministry/O said/O the/O woman/O 's/O family/O kept/O chickens/O in/O their/O backyard/O ./O 


### From notebook 07

In [36]:
def update_initial_counts(initial_counts, seq, state_to_pos):
    initial_counts[state_to_pos[seq.y[0]]] +=  1

def update_transition_counts(transition_counts, seq, state_to_pos):
    for (t_prev,t) in zip(seq.y[:-1], seq.y[1:]):
        transition_counts[state_to_pos[t], state_to_pos[t_prev]] += 1 

def update_emission_counts(emission_counts, seq, state_to_pos, word_to_pos):
    for (t,w) in zip(seq.y, seq.x):
        emission_counts[state_to_pos[t], word_to_pos[w]] += 1 
        
def update_final_counts(final_counts, seq, state_to_pos):
    final_counts[state_to_pos[seq.y[-1]]] +=1

In [37]:
def sufficient_statistics_hmm(sequences, state_to_pos, word_to_pos):
    
    n_states = len(state_to_pos)
    n_words  = len(word_to_pos)
    initial_counts      = np.zeros((n_states))
    transition_counts   = np.zeros((n_states, n_states))
    final_counts        = np.zeros((n_states))
    emission_counts     = np.zeros((n_states, n_words))
    
    for seq in sequences:
        update_initial_counts(initial_counts, seq, state_to_pos)
        update_transition_counts(transition_counts, seq,  state_to_pos)
        update_emission_counts(emission_counts, seq,  state_to_pos, word_to_pos) 
        update_final_counts(final_counts, seq,  state_to_pos) 
    
    return initial_counts, transition_counts, final_counts, emission_counts

In [38]:
X_train_txt, X_test_txt, Y_train_txt, Y_test_txt = train_test_split(X_txt, Y_txt, test_size=0.1)

In [39]:
train_seq_txt = [sequence.Sequence(x,y) for x,y in zip(X_train_txt,Y_train_txt)]
test_seq_txt = [sequence.Sequence(x,y) for x,y in zip(X_test_txt,Y_test_txt)]
all_seq_txt = [sequence.Sequence(x,y) for x,y in zip(X_txt,Y_txt)]

In [40]:
counts = sufficient_statistics_hmm(all_seq_txt, 
                                   tag_to_pos,
                                   word_to_pos);

In [41]:
initial_counts, transition_counts, final_counts, emission_counts = counts

In [42]:
M = len(all_seq_txt)
N = len(all_seq_txt[0].x)
print("M:\t", M, "\nN:\t", N,"\nM*N:\t", M*N)

M:	 1999 
N:	 24 
M*N:	 47976


In [43]:
print("initial_counts sum: ", np.sum(initial_counts))
print("emission_counts sum: ", np.sum(emission_counts))
print("transition and final counts sum: ",\
       np.sum(transition_counts) + sum(final_counts))

initial_counts sum:  1999.0
emission_counts sum:  44375.0
transition and final counts sum:  44375.0


In [44]:
initial_probs    = (initial_counts / np.sum(initial_counts))
transition_probs = transition_counts/(np.sum(transition_counts,0) + final_counts)
final_probs      = final_counts/(np.sum(transition_counts, 0) + final_counts )
emission_probs   = emission_counts.T / np.sum(emission_counts, 1)

print("\ninitial_probs")
print(initial_probs)

print("\ntransition_probs")
print(transition_probs)

print("\nfinal_probs")
print(final_probs)

print("\nemission_probs")
print(emission_probs)


initial_probs
[7.19859930e-01 4.60230115e-02 8.70435218e-02 9.00450225e-02
 0.00000000e+00 4.75237619e-02 0.00000000e+00 9.00450225e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 5.00250125e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00]

transition_probs
[[8.45657293e-01 7.93157076e-01 8.64253394e-01 1.99712644e-01
  8.83534137e-01 5.69794050e-01 5.48701299e-01 8.04941482e-01
  6.73913043e-01 5.20000000e-01 6.67901235e-01 8.33333333e-01
  6.63461538e-01 6.11111111e-01 4.06250000e-01 7.50000000e-01
  7.77777778e-01]
 [3.14029472e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.14416476e-03 0.00000000e+00 3.90117035e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 3.33333333e-02
  9.61538462e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [1.83073626e-02 5.44323484e-03 0.00000000e+00 5.74712644e-03
  0.00000000e+00 1.14416476e-03 3.24675325e-03 3.90117035e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.0

In [45]:
transitions_with_final_counts = np.vstack((transition_counts,
                                           final_counts))

In [46]:
def logzero():
    return -np.inf


def safe_log(x):
    print(x)
    if x == 0:
        return logzero()
    return np.log(x)


def logsum_pair(logx, logy):
    """
    Return log(x+y), avoiding arithmetic underflow/overflow.

    logx: log(x)
    logy: log(y)

    Rationale:

    x + y    = e^logx + e^logy
             = e^logx (1 + e^(logy-logx))
    log(x+y) = logx + log(1 + e^(logy-logx)) (1)

    Likewise,
    log(x+y) = logy + log(1 + e^(logx-logy)) (2)

    The computation of the exponential overflows earlier and is less precise
    for big values than for small values. Due to the presence of logy-logx
    (resp. logx-logy), (1) is preferred when logx > logy and (2) is preferred
    otherwise.
    """
    if logx == logzero():
        return logy
    elif logx > logy:
        return logx + np.log1p(np.exp(logy-logx))
    else:
        return logy + np.log1p(np.exp(logx-logy))


def logsum(logv):
    """
    Return log(v[0]+v[1]+...), avoiding arithmetic underflow/overflow.
    """
    res = logzero()
    for val in logv:
        res = logsum_pair(res, val)
    return res

In [47]:
class HMM(object):
    
    def __init__(self, word_to_pos={}, state_to_pos={}):
        self.fitted = False
        self.counts = {"emission": None, "transition":None, "final":None, "initial":None}
        self.probs  = {"emission": None, "transition":None, "final":None, "initial":None}
        self.scores = {"emission": None, "transition":None, "final":None, "initial":None}
        self.decode = set(["posterior", "viterbi"])
        self.word_to_pos  = word_to_pos
        self.state_to_pos = state_to_pos
        self.pos_to_word  = {v: k for k, v in word_to_pos.items()}
        self.pos_to_state = {v: k for k, v in state_to_pos.items()}
    
        self.n_states     = len(state_to_pos)
        self.n_words      = len(word_to_pos)
        self.fitted = False

    def fit(self, observation_lables: list, state_labels: list):
        """
        Computes and saves: counts, probs, scores.
        """
        if self.state_to_pos is None or self.word_to_pos is None:
            print("Error state_to_pos or word_to_pos needed to be defined")
            return
            
        self.counts = self.sufficient_statistics_hmm(observation_lables, state_labels)       
        self.probs  = self.compute_probs(self.counts)  
        self.scores = self.compute_scores(self.probs)  
        self.fitted = True
        
    def sufficient_statistics_hmm(self, observation_lables, state_labels):

        state_to_pos, word_to_pos = self.state_to_pos, self.word_to_pos
        
        def update_initial_counts(initial_counts, seq_x, state_to_pos):
            initial_counts[state_to_pos[seq_x[0]]] +=  1
            
        def update_transition_counts(transition_counts, seq_y, state_to_pos):
            for (t_prev, t) in zip(seq_y[:-1], seq_y[1:]):
                transition_counts[state_to_pos[t], state_to_pos[t_prev]] += 1 

        def update_emission_counts(emission_counts, seq_x, seq_y, state_to_pos, word_to_pos):
            for (t,x) in zip(seq_y, seq_x):
                emission_counts[state_to_pos[t], word_to_pos[x]] += 1 
                
        def update_final_counts(final_counts, seq_y, state_to_pos):
            final_counts[state_to_pos[seq_y[-1]]] +=1

        n_states = len(state_to_pos)
        n_words  = len(word_to_pos)
        initial_counts      = np.zeros((n_states))
        transition_counts   = np.zeros((n_states, n_states))
        final_counts        = np.zeros((n_states))
        emission_counts     = np.zeros((n_states, n_words))

        for seq_x, seq_y in zip(observation_lables, state_labels):
            update_initial_counts(initial_counts, seq_y, state_to_pos)
            update_transition_counts(transition_counts, seq_y,  state_to_pos)
            update_emission_counts(emission_counts, seq_x, seq_y, state_to_pos, word_to_pos) 
            update_final_counts(final_counts, seq_y,  state_to_pos) 

        return {"emission":   emission_counts, 
                "transition": transition_counts,
                "final":      final_counts, 
                "initial":    initial_counts}
    
    def compute_probs(self, counts):
        
        initial_counts    = counts['initial']
        transition_counts = counts['transition']
        emission_counts   = counts['emission']
        final_counts      = counts['final']

        initial_probs    = (initial_counts / np.sum(initial_counts))
        transition_probs = transition_counts/(np.sum(transition_counts,0) + final_counts)
        final_probs      = final_counts/(np.sum(transition_counts, 0) + final_counts )
        emission_probs   = (emission_counts.T / np.sum(emission_counts, 1)).T
    
        return {"emission":   emission_probs, 
                "transition": transition_probs,
                "final":      final_probs, 
                "initial":    initial_probs}
    
    def compute_scores(self, probs):
         return {"emission":   np.log(probs["emission"]), 
                 "transition": np.log(probs["transition"]),
                 "final":      np.log(probs["final"]), 
                 "initial":    np.log(probs["initial"])}
        
    def forward_computations(self, x: list):
        forward_x = None
        return forward_x
    
    def backward_computations(self, x:list):
        backward_x = None
        return backward_x
    
    def log_forward_computations(self, x: list):
        """
        Compute the log_forward computations

        Assume there are S possible states and a sequence of length N.
        This method will compute iteritavely the log_forward quantities.

        * log_f is a S x N Array.
        * log_f_x[:,i] will contain the forward quantities at position i.
        * log_f_x[:,i] is a vector of size S.
        
        Returns
        - log_f_x: Array of size K x N
        """ 
        n_x = len(x)
        
        # log_f_x initialized to -Inf because log(0) = -Inf
        log_f_x = np.zeros((self.n_states, n_x)) - np.Inf
        x_emission_scores = np.array([hmm.scores['emission'][:, hmm.word_to_pos[w]] for w in x]).T
        
        log_f_x[:,0] = x_emission_scores[:, 0] + self.scores['initial']
        for n in range(1, n_x):
            for s in range(self.n_states):
                log_f_x[s,n] = logsum(log_f_x[:,n-1] + self.scores['transition'][s,:]) + x_emission_scores[s,n]

        log_likelihood = logsum(log_f_x[:,n_x-1] + self.scores['final']) 
        return log_f_x, log_likelihood # log(P(X=x))
    
    
    def log_backward_computations(self, x: list):
        n_x = len(x)
        
        # log_f_x initialized to -Inf because log(0) = -Inf
        log_b_x = np.zeros((self.n_states, n_x)) - np.Inf
        x_emission_scores = np.array([hmm.scores['emission'][:, hmm.word_to_pos[w]] for w in x]).T
        log_b_x[:,-1] = self.scores['final']

        for n in range(n_x-2, -1, -1):
            for s in range(self.n_states):
                log_b_x[s,n] = logsum(log_b_x[:,n+1] + self.scores['transition'][:,s] + x_emission_scores[:,n+1])

        log_likelihood = logsum(log_b_x[:,0] + self.scores['initial'] + x_emission_scores[:,0]) 
        return log_b_x, log_likelihood  # log(P(X=x))
        
    def predict_labels(self, x: list, decode="posterior"):
        """
        Retuns a sequence of states for each word in **x**.
        The output depends on the **decode** method chosen.
        """
        assert decode in self.decode, "decode `{}` is not valid".format(decode)
        
        if decode is 'posterior':
            return self.posterior_decode(x)
        
        if decode is 'viterbi':
            return self.viterbi_decode(x)

    def compute_state_posteriors(self, x:list):
        log_f_x, log_likelihood = self.log_forward_computations(x)
        log_b_x, log_likelihood = self.log_backward_computations(x)
        state_posteriors = np.zeros((self.n_states, len(x)))
        
        for pos in range(len(x)):
            state_posteriors[:, pos] = log_f_x[:, pos] + log_b_x[:, pos] - log_likelihood
        return state_posteriors

    def posterior_decode(self, x: list, decode_states=True):
        
        state_posteriors = self.compute_state_posteriors(x)
        y_hat = state_posteriors.argmax(axis=0)
        
        if decode_states:
            y_hat = [hmm.pos_to_state[y] for y in y_hat]
            
        return y_hat

In [48]:
hmm = HMM(word_to_pos, tag_to_pos)

In [49]:
hmm.fit(X_train_txt, Y_train_txt)



In [50]:
tot = 0
err = 0

mstks = []
correct = []

for i in tqdm(range(len(X_test_txt))):
    pred = hmm.predict_labels(X_test_txt[i])
    s = sum(v1!=v2 for v1,v2 in list(zip(pred, Y_test_txt[i])))
    err+= s
    tot+=len(Y_test_txt[i])
    
    toappend = mstks if s!=0 else correct
    toappend.append(pd.DataFrame([X_test_txt[i], pred, Y_test_txt[i]]))
    
print("Accuracy: {:6.4f}".format(1-err/tot))

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




Accuracy: 0.8624


In [51]:
correct[5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,But,he,says,those,who,become,corrupt,will,be,punished,without,mercy,.
1,O,O,O,O,O,O,O,O,O,O,O,O,O
2,O,O,O,O,O,O,O,O,O,O,O,O,O


In [52]:
mstks[5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,China,has,lashed,out,at,a,U.S.,report,critical,of,Chinese,policies,on,religious,freedom,",",saying,such,criticism,could,harm,U.S.,-,China,relations,.
1,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O
2,B-gpe,O,O,O,O,O,B-gpe,O,O,O,B-gpe,O,O,O,O,O,O,O,O,O,O,B-gpe,O,B-gpe,O,O
