In [24]:
from corpus import ConllCorpusReaderX

In [25]:
# All three splits are read from the same directory with the same
# four-column layout; only the file name differs between them.
corpus_root = './conll2003_dataset'
column_layout = ('words', 'pos', 'chunk', 'ne')

conll2003_train, conll2003_testa, conll2003_testb = (
    ConllCorpusReaderX(corpus_root, fileids=split_file, columntypes=column_layout)
    for split_file in ('eng.train.txt', 'eng.testa.dev.txt', 'eng.testb.test.txt')
)

In [26]:
# Sentence-level view: each sentence is a list of word strings.
# NOTE(review): the first sentence shown in the output is empty — presumably
# produced by a header or blank line in the file; confirm against the reader.
conll2003_train.sents()

[[], ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ...]

In [27]:
# Flat view of the same corpus: one word string per token.
conll2003_train.words()

['EU', 'rejects', 'German', 'call', 'to', 'boycott', ...]

In [28]:
# Pair every token with its named-entity tag; the output shows (word, tag)
# tuples whose tags are either 'O' or carry a position prefix such as 'S-'.
conll2003_train.get_tags(tags=["words", "ne"])

[('EU', 'S-ORG'), ('rejects', 'O'), ...]

In [29]:
# Number of tokens returned by words() for the training reader.
print(len(conll2003_train.words()))

203621


In [30]:
import numpy as np
from generator import Generator

# Only a prefix of the training corpus is used; features and labels must be
# cut at the same position so that X and y stay aligned token by token.
n_tokens = 4000

tagged_tokens = conll2003_train.get_tags(tags=['words', 'pos', 'chunk'])[:n_tokens]
# The generator is handed mutable lists rather than the tuples from get_tags().
words = [list(token) for token in tagged_tokens]

gen = Generator(column_types=['word', 'pos', 'chunk'], context_len=2)
X = gen.generate(words, path='./conll2003_train.npy')

# Index 1 of each get_ne() entry is taken as the label
# (presumably the NE tag of a (word, tag) pair — confirm against the reader).
ne_entries = conll2003_train.get_ne()[:n_tokens]
y = np.array([entry[1] for entry in ne_entries])

In [31]:
from sklearn.linear_model import LogisticRegression
# Classifier created with no arguments, i.e. scikit-learn's default settings.
clf = LogisticRegression()

In [32]:
# Rows 0-2999 train the classifier; rows 3000-3999 are held out and only
# used for prediction.
split_at, stop_at = 3000, 4000
train_rows = slice(None, split_at)
held_out_rows = slice(split_at, stop_at)

clf.fit(X[train_rows], y[train_rows])
y_pred = clf.predict(X[held_out_rows])

In [33]:
from sklearn.metrics import f1_score

# Align predictions with the gold tags of the held-out rows (3000-3999).
y_pred = np.asarray(y_pred)
y_true = np.asarray(y[3000:4000])

# 1-D boolean masks: True wherever the tag is anything other than 'O'.
# Comparing the arrays directly already yields one flag per token, so no
# list wrapping or reshape is needed to make the mask usable as an index.
y_pred_i = y_pred != 'O'
y_true_i = y_true != 'O'

# Keep a token when either the prediction or the gold tag is not 'O';
# tokens that are 'O' on both sides are dropped from the comparison.
ind = y_pred_i | y_true_i

y_pred_fixed = y_pred[ind]
y_true_fixed = y_true[ind]

In [35]:
def get_el(el):
    """Return the entity type of a tag, without its position prefix.

    Tags in this notebook are either ``"O"`` or a one-letter position marker
    followed by a dash and the entity type (e.g. ``"S-LOC"``, ``"B-MISC"``,
    ``"E-MISC"``).

    Args:
        el: A tag string.

    Returns:
        The part after the two-character prefix when the tag has one
        (``"S-LOC"`` -> ``"LOC"``); otherwise the tag unchanged. ``"O"`` and
        already-stripped labels such as ``"LOC"`` are therefore returned
        as-is, so applying the function twice is harmless.
    """
    # Strip only when the second character really is the dash separator.
    # Slicing (rather than indexing) keeps one-character tags like "O" safe.
    if el[1:2] == "-":
        return el[2:]
    return el
    
# Map both label sequences through get_el so predictions and gold labels are
# compared by entity type; the results are plain Python lists.
y_pred_fixed, y_true_fixed = (
    list(map(get_el, labels)) for labels in (y_pred_fixed, y_true_fixed)
)

In [39]:
# Support-weighted F1 restricted to the four entity types; "O" is not listed
# in `labels`, so it is not scored as a class of its own.
print(f1_score(y_true_fixed, y_pred_fixed, average="weighted", labels=["PER", "LOC", "MISC", "ORG"]))

0.600130936582


In [45]:
from collections import Counter

# With average=None one score per label is returned, in the order of `labels`
# (PER, LOC, ORG, MISC here).
per_class_f1 = f1_score(
    y_true_fixed,
    y_pred_fixed,
    average=None,
    labels=["PER", "LOC", "ORG", "MISC"],
)
print(per_class_f1)

# Gold-label frequencies among the kept tokens, for reading the scores above.
gold_counts = Counter(y_true_fixed)
print(gold_counts)

[ 0.65384615  0.74452555  0.18181818  0.625     ]
Counter({'LOC': 61, 'MISC': 27, 'ORG': 26, 'PER': 26, 'O': 17})


In [42]:
from itertools import islice
# Lazily pair each prediction with its gold tag for the held-out rows and
# skip the pairs where both sides are 'O'.
entity_pairs = (
    (pred, gold)
    for pred, gold in zip(y_pred, y[3000:4000])
    if not (pred == 'O' and gold == 'O')
)

# Show only the first ten such (predicted, gold) pairs.
for pair in islice(entity_pairs, 10):
    print(pair)

('O', 'S-LOC')
('S-LOC', 'S-LOC')
('O', 'B-MISC')
('S-MISC', 'E-MISC')
('S-MISC', 'S-MISC')
('S-LOC', 'S-LOC')
('S-LOC', 'S-LOC')
('S-LOC', 'O')
('S-MISC', 'S-MISC')
('S-LOC', 'S-LOC')
