In [59]:
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

from generator import Generator
from corpus import ConllCorpusReaderX

import warnings
warnings.filterwarnings('ignore')

In [67]:
factrueval_devset = ConllCorpusReaderX('./factrueval2016_dataset/', 
                              fileids='devset.txt', 
                              columntypes=['words', 'offset', 'len', 'ne'])

gen = Generator(column_types=['word'], context_len=2)

all_words = [[el] for el in factrueval_devset.words()]
len_words = len(all_words)

X_all = gen.fit_transform(all_words, path='./factrueval_allset.npz')
X_train = gen.transform(all_words[:len_words // 10 * 9], path='./factrueval_trainset.npz')
X_test = gen.transform(all_words[len_words // 10 * 9:], path='./factrueval_testset.npz')

In [68]:
print(type(X_test))

<class 'scipy.sparse.csr.csr_matrix'>


# One classifier for all classes (with prefixes)

In [69]:
y_all_first = np.array([el[1] for el in factrueval_devset.get_ne()])
y_all = y_all_first
y_train = y_all[:len_words // 10 * 9]
y_test = y_all[len_words // 10 * 9:]

In [5]:
clf = LogisticRegression()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [6]:
y_pred = np.array(y_pred)
y_true = np.array(y_test)

y_pred_i = np.array([y_pred != 'O'])
y_true_i = np.array([y_true != 'O'])

ind = (y_pred_i | y_true_i).reshape(y_pred.shape)

y_pred_fixed = y_pred[ind]
y_true_fixed = y_true[ind]

### Strict evaluation of results:

In [7]:
counter = Counter(y_true_fixed)
labels = list(counter.keys())
labels.remove("O")
results = f1_score(y_true_fixed, y_pred_fixed, average=None, labels=labels)
for a, b in zip(labels, results):
    print('F1 for {} == {}, with {} entities'.format(a, b, counter[a]))

F1 for B-Org == 0.42553191489361697, with 30 entities
F1 for E-Org == 0.3448275862068966, with 20 entities
F1 for B-Person == 0.782608695652174, with 31 entities
F1 for E-Person == 0.8450704225352113, with 31 entities
F1 for S-Org == 0.6181818181818183, with 30 entities
F1 for I-Org == 0.19672131147540983, with 44 entities
F1 for S-Location == 0.7476635514018691, with 57 entities
F1 for S-LocOrg == 0.5656565656565656, with 56 entities
F1 for I-Person == 0.0, with 6 entities
F1 for B-Location == 0.4444444444444445, with 14 entities
F1 for I-Location == 0.125, with 13 entities
F1 for E-Location == 0.15384615384615385, with 11 entities
F1 for S-Person == 0.3636363636363637, with 13 entities
F1 for B-LocOrg == 0.5714285714285715, with 4 entities
F1 for E-LocOrg == 0.0, with 2 entities
F1 for I-LocOrg == 0.0, with 2 entities


In [8]:
print(f1_score(y_true_fixed, y_pred_fixed, average="weighted", labels=list(counter.keys())))

0.500446439147


### Not strict evaluation of results:

In [9]:
def get_el(el):
    if el == "O":
        return el
    else:
        return el[2:]
    
y_pred_fixed_light = [get_el(el) for el in y_pred_fixed]
y_true_fixed_light = [get_el(el) for el in y_true_fixed]

In [10]:
light_counter = Counter(y_true_fixed_light)
light_labels = list(light_counter.keys())
light_labels.remove("O")
print(light_counter)
light_results = f1_score(y_true_fixed_light, y_pred_fixed_light, average=None, labels=light_labels)
for a, b in zip(light_labels, light_results):
    print('F1 for {} == {}, with {} entities'.format(a, b, light_counter[a]))

Counter({'Org': 124, 'Location': 95, 'Person': 81, 'LocOrg': 64, 'O': 12})
F1 for Org == 0.46875, with 124 entities
F1 for Person == 0.770949720670391, with 81 entities
F1 for Location == 0.6233766233766235, with 95 entities
F1 for LocOrg == 0.5454545454545454, with 64 entities


In [11]:
print(f1_score(y_true_fixed_light, y_pred_fixed_light, average="weighted", labels=light_labels))

0.589840103033


# One classifier for all classes (without prefixes)

In [12]:
y_all = [get_el(el) for el in y_all_first]
y_train = y_all[:len_words // 10 * 9]
y_test = y_all[len_words // 10 * 9:]

In [13]:
clf = LogisticRegression()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [14]:
y_pred = np.array(y_pred)
y_true = np.array(y_test)

y_pred_i = np.array([y_pred != 'O'])
y_true_i = np.array([y_true != 'O'])

ind = (y_pred_i | y_true_i).reshape(y_pred.shape)

y_pred_fixed = y_pred[ind]
y_true_fixed = y_true[ind]

### Not strict evaluation of results

In [15]:
light_counter = Counter(y_true_fixed)
light_labels = list(light_counter.keys())
print(light_counter)
light_results = f1_score(y_true_fixed, y_pred_fixed, average=None, labels=light_labels)
for a, b in zip(light_labels, light_results):
    print('F1 for {} == {}, with {} entities'.format(a, b, light_counter[a]))

Counter({'Org': 124, 'Location': 95, 'Person': 81, 'LocOrg': 64, 'O': 10})
F1 for Org == 0.45595854922279794, with 124 entities
F1 for Person == 0.8021978021978022, with 81 entities
F1 for Location == 0.6580645161290322, with 95 entities
F1 for LocOrg == 0.5576923076923076, with 64 entities
F1 for O == 0.0, with 10 entities


In [16]:
print(f1_score(y_true_fixed, y_pred_fixed, average="weighted", labels=light_labels))

0.587500852423


# Different classifiers for different classes (without prefixes and with prefixes)

In [17]:
def run_diff_classes(template, prefixes=False):
    
    def get_only(el, template):
        if el[2:] == template:
            if prefixes:
                return el
            else:
                return template
        else:
            return "O"

    y_all = [get_only(el, template) for el in y_all_first]
    y_train = y_all[:len_words // 10 * 9]
    y_test = y_all[len_words // 10 * 9:]
    
    clf = LogisticRegression()

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    y_pred = np.array(y_pred)
    y_true = np.array(y_test)

    y_pred_i = np.array([y_pred != 'O'])
    y_true_i = np.array([y_true != 'O'])

    ind = (y_pred_i | y_true_i).reshape(y_pred.shape)

    y_pred_fixed = y_pred[ind]
    y_true_fixed = y_true[ind]
    
    light_counter = Counter(y_true_fixed)
    labels = list(light_counter.keys())
    labels.remove("O")
    print(labels)
    light_result = f1_score(y_true_fixed, y_pred_fixed, average="weighted", labels=labels)
    print('F1 for {} == {}, with {} entities'.format(template, light_result, light_counter[template]))
        
    return light_result, light_counter[template]

In [18]:
result1, weight1 = run_diff_classes("Person")

['Person']
F1 for Person == 0.8366013071895425, with 81 entities


In [19]:
result2, weight2 = run_diff_classes("Org")

['Org']
F1 for Org == 0.3953488372093023, with 124 entities


In [20]:
result3, weight3 = run_diff_classes("LocOrg")

['LocOrg']
F1 for LocOrg == 0.4827586206896551, with 64 entities


In [21]:
result4, weight4 = run_diff_classes("Location")

['Location']
F1 for Location == 0.5467625899280576, with 95 entities


In [22]:
total_weight = weight1 + weight2 + weight3 + weight4
total_result = (result1 * weight1 + result2 * weight2 + result3 * weight3 + result4 * weight4) / total_weight

In [23]:
print(total_result)

0.548425712812


In [24]:
result1, weight1 = run_diff_classes("Person", prefixes=True)

['B-Person', 'E-Person', 'I-Person', 'S-Person']
F1 for Person == 0.7247599451303155, with 0 entities


In [25]:
result2, weight2 = run_diff_classes("Org", prefixes=True)

['B-Org', 'E-Org', 'S-Org', 'I-Org']
F1 for Org == 0.302298742425133, with 0 entities


In [26]:
result3, weight3 = run_diff_classes("LocOrg", prefixes=True)

['S-LocOrg', 'B-LocOrg', 'E-LocOrg', 'I-LocOrg']
F1 for LocOrg == 0.37333333333333335, with 0 entities


In [27]:
result4, weight4 = run_diff_classes("Location", prefixes=True)

['S-Location', 'B-Location', 'I-Location', 'E-Location']
F1 for Location == 0.4411899313501145, with 0 entities


### Почему результат ухудшился при раздельной классификации?

### Пока причина этого непонятна, для отбора признаков воспользуемся вторым классификатором:


In [70]:
y_all_first = np.array([el[1] for el in factrueval_devset.get_ne()])
y_all = [get_el(el) for el in y_all_first]
y_train = y_all[:len_words // 10 * 9]
y_test = y_all[len_words // 10 * 9:]

In [83]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

model = SelectFromModel(clf, prefit=True, threshold="mean")
X_train_T = model.transform(X_train)
X_test_T = model.transform(X_test)

In [84]:
print(X_train.shape, X_train_T.shape)

(27846, 29819) (27846, 7509)


In [85]:
clf = LogisticRegression()
clf.fit(X_train_T, y_train)
y_pred = clf.predict(X_test_T)

In [86]:
y_pred = np.array(y_pred)
y_true = np.array(y_test)

y_pred_i = np.array([y_pred != 'O'])
y_true_i = np.array([y_true != 'O'])

ind = (y_pred_i | y_true_i).reshape(y_pred.shape)

y_pred_fixed = y_pred[ind]
y_true_fixed = y_true[ind]

In [87]:
light_counter = Counter(y_true_fixed)
light_labels = list(light_counter.keys())
print(light_counter)
light_results = f1_score(y_true_fixed, y_pred_fixed, average=None, labels=light_labels)
for a, b in zip(light_labels, light_results):
    print('F1 for {} == {}, with {} entities'.format(a, b, light_counter[a]))

Counter({'Org': 124, 'Location': 95, 'Person': 81, 'LocOrg': 64, 'O': 10})
F1 for Org == 0.45595854922279794, with 124 entities
F1 for Person == 0.7955801104972375, with 81 entities
F1 for Location == 0.6493506493506493, with 95 entities
F1 for LocOrg == 0.5660377358490566, with 64 entities
F1 for O == 0.0, with 10 entities


In [88]:
print(f1_score(y_true_fixed, y_pred_fixed, average="weighted", labels=light_labels))

0.585282288333
