In [14]:
import os
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from generator import Generator
from corpus import ConllCorpusReaderX

import warnings
warnings.filterwarnings('ignore')

# Paths of the .npz files handed to the Generator calls through their `path=` argument.
# NOTE(review): presumably on-disk caches of the generated feature matrices — confirm in generator.py.
EC_PATH = "./bsnlp_ec.npz"
TRUMP_PATH = "./bsnlp_trump.npz"
TRAINSET_PATH = "./factrueval_trainset.npz"

In [15]:
# BSNLP evaluation corpora: each file is read with two columns (word, NE tag).
trump_dataset = ConllCorpusReaderX(
    './bsnlp_dataset/',
    fileids='trump.txt',
    columntypes=('words', 'ne'),
)

eu_dataset = ConllCorpusReaderX(
    './bsnlp_dataset/',
    fileids='ec.txt',
    columntypes=('words', 'ne'),
)

# FactRuEval dev set, used further down as the training data; this file is
# read with four columns (word, offset, length, NE tag).
factrueval_devset = ConllCorpusReaderX(
    '../FactRuEval/factrueval2016_dataset/',
    fileids='devset.txt',
    columntypes=['words', 'offset', 'len', 'ne'],
)

In [22]:
# Feature generator configured with the single 'WORD' column and context_len=2.
# NOTE(review): context_len presumably is the number of neighbouring tokens used
# as context — confirm in generator.py.
gen = Generator(column_types=['WORD'], context_len=2)

# NOTE(review): assumed mapping from FactRuEval entity types to the BSNLP tag
# set seen in the evaluation labels (ORG / LOC / PER / MISC). Confirm it against
# the tags actually present in devset.txt and extend it if needed (for example,
# decide whether a "Project"-like type should become MISC).
_FACTRUEVAL_TO_BSNLP = {
    'PERSON': 'PER',
    'PER': 'PER',
    'ORG': 'ORG',
    'LOCATION': 'LOC',
    'LOCORG': 'LOC',
    'LOC': 'LOC',
}


def get_class(el):
    """Normalise one training NE tag to the tag vocabulary of the test sets.

    The original definition was left unfinished (``if el ==``), which made the
    whole cell a syntax error. This version keeps the outside tag ``"O"``
    untouched, preserves any position prefix in front of the first ``-``
    (``B-``, ``I-``, ``E-``, ``S-``) and translates the entity type through
    ``_FACTRUEVAL_TO_BSNLP`` case-insensitively.

    Parameters
    ----------
    el : str
        A single NE tag, e.g. ``"O"`` or ``"B-Person"``.

    Returns
    -------
    str
        The normalised tag. Tags whose entity type is not in the mapping are
        returned unchanged, so an incomplete mapping never drops a label.
    """
    if el == 'O':
        return el

    prefix, dash, entity = el.partition('-')
    if not dash:
        # No position prefix at all: the whole tag is the entity type.
        return _FACTRUEVAL_TO_BSNLP.get(el.upper(), el)

    return prefix + dash + _FACTRUEVAL_TO_BSNLP.get(entity.upper(), entity)

# Training labels: element 1 of every item returned by get_ne(), passed
# through get_class.
Y_train = [get_class(item[1]) for item in factrueval_devset.get_ne()]

# Evaluation labels are taken as they are, without get_class.
Y_test_eu = [item[1] for item in eu_dataset.get_ne()]
Y_test_trump = [item[1] for item in trump_dataset.get_ne()]

# Every word is wrapped in a one-element list before being given to the
# generator. The generator is fitted on the training words only and then
# reused for both evaluation corpora.
X_train = gen.fit_transform(
    [[word] for word in factrueval_devset.words()],
    Y_train,
    path=TRAINSET_PATH,
)
X_test_eu = gen.transform(
    [[word] for word in eu_dataset.words()],
    path=EC_PATH,
)
X_test_trump = gen.transform(
    [[word] for word in trump_dataset.words()],
    path=TRUMP_PATH,
)

In [23]:
def clean(Y_pred, Y_test):
    """Drop every position where both the prediction and the gold tag are 'O'.

    Both inputs are converted to numpy arrays. A position is kept when at
    least one of the two tags differs from 'O'.

    Returns
    -------
    tuple of numpy.ndarray
        ``(filtered predictions, filtered gold tags)``.
    """
    predicted = np.array(Y_pred)
    gold = np.array(Y_test)

    # True wherever either side carries an entity tag.
    keep = ((predicted != 'O') | (gold != 'O')).reshape(predicted.shape)

    return predicted[keep], gold[keep]

In [24]:
def _strip_position_prefix(tag):
    """Return ``tag`` without its two-character position prefix; "O" is returned as is."""
    return tag if tag == "O" else tag[2:]


def _print_f1_report(title, y_true, y_pred, show_counts=False):
    """Print per-label and weighted F1 for every non-"O" label found in ``y_true``."""
    counter = Counter(y_true)
    # Filter rather than list.remove("O"): after clean() an "O" gold tag only
    # survives where the model predicted an entity, so "O" can be absent and
    # remove() would raise ValueError.
    labels = [label for label in counter if label != "O"]

    print("")
    print(title)
    if show_counts:
        print(counter)

    if not labels:
        # f1_score cannot be computed over an empty label list.
        print("No entity labels to evaluate")
        return

    per_label = f1_score(y_true, y_pred, average=None, labels=labels)
    for label, score in zip(labels, per_label):
        print('F1 for {} == {}, with {} entities'.format(label, score, counter[label]))

    # "O" is left out of the weighted score in both reports, so the strict and
    # the non-strict numbers are computed in the same way.
    print("Weighted Score:", f1_score(y_true, y_pred, average="weighted", labels=labels))


def run_baseline(clf=None, X_test=None, Y_test=None):
    """Fit ``clf`` on the training set and print strict / non-strict F1 reports.

    Parameters
    ----------
    clf : estimator with ``fit`` / ``predict``, optional
        Classifier to evaluate. A fresh ``LogisticRegression()`` is created on
        every call when omitted, instead of sharing one instance built at
        definition time.
    X_test, Y_test : optional
        Evaluation features and gold tags. Both default to the EU corpus
        (``X_test_eu`` / ``Y_test_eu``); pass e.g. ``X_test_trump`` and
        ``Y_test_trump`` to score another corpus.

    Raises
    ------
    ValueError
        If only one of ``X_test`` / ``Y_test`` is supplied.

    Notes
    -----
    "Strict" compares full tags (position prefix + entity type). "Not strict"
    compares tags after the first two characters are stripped, so only the
    entity type has to match. Positions where gold and prediction are both "O"
    are removed by ``clean`` before scoring.
    """
    if (X_test is None) != (Y_test is None):
        raise ValueError("X_test and Y_test must be passed together")
    if clf is None:
        clf = LogisticRegression()
    if X_test is None:
        X_test, Y_test = X_test_eu, Y_test_eu

    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    Y_pred_c, Y_test_c = clean(Y_pred, Y_test)

    Y_pred_c_light = [_strip_position_prefix(tag) for tag in Y_pred_c]
    Y_test_c_light = [_strip_position_prefix(tag) for tag in Y_test_c]

    _print_f1_report("# Strict evaluation #", Y_test_c, Y_pred_c)
    _print_f1_report("# Not strict evaluation #", Y_test_c_light, Y_pred_c_light,
                     show_counts=True)

In [25]:
run_baseline()


# Strict evaluation #
F1 for S-ORG == 0.0, with 296 entities
F1 for B-ORG == 0.0, with 215 entities
F1 for E-ORG == 0.0, with 196 entities
F1 for B-MISC == 0.0, with 42 entities
F1 for E-MISC == 0.0, with 32 entities
F1 for S-LOC == 0.0, with 148 entities
F1 for B-PER == 0.0, with 25 entities
F1 for E-PER == 0.0, with 23 entities
F1 for S-PER == 0.0, with 20 entities
F1 for I-ORG == 0.0, with 93 entities
F1 for I-PER == 0.0, with 2 entities
F1 for S-MISC == 0.0, with 62 entities
F1 for I-MISC == 0.0, with 22 entities
F1 for B-LOC == 0.0, with 13 entities
F1 for E-LOC == 0.0, with 13 entities
F1 for I-LOC == 0.0, with 1 entities
Weighted Score: 0.0

# Not strict evaluation #
Counter({'ORG': 800, 'LOC': 175, 'MISC': 158, 'PER': 70, 'O': 22})
F1 for ORG == 0.0, with 800 entities
F1 for MISC == 0.0, with 158 entities
F1 for LOC == 0.0, with 175 entities
F1 for PER == 0.0, with 70 entities
Weighted Score: 0.0


In [26]:
run_baseline(RandomForestClassifier())


# Strict evaluation #
F1 for S-ORG == 0.0, with 296 entities
F1 for B-ORG == 0.0, with 215 entities
F1 for E-ORG == 0.0, with 196 entities
F1 for B-MISC == 0.0, with 42 entities
F1 for E-MISC == 0.0, with 32 entities
F1 for S-LOC == 0.0, with 148 entities
F1 for B-PER == 0.0, with 25 entities
F1 for E-PER == 0.0, with 23 entities
F1 for S-PER == 0.0, with 20 entities
F1 for I-ORG == 0.0, with 93 entities
F1 for I-PER == 0.0, with 2 entities
F1 for S-MISC == 0.0, with 62 entities
F1 for I-MISC == 0.0, with 22 entities
F1 for B-LOC == 0.0, with 13 entities
F1 for E-LOC == 0.0, with 13 entities
F1 for I-LOC == 0.0, with 1 entities
Weighted Score: 0.0

# Not strict evaluation #
Counter({'ORG': 800, 'LOC': 175, 'MISC': 158, 'PER': 70, 'O': 28})
F1 for ORG == 0.0, with 800 entities
F1 for MISC == 0.0, with 158 entities
F1 for LOC == 0.0, with 175 entities
F1 for PER == 0.0, with 70 entities
Weighted Score: 0.0


In [27]:
run_baseline(LinearSVC())


# Strict evaluation #
F1 for S-ORG == 0.0, with 296 entities
F1 for B-ORG == 0.0, with 215 entities
F1 for E-ORG == 0.0, with 196 entities
F1 for B-MISC == 0.0, with 42 entities
F1 for E-MISC == 0.0, with 32 entities
F1 for S-LOC == 0.0, with 148 entities
F1 for B-PER == 0.0, with 25 entities
F1 for E-PER == 0.0, with 23 entities
F1 for S-PER == 0.0, with 20 entities
F1 for I-ORG == 0.0, with 93 entities
F1 for I-PER == 0.0, with 2 entities
F1 for S-MISC == 0.0, with 62 entities
F1 for I-MISC == 0.0, with 22 entities
F1 for B-LOC == 0.0, with 13 entities
F1 for E-LOC == 0.0, with 13 entities
F1 for I-LOC == 0.0, with 1 entities
Weighted Score: 0.0

# Not strict evaluation #
Counter({'ORG': 800, 'LOC': 175, 'MISC': 158, 'O': 99, 'PER': 70})
F1 for ORG == 0.0, with 800 entities
F1 for MISC == 0.0, with 158 entities
F1 for LOC == 0.0, with 175 entities
F1 for PER == 0.0, with 70 entities
Weighted Score: 0.0


In [28]:
run_baseline(GradientBoostingClassifier())


# Strict evaluation #
F1 for S-ORG == 0.0, with 296 entities
F1 for B-ORG == 0.0, with 215 entities
F1 for E-ORG == 0.0, with 196 entities
F1 for B-MISC == 0.0, with 42 entities
F1 for E-MISC == 0.0, with 32 entities
F1 for S-LOC == 0.0, with 148 entities
F1 for B-PER == 0.0, with 25 entities
F1 for E-PER == 0.0, with 23 entities
F1 for S-PER == 0.0, with 20 entities
F1 for I-ORG == 0.0, with 93 entities
F1 for I-PER == 0.0, with 2 entities
F1 for S-MISC == 0.0, with 62 entities
F1 for I-MISC == 0.0, with 22 entities
F1 for B-LOC == 0.0, with 13 entities
F1 for E-LOC == 0.0, with 13 entities
F1 for I-LOC == 0.0, with 1 entities
Weighted Score: 0.0

# Not strict evaluation #
Counter({'ORG': 800, 'LOC': 175, 'MISC': 158, 'PER': 70, 'O': 65})
F1 for ORG == 0.0, with 800 entities
F1 for MISC == 0.0, with 158 entities
F1 for LOC == 0.0, with 175 entities
F1 for PER == 0.0, with 70 entities
Weighted Score: 0.0
