In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

import generator
import corpus
import estimator

import warnings

warnings.filterwarnings('ignore')

TRAINSET_PATH = "./prepared_data/conll_trainset.npz"
TESTSETA_PATH = "./prepared_data/conll_testseta.npz"
TESTSETB_PATH = "./prepared_data/conll_testsetb.npz"

In [2]:
conll_trainset = corpus.ConllDataReader('./dataset',
                                        fileids='eng.train.txt',
                                        columntypes=('words', 'pos', 'chunk', 'ne'))

conll_testseta = corpus.ConllDataReader('./dataset',
                                        fileids='eng.testa.dev.txt',
                                        columntypes=('words', 'pos', 'chunk', 'ne'))

conll_testsetb = corpus.ConllDataReader('./dataset',
                                        fileids='eng.testb.test.txt',
                                        columntypes=('words', 'pos', 'chunk', 'ne'))

In [3]:
gen = generator.Generator(column_types=['WORD', 'POS', 'CHUNK'], context_len=2, language='en')

Y_train = [el[1] for el in conll_trainset.get_ne()]
Y_testa = [el[1] for el in conll_testseta.get_ne()]
Y_testb = [el[1] for el in conll_testsetb.get_ne()]

X_train = gen.fit_transform(conll_trainset.get_tags(tags=['words', 'pos', 'chunk']), Y_train, path=TRAINSET_PATH)
X_testa = gen.transform(conll_testseta.get_tags(tags=['words', 'pos', 'chunk']), path=TESTSETA_PATH)
X_testb = gen.transform(conll_testsetb.get_tags(tags=['words', 'pos', 'chunk']), path=TESTSETB_PATH)

In [4]:
class LabelEncoder:
    def __init__(self):
        self.data = {}
        self.index = 0
        
    def get(self, label):
        if label in self.data:
            return self.data[label]
        else:
            self.data[label] = self.index
            self.index += 1
            return self.data[label]

In [7]:
def get_baseline_results(clf):
    # делаем предсказание
    clf.fit(X_train, Y_train)
    y_preda = clf.predict(X_testa)

    # преобразуем данные для оценки
    label_encoder = LabelEncoder()
    y_preda_sent = []
    y_testa_sent = []

    index = 0
    for sent in conll_testsetb.sents():
        length = len(sent)
        y_preda_sent.append([label_encoder.get(el) for el in y_preda[index:index + length]])
        y_testa_sent.append([label_encoder.get(el) for el in Y_testa[index:index + length]])
        index += length

    # производим оценку
    F_arr = []
    weight_arr = []

    labels = ["PER", "ORG", "LOC", "MISC"]
    for label in labels:
        est = estimator.Estimator(y_preda_sent, y_testa_sent, label, labels, label_encoder)
        F = est.compute_proper_f1()
        F_arr.append(F)
        weight = est.get_weight()
        weight_arr.append(weight)
        print("{} F1 = {}".format(label, F))

    TOTAL_F = sum([F * weight for F, weight in zip(F_arr, weight_arr)]) / sum(weight_arr)
    print("TOTAL F1 = {}".format(TOTAL_F))

In [8]:
get_baseline_results(LogisticRegression())

PER F1 = 0.7737569873492203
ORG F1 = 0.6463831108351444
LOC F1 = 0.7710309930423782
MISC F1 = 0.6930091185410334
TOTAL F1 = 0.7248597749775896


In [9]:
get_baseline_results(RandomForestClassifier())

PER F1 = 0.6914673752583407
ORG F1 = 0.5675182481751825
LOC F1 = 0.6845844918134075
MISC F1 = 0.5472868217054264
TOTAL F1 = 0.6343784981969739


In [10]:
get_baseline_results(LinearSVC())

PER F1 = 0.7715795654726952
ORG F1 = 0.6255016980549553
LOC F1 = 0.7621451104100946
MISC F1 = 0.6725403817914831
TOTAL F1 = 0.7128487186062856


In [12]:
get_baseline_results(GradientBoostingClassifier())

PER F1 = 0.6666666666666667
ORG F1 = 0.5640000000000001
LOC F1 = 0.67871259175607
MISC F1 = 0.6511254019292604
TOTAL F1 = 0.6376600240997772
