In [1]:
import os
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import ParameterGrid
import datetime as dt

from generator import Generator
from corpus import ConllCorpusReaderX
from estimator import Estimator

import warnings
warnings.filterwarnings('ignore')

TRAINSET_PATH = "./conll_trainset.npz"
TESTSETA_PATH = "./conll_testseta.npz"
TESTSETB_PATH = "./conll_testsetb.npz"

In [2]:
conll_trainset = ConllCorpusReaderX('./conll2003_dataset', 
                              fileids='eng.train.txt', 
                              columntypes=('words', 'pos', 'chunk', 'ne'))

conll_testseta = ConllCorpusReaderX('./conll2003_dataset', 
                              fileids='eng.testa.dev.txt', 
                              columntypes=('words', 'pos', 'chunk', 'ne'))

conll_testsetb = ConllCorpusReaderX('./conll2003_dataset', 
                              fileids='eng.testb.test.txt', 
                              columntypes=('words', 'pos', 'chunk', 'ne'))

In [3]:
gen = Generator(column_types=['WORD', 'POS', 'CHUNK'], context_len=2, language='en')

Y_train = [el[1] for el in conll_trainset.get_ne()]
Y_testa = [el[1] for el in conll_testseta.get_ne()] 
Y_testb = [el[1] for el in conll_testsetb.get_ne()] 

X_train = gen.fit_transform(conll_trainset.get_tags(tags=['words', 'pos', 'chunk']), Y_train, path=TRAINSET_PATH)
X_testa = gen.transform(conll_testseta.get_tags(tags=['words', 'pos', 'chunk']), path=TESTSETA_PATH)
X_testb = gen.transform(conll_testsetb.get_tags(tags=['words', 'pos', 'chunk']), path=TESTSETB_PATH)

In [4]:
class Label2IdX:
    def __init__(self):
        self.data = {}
        self.index = 0
        
    def get(self, label):
        if label in self.data:
            return self.data[label]
        else:
            self.data[label] = self.index
            self.index += 1
            return self.data[label]

In [5]:
def get_best_parameters(clf_class, tuned_parameters_grid):
    
    best_total_F = 0
    best_parameters = None
    
    # создаем хранитель индексов
    label2idx = Label2IdX()
    
    # преобразуем набор ответов
    Y_testb_sent = []
    
    index = 0
    for sent in conll_testsetb.sents():
        length = len(sent)
        Y_testb_sent.append([label2idx.get(el) for el in Y_testb[index:index+length]])
        index += length
    
    # тестируем
    for parameters in ParameterGrid(tuned_parameters_grid):
        clf = clf_class()
        clf.set_params(**parameters)
    
        clf.fit(X_train, Y_train)
        Y_predb = clf.predict(X_testb)

        # преобразуем данные для оценки
        Y_predb_sent = []
        
        index = 0
        for sent in conll_testsetb.sents():
            length = len(sent)
            Y_predb_sent.append([label2idx.get(el) for el in Y_predb[index:index+length]])
            index += length

        F_arr = []
        weight_arr = []

        labels = ["PER", "ORG", "LOC", "MISC"]
        for label in labels:
            estimator = Estimator(Y_predb_sent, Y_testb_sent, label, labels, label2idx)
            F = estimator.compute_proper_f1()
            F_arr.append(F)
            weight = estimator.get_weight()
            weight_arr.append(weight)

        total_F = sum([F * weight for F, weight in zip(F_arr, weight_arr)]) / sum(weight_arr)
        
        if total_F > best_total_F:
            best_total_F = total_F
            best_parameters = parameters
        
    print("BEST RESULT: {}".format(best_total_F))
    print("WITH PARAMETERS:")
    print(best_parameters)

In [6]:
print(dt.datetime.now())

tuned_parameters = [
    {
        "C": [1.4, 1.2, 1.0],
        "max_iter": [100, 200, 500],
        "tol": [5e-5, 1e-4, 1e-3],
        "n_jobs": [4]
    }
]

get_best_parameters(LogisticRegression, tuned_parameters)
print(dt.datetime.now())

2017-12-19 21:59:14.839948
BEST RESULT: 0.7202792475799831
WITH PARAMETERS:
{'C': 1.4, 'max_iter': 100, 'n_jobs': 4, 'tol': 0.0001}
2017-12-19 23:18:27.423938


In [7]:
print(dt.datetime.now())

tuned_parameters = [
    {
        "n_estimators": [100, 300, 800, 1000],
        "min_sample_leaf": [5, 10, 15],
        "max_features": ["sqrt", "log2", 50.0, 20.0, 80.0],
        "n_jobs": [4]
    }
]

get_best_parameters(RandomForestClassifier, tuned_parameters)
print(dt.datetime.now())

2017-12-19 23:18:27.484482


ValueError: Invalid parameter min_sample_leaf for estimator RandomForestClassifier. Check the list of available parameters with `estimator.get_params().keys()`.