In [1]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import optuna

In [2]:
sys.path.append('./bert')

In [3]:
import tokenization

In [4]:
vocab_path = "multilingual_L-12_H-768_A-12/vocab.txt"
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=True)

In [5]:
# read tsv
# テキストはBERTのトークナイザを用いて下処理
def read_tsv(fname):
    data = {"data": [], "target": []}
    with open(fname) as f:
        for line in f:
            row = line.split('\t')
            label = int(row[1])
            sent = row[3]
            sent = " ".join(tokenizer.tokenize(sent))
            data["data"].append(sent)
            data["target"].append(label)
    return data

In [6]:
# 語彙からTF-IDFに基づき特徴ベクトルを生成する
vectorrizer = TfidfVectorizer()
train_data = read_tsv('train.tsv')
vectorrizer.fit(train_data['data'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
# 訓練用データをベクトル化
X = vectorrizer.transform(train_data["data"])
y = train_data["target"]

# Optunaの最適化を考える
## MultinominalNBのHP
1. alpha: 0-1.0
2. fit_prior: True/False

In [13]:
# testデータ
test_data = read_tsv("dev.tsv")
X_test = vectorrizer.transform(test_data["data"])
y_test = test_data["target"]
#clf.score(X_test, y_test)

In [14]:
def objective(trial):
    # 
    alpha = trial.suggest_uniform('alpha', 0.0, 1.0)
    fit_p = trial.suggest_categorical('fit_prior', ['True', 'False'])
    clf = MultinomialNB(alpha=alpha, fit_prior=fit_p)
    clf.fit(X, y)
    #acc = clf.score(X, y)
    acc = clf.score(X_test, y_test)
    return 1.0 - acc

In [15]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)
print(study.best_trial)

[I 2018-12-07 15:23:06,420] Finished a trial resulted in value: 0.21468926553672318. Current best value is 0.21468926553672318 with parameters: {'alpha': 0.17736093854970547, 'fit_prior': 'True'}.
[I 2018-12-07 15:23:06,427] Finished a trial resulted in value: 0.23163841807909602. Current best value is 0.21468926553672318 with parameters: {'alpha': 0.17736093854970547, 'fit_prior': 'True'}.
[I 2018-12-07 15:23:06,434] Finished a trial resulted in value: 0.24293785310734461. Current best value is 0.21468926553672318 with parameters: {'alpha': 0.17736093854970547, 'fit_prior': 'True'}.
[I 2018-12-07 15:23:06,442] Finished a trial resulted in value: 0.24293785310734461. Current best value is 0.21468926553672318 with parameters: {'alpha': 0.17736093854970547, 'fit_prior': 'True'}.
[I 2018-12-07 15:23:06,454] Finished a trial resulted in value: 0.23728813559322037. Current best value is 0.21468926553672318 with parameters: {'alpha': 0.17736093854970547, 'fit_prior': 'True'}.
[I 2018-12-07 1

[I 2018-12-07 15:23:07,097] Finished a trial resulted in value: 0.23728813559322037. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:07,116] Finished a trial resulted in value: 0.21468926553672318. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:07,136] Finished a trial resulted in value: 0.20903954802259883. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:07,165] Finished a trial resulted in value: 0.20903954802259883. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:07,190] Finished a trial resulted in value: 0.21468926553672318. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 20

[I 2018-12-07 15:23:08,147] Finished a trial resulted in value: 0.20903954802259883. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:08,170] Finished a trial resulted in value: 0.20903954802259883. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:08,196] Finished a trial resulted in value: 0.21468926553672318. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:08,224] Finished a trial resulted in value: 0.20903954802259883. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 2018-12-07 15:23:08,247] Finished a trial resulted in value: 0.22033898305084743. Current best value is 0.20903954802259883 with parameters: {'alpha': 0.011355165226968666, 'fit_prior': 'False'}.
[I 20

FrozenTrial(trial_id=10, state=<TrialState.COMPLETE: 1>, value=0.20903954802259883, datetime_start=datetime.datetime(2018, 12, 7, 15, 23, 6, 527028), datetime_complete=datetime.datetime(2018, 12, 7, 15, 23, 6, 537695), params={'alpha': 0.011355165226968666, 'fit_prior': 'False'}, user_attrs={}, system_attrs={}, intermediate_values={}, params_in_internal_repr={'alpha': 0.011355165226968666, 'fit_prior': 1})


In [17]:
1.0 - study.best_value

0.7909604519774012