In [1]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Put the ./bert directory on the import path so the `tokenization` module
# imported in the next cell can be found. The membership check keeps the cell
# idempotent: re-running it does not append duplicate entries to sys.path.
if './bert' not in sys.path:
    sys.path.append('./bert')

In [3]:
import tokenization

In [4]:
# Vocabulary file used to build the BERT tokenizer; input is lower-cased
# by the tokenizer (do_lower_case=True).
vocab_path = "multilingual_L-12_H-768_A-12/vocab.txt"
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_path,
    do_lower_case=True,
)

In [5]:
# Read a TSV file.
# The text is preprocessed with the BERT tokenizer.
def read_tsv(fname, tokenize=None, encoding="utf-8"):
    """Load labelled sentences from a tab-separated file.

    For every non-empty line, column 1 (0-based) is parsed as the integer
    label and column 3 is taken as the sentence. The sentence is tokenized
    and its tokens are re-joined with single spaces, so downstream
    whitespace-based vectorizers see one token per word.

    Args:
        fname: Path of the TSV file to read.
        tokenize: Callable mapping a string to a sequence of string tokens.
            Defaults to ``tokenizer.tokenize`` of the module-level BERT
            tokenizer, which is the original behaviour.
        encoding: Text encoding used to open the file. Passed explicitly so
            the result does not depend on the platform's default locale.

    Returns:
        dict with two parallel lists: ``"data"`` (space-joined token strings)
        and ``"target"`` (integer labels).

    Raises:
        IndexError: If a non-empty row has fewer than four columns.
        ValueError: If the label column cannot be parsed as an integer.
    """
    if tokenize is None:
        # Resolved at call time so the module-level tokenizer is still used
        # when the caller does not supply one.
        tokenize = tokenizer.tokenize

    data = {"data": [], "target": []}
    with open(fname, encoding=encoding) as f:
        for line in f:
            # Drop the line terminator so it never leaks into the last column.
            line = line.rstrip("\r\n")
            if not line:
                # Skip empty lines (e.g. a trailing newline at end of file)
                # instead of failing on row[1].
                continue
            row = line.split('\t')
            label = int(row[1])
            sent = " ".join(tokenize(row[3]))
            data["data"].append(sent)
            data["target"].append(label)
    return data

In [6]:
# Build TF-IDF feature vectors from the vocabulary of the training data.
# NOTE(review): the estimator repr printed below shows the default
# token_pattern '(?u)\b\w\w+\b', which only matches tokens of two or more
# word characters -- confirm that ignoring single-character tokens from the
# tokenizer output is intended.
train_data = read_tsv('train.tsv')
vectorrizer = TfidfVectorizer()
vectorrizer.fit(train_data['data'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
# Vectorize the training data with the fitted TF-IDF vectorizer.
y = train_data["target"]
X = vectorrizer.transform(train_data["data"])

In [8]:
# Multinomial naive Bayes classifier, fitted on the training features.
clf = MultinomialNB()
clf.fit(X=X, y=y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
# Accuracy on the training data.
clf.score(X=X, y=y)

0.8783783783783784

In [10]:
# Accuracy on the test data (read from dev.tsv), vectorized with the
# vectorizer that was fitted on the training data.
test_data = read_tsv("dev.tsv")
y_test = test_data["target"]
X_test = vectorrizer.transform(test_data["data"])
clf.score(X_test, y_test)

0.7570621468926554