In [1]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
sys.path.append('./bert')

In [3]:
import tokenization

In [4]:
vocab_path = "multilingual_L-12_H-768_A-12/vocab.txt"
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=True)

In [5]:
# read tsv
# テキストはBERTのトークナイザを用いて下処理
def read_tsv(fname):
    data = {"data": [], "target": []}
    with open(fname) as f:
        for line in f:
            row = line.split('\t')
            label = int(row[1])
            sent = row[3]
            sent = " ".join(tokenizer.tokenize(sent))
            data["data"].append(sent)
            data["target"].append(label)
    return data

def make_vectorizer(data):
    vec = TfidfVectorizer()
    vec.fit(data['data'])
    return vec

def make_data(vec, data):
    X = vec.transform(data["data"])
    y = data["target"]
    return X, y

def train_classifier(X, y):
    clf = MultinomialNB()
    clf.fit(X, y)
    return clf

In [6]:
data = read_tsv("swp/train.tsv")
vec = make_vectorizer(data)
X, y = make_data(vec, data)
clf = train_classifier(X, y)
print("Train accuracy: %f" % clf.score(X, y))

test = read_tsv("swp/dev.tsv")
X, y = make_data(vec, test)
print("Test accuracy: %f" % clf.score(X, y))

Train accuracy: 0.883935
Test accuracy: 0.780899


In [7]:
data = read_tsv("aozora/train.tsv")
vec = make_vectorizer(data)
X, y = make_data(vec, data)
clf = train_classifier(X, y)
print("Train accuracy: %f" % clf.score(X, y))

test = read_tsv("aozora/dev.tsv")
X, y = make_data(vec, test)
print("Test accuracy: %f" % clf.score(X, y))

Train accuracy: 0.903140
Test accuracy: 0.886192
