# 機械学習によるテキスト分類

## ライブラリ

In [None]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from transformers import BertJapaneseTokenizer, BertModel
import torch
import mojimoji
import re
import string

## データの読み込み

In [None]:
def load_dataset(dataset_tsv_path):
    """Read a header-less, two-column TSV file of texts and labels.

    Args:
        dataset_tsv_path: Path (or any source accepted by
            ``pandas.read_table``) of a tab-separated file whose first
            column is the text and whose second column is the label.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays taken from the
        ``TEXT`` and ``LABEL`` columns respectively.
    """
    column_names = ("TEXT", "LABEL")
    frame = pandas.read_table(dataset_tsv_path, names=column_names)
    texts = frame['TEXT'].values
    labels = frame['LABEL'].values
    return texts, labels

In [None]:
# Load the training and test sets as (texts, labels) array pairs.
x_train, y_train = load_dataset('./data/train.tsv')
x_test, y_test = load_dataset('./data/test.tsv')
# Hold out 20% of the training set as a development split; the fixed
# random_state makes the split reproducible.
x_train_val, x_test_val, y_train_val, y_test_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1224)

## 前処理

In [None]:
def preprocessing_text(text):
    """Normalize a raw text before tokenization / vectorization.

    Steps, in order:
      1. Convert half-width characters to full-width with mojimoji.
      2. Remove carriage returns, newlines, full-width and half-width spaces.
      3. Remove half-width commas.
      4. Collapse every run of digits (half- or full-width) into a single "0".
      5. Replace each ASCII punctuation character other than "." and ","
         with a half-width space.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    # Unify half-width / full-width forms.
    text = mojimoji.han_to_zen(text)

    # Strip line breaks and both kinds of spaces.
    text = re.sub('\r', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('　', '', text)
    text = re.sub(' ', '', text)
    # Optional: drop half-width commas as well.
    text = re.sub(',', '', text)

    # Replace every run of digits with "0". Spaces were stripped above, so
    # the space inside the character class cannot match anything here.
    text = re.sub(r'[0-9 ０-９]+', '0', text)  # digits

    # Replace symbols other than comma and period with a space.
    # NOTE(review): han_to_zen may already have widened ASCII punctuation, in
    # which case this step finds nothing to replace — confirm the intended order.
    symbol_to_space = str.maketrans(
        {p: " " for p in string.punctuation if p not in (".", ",")}
    )
    text = text.translate(symbol_to_space)

    # The return sits after the whole replacement step. It used to be inside
    # the per-character loop, which ended the function after the first symbol.
    return text

## 形態素解析（Bag of Words・TF-IDFを使う場合）

In [None]:
import MeCab
"""
    *初期
    mecabrc:(デフォルト)
    -Ochasen:(ChaSen 互換形式)
    -Owakati:(分かち書きのみを出力)
    -Oyomi:(読みのみを出力)

    *自分の環境の辞書も使える
    -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd:neologd辞書
    """
def mecab_tokenize(text):
    """Split a text into a list of surface tokens with MeCab.

    The tagger is created with ``-Owakati`` on the first call and then
    reused. Building a new ``MeCab.Tagger`` for every document is
    unnecessary work when this function is used as a vectorizer tokenizer.

    To use another dictionary (e.g. mecab-ipadic-neologd), add the matching
    ``-d <dictionary dir>`` option to the Tagger arguments below.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings (the wakati output split on whitespace).
    """
    tagger = getattr(mecab_tokenize, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-Owakati")
        # Cache on the function object so no extra module-level name is needed.
        mecab_tokenize._tagger = tagger
    return tagger.parse(text).split()

### Bag of Words

In [None]:
# Bag-of-Words features: term counts over MeCab tokens, with
# preprocessing_text supplied as the vectorizer's preprocessor.
vectorizer=CountVectorizer(tokenizer=mecab_tokenize, preprocessor=preprocessing_text)

# Development split: fit on the training part, then transform the held-out part.
x_train_val_vec = vectorizer.fit_transform(x_train_val)
x_test_val_vec = vectorizer.transform(x_test_val)

# Full data: the same vectorizer object is re-fitted on all of x_train, so
# after this point it no longer corresponds to the development matrices above.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

### TF-IDF

In [None]:
# TF-IDF features: same tokenizer and preprocessor as the Bag-of-Words cell.
# This cell assigns the same variable names, so whichever feature cell runs
# last decides what the training cells see.
vectorizer=TfidfVectorizer(tokenizer=mecab_tokenize, preprocessor=preprocessing_text)

# Development split: fit on the training part, then transform the held-out part.
x_train_val_vec = vectorizer.fit_transform(x_train_val)
x_test_val_vec = vectorizer.transform(x_test_val)

# Full data: re-fit the same vectorizer object on all of x_train.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

### BERTの単語の分散表現

In [None]:
# Load the pretrained Japanese BERT tokenizer and encoder from the same checkpoint.
bert_tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
bert_model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
def bert_vectorize_single(text):
    """Embed one text as a fixed-size vector using the pretrained BERT model.

    The text is tokenized with ``bert_tokenizer`` (padded / truncated to 128
    tokens) and run through ``bert_model``. The hidden state at the first
    token position of the last layer is returned as the text's feature vector.

    Args:
        text: The (already preprocessed) string to embed.

    Returns:
        A list of floats: the last-layer hidden state at token position 0.
    """
    max_length = 128
    encoding = bert_tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # Feature extraction only: disable gradient tracking so each call does not
    # build and hold an autograd graph for the whole forward pass.
    with torch.no_grad():
        output = bert_model(**encoding)
    # [0] selects the single sequence in the batch, [0] its first token.
    return output.last_hidden_state[0][0].tolist()

In [None]:
# BERT features: each text is preprocessed and embedded one at a time.
# Development split (x_train_val / x_test_val were split off x_train above,
# so those texts are embedded again in the full-data pass below).
x_train_val_vec = [ bert_vectorize_single(preprocessing_text(x)) for x in x_train_val]
x_test_val_vec = [ bert_vectorize_single(preprocessing_text(x)) for x in x_test_val ]

# Full training set and the test set.
x_train_vec = [ bert_vectorize_single(preprocessing_text(x)) for x in x_train]
x_test_vec = [ bert_vectorize_single(preprocessing_text(x)) for x in x_test ]

## 学習・検証

### 開発データ

In [None]:
from numpy import count_nonzero
import pickle

# Directory name for dumped models (not used within this cell).
DUMP_DIRNAME = './data/model'

# Classifier evaluated on the development split; LinearSVC is kept as a
# commented-out alternative.
model = LogisticRegression(solver='liblinear')
#model = LinearSVC()

# Train on the training part of the development split.
model.fit(x_train_val_vec, y_train_val)

# Predict the held-out part and report per-class precision / recall / F1.
y_pred=model.predict(x_test_val_vec)

print(classification_report(y_test_val, y_pred))

#### ハイパーパラメータの決定

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
# SVM alternative (disabled): to use it, move this grid and estimator out of
# the string below and comment out the LogisticRegression section.
"""
tuned_parameters = [
    {'kernel': ['linear','rbf'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,100.0,1000.0], 
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,100.0,1000.0]}
    ]
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='f1' ) 
"""
# LogisticRegression: search the regularization strength C with the liblinear
# solver. The grid keys must be parameter names of the estimator passed to
# GridSearchCV, so the key is 'solver' and the estimator is LogisticRegression.
tuned_parameters = [
    {'solver': ['liblinear'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,100.0,1000.0]}
    ]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, scoring='f1')

# 5-fold cross-validated search on the training part of the development split.
clf.fit(x_train_val_vec, y_train_val)

print(clf.best_params_)

### 全データ

In [None]:
from numpy import count_nonzero
from tqdm import tqdm
import pickle
from os import makedirs, path

# Directory name for dumped models (not used within this cell).
DUMP_DIRNAME = './data/model'

# Final classifier; the commented-out lines are alternative estimators.
model = LogisticRegression(C = 1, solver='liblinear')
#model = LinearSVC( )
#model = SVC(C=1.0, gamma=0.001, kernel='rbf')

# Train on the full training set.
model.fit(x_train_vec, y_train)

# Hard class predictions for the test set, used by the evaluation cells.
y_pred=model.predict(x_test_vec)

## 評価

In [None]:
# Class labels, in the row / column order used for the confusion matrix.
labels = [0,1]
cm = confusion_matrix(y_test, y_pred, labels=labels)
# Show the matrix as a table with two-level "Actual" / "Predicted" headers.
# NOTE(review): display is not imported in this file — presumably the
# IPython/Jupyter built-in; confirm before running as a plain script.
display(pandas.DataFrame(cm,
    columns=[["Predicted"] * len(labels), labels],
    index=[["Actual"] * len(labels), labels])
)
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

### ROC・AUC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1
# predictions (y_pred) gives a curve with a single operating point and an AUC
# that only reflects that one threshold. decision_function is available on
# LogisticRegression, LinearSVC and SVC, the estimators used in this notebook.
y_score = model.decision_function(x_test_vec)

fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.axes().set_aspect("equal")
#plt.plot(fpr, tpr,marker=".")
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid()
#plt.savefig(path_result+"roc_curve.png")
auc=roc_auc_score(y_test, y_score)
print("AUC:{}".format(auc))

"""
with open("{}auc_f.txt".format(path_result),"a",encoding="utf-8") as f:
    f.write("roc_curve, AUC:{}\n".format(auc))
    f.close()
"""

In [None]:
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt

# The precision-recall curve is traced by sweeping a threshold over continuous
# scores. Hard 0/1 predictions (y_pred) would leave a single operating point,
# so use the classifier's decision scores instead.
y_score = model.decision_function(x_test_vec)

precision, recall, thresholds = precision_recall_curve(y_test, y_score)
#plt.plot(fpr, tpr,marker=".")
plt.axes().set_aspect("equal")
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
#plt.savefig(path_result+"precision_recall.png")
# Area under the precision-recall curve (trapezoidal rule over recall).
pr_auc=auc(recall, precision)
print("AUC:{}".format(pr_auc))
"""
with open("{}auc_f.txt".format(path_result),"a",encoding="utf-8") as f:
    f.write("precision_recall, AUC:{}\n".format(pr_auc))
    f.close()
"""

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Scalar metrics on the test set, computed from the hard predictions y_pred:
# accuracy, precision, recall and F1, one per line.
print("正解率（すべてのサンプルのうち正解したサンプルの割合）={}".format((accuracy_score(y_test, y_pred))))
print("適合率（positiveと予測された中で実際にpositiveだった確率）={}".format((precision_score(y_test, y_pred))))
print("再現率（positiveなデータに対してpositiveと予測された確率）={}".format((recall_score(y_test, y_pred))))
print("F1（適合率と再現率の調和平均）={}".format((f1_score(y_test, y_pred))))

# Disabled: the string below holds code that would append the same metrics to
# a results file; path_result is not defined anywhere in the visible source.
"""
with open("{}auc_f.txt".format(path_result),"a",encoding="utf-8") as f:
    f.write("正解率（すべてのサンプルのうち正解したサンプルの割合）={}".format((accuracy_score(y_test, y_pred))))
    f.write("適合率（positiveと予測された中で実際にpositiveだった確率）={}".format((precision_score(y_test, y_pred))))
    f.write("再現率（positiveなデータに対してpositiveと予測された確率）={}".format((recall_score(y_test, y_pred))))
    f.write("F1（適合率と再現率の調和平均）={}".format((f1_score(y_test, y_pred))))
"""

### モデルの保存

In [None]:
import pickle
from os import makedirs, path

# Destination for the pickled classifier.
dirname = './data/model'
filename = 'bert_model.pickle'
# Create the target directory if needed; no error if it already exists.
makedirs(dirname, exist_ok=True)
# Serialize the trained classifier held in the global `model` to disk.
with open(path.join(dirname, filename), mode='wb') as f:
    pickle.dump(model, f)