[View in Colaboratory](https://colab.research.google.com/github/jkhnn/Novelist-Classification/blob/master/%E4%BD%9C%E8%80%85%E5%88%86%E9%A1%9E.ipynb)

In [0]:
#参考・引用

# 青空文庫で作者っぽさ判定（KERAS　＋　character-level cnn）　
# https://qiita.com/cvusk/items/c1342dd0fff16dc37ddf
  
# character-level CNNでクリスマスを生き抜く
# https://qiita.com/bokeneko/items/c0f0ce60a998304400c8
  
# Character-level Convolutional Networks for Text Classification
# https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf

In [0]:
# Download the training, label (master.csv) and test data
!git clone https://github.com/jkhnn/Novelist-Classification.git
  
# Install the module used for text normalization during preprocessing
!pip install neologdn

# Install MeCab together with the NEologd dictionary
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
!pip install mecab-python3 > /dev/null

# Install gensim (used below for Doc2Vec)
!pip install gensim

In [0]:
import neologdn
import glob
import os
import MeCab
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from keras.layers import Activation, Dense, Dropout, Flatten, Convolution2D, MaxPooling2D, Reshape, Input, merge
from keras.models import Model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, Callback, CSVLogger, ModelCheckpoint
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global seaborn theme. The font is presumably chosen so that Japanese glyphs
# render in plots -- TODO confirm it is installed in the runtime.
sns.set(font='Kozuka Gothic Pro', style="whitegrid")
# Directory prefix used to build the default model output path of train().
aozora_dir = "./"


def show_histgram(labels):
    """Draw a histogram of how many samples carry each label value.

    Args:
        labels: One label value per sample. As called in this file these are
            the integer author indices produced by get_val_and_label.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('samples num by authors')
    ax.set_ylabel('samples num')
    # The x axis shows author label indices; the former "height" caption was
    # unrelated to this data. Pass the axes explicitly instead of relying on
    # matplotlib's implicit "current axes".
    sns.distplot(labels, kde=False, rug=False, bins=25,
                 axlabel="author label", ax=ax)
  

def show_similarity(model, authors):
    """Tabulate the most similar document tags for every author tag.

    Args:
        model: Object exposing ``docvecs.most_similar`` (a trained Doc2Vec
            model in this file).
        authors: Iterable of author tags to query, one query per tag.

    Returns:
        A pandas DataFrame with one column per author; each column holds
        whatever ``most_similar`` returned for that author's tag.
    """
    neighbours_by_author = {
        author: model.docvecs.most_similar([author]) for author in authors
    }
    return pd.DataFrame.from_dict(neighbours_by_author)

def load_data(txt, max_length=200):
    """Encode each text as a fixed-length list of Unicode code points.

    Every item is converted with ``str()``, stripped of surrounding
    whitespace and mapped character by character through ``ord``. For
    example, a short Japanese sentence becomes something like
    ``[25991, 31456, 12391, ...]``. The result is cut to ``max_length``
    entries and right-padded with zeros when shorter.

    Args:
        txt: Iterable of texts (any objects accepted by ``str()``).
        max_length: Length of every encoded row.

    Returns:
        A list with one list of ints per input item.
    """
    encoded_rows = []
    for raw in txt:
        code_points = list(map(ord, str(raw).strip()))[:max_length]
        # A non-positive repeat count yields an empty list, so rows that
        # are already full get no padding.
        padding = [0] * (max_length - len(code_points))
        encoded_rows.append(code_points + padding)
    return encoded_rows
  
  
def create_model(dense_dim=9, dense_count=3, dropout=0.5, embed_size=128, max_length=200, filter_sizes=(2, 3, 4, 5), filter_num=64):
    """Build the (uncompiled) character-level CNN classifier.

    Architecture: character embedding -> one convolution + max-pool branch
    per entry of ``filter_sizes`` -> concatenation -> ``dense_count`` blocks
    of Dense(64)/BatchNormalization/Dropout -> softmax output.

    Args:
        dense_dim: Number of output classes.
        dense_count: Number of hidden Dense/BatchNormalization/Dropout blocks.
        dropout: Dropout rate used in every hidden block.
        embed_size: Size of each character embedding vector.
        max_length: Number of code points per input row.
        filter_sizes: Kernel heights (in characters), one branch per entry.
        filter_num: Number of convolution filters per branch.

    Returns:
        The Keras ``Model`` mapping a row of code points to class
        probabilities.
    """
    # NOTE(review): Convolution2D(n, rows, cols), merge(mode=...) and
    # Model(input=, output=) are legacy Keras call forms kept as-is so the
    # block stays compatible with the Keras version the notebook targets.
    inp = Input(shape=(max_length,))
    # Embedding table of 0xffff rows; inputs are raw code points from
    # load_data. NOTE(review): code points >= 0xffff would fall outside the
    # table -- confirm the corpus contains none.
    emb = Embedding(0xffff, embed_size)(inp)
    # Add a trailing channel axis so the 2-D convolution can be applied.
    emb_ex = Reshape((max_length, embed_size, 1))(emb)
    convs = []
    for filter_size in filter_sizes:
        # Kernel covers `filter_size` characters by the full embedding width.
        conv = Convolution2D(filter_num, filter_size, embed_size, activation="relu")(emb_ex)
        # Pool over every remaining position of this branch.
        pool = MaxPooling2D(pool_size=(max_length - filter_size + 1, 1))(conv)
        convs.append(pool)
    convs_merged = merge(convs, mode='concat')
    obj = Reshape((filter_num * len(filter_sizes),))(convs_merged)
    for _ in range(dense_count):
        obj = Dense(64, activation="relu")(obj)
        obj = BatchNormalization()(obj)
        obj = Dropout(dropout)(obj)
    # Single-label multi-class problem trained with categorical_crossentropy
    # on one-hot targets: the output must be a probability distribution, so
    # use softmax rather than independent per-class sigmoids.
    fc3 = Dense(dense_dim, activation="softmax")(obj)
    model = Model(input=inp, output=fc3)
    return model

  
def train(inputs, targets, filter_num=64, dense_count=3, dropout=0.5, filter_sizes=(2, 3, 4, 5), dense_dim=9, batch_size=100, epoch_count=10, 
    max_length=200, model_filepath=aozora_dir + "model.h5", learning_rate=0.001):
    """Build, compile and fit the character-level CNN, then save it.

    Args:
        inputs: Encoded texts; as called in this file, an array built from
            the rows returned by load_data.
        targets: One-hot encoded labels matching ``inputs``.
        filter_num, dense_count, dropout, filter_sizes, dense_dim, max_length:
            Forwarded unchanged to create_model.
        batch_size: Mini-batch size passed to ``fit``.
        epoch_count: Number of training epochs.
        model_filepath: Where the final model is saved.
        learning_rate: Initial learning rate; it is decayed linearly to 1% of
            this value over the epochs.

    Returns:
        The fitted Keras model.

    Side effects:
        Writes a per-epoch CSV log to /tmp/clcnn_training.log, checkpoint
        files under /tmp, and the final model to ``model_filepath``.
    """
    # One learning rate per epoch, linearly spaced from start to stop.
    start = learning_rate
    stop = learning_rate * 0.01
    learning_rates = np.linspace(start, stop, epoch_count)

    model = create_model(max_length=max_length, filter_num=filter_num, dense_count=dense_count, dense_dim=dense_dim, filter_sizes=filter_sizes, dropout=dropout)
    # Pass the configured optimizer instance; previously it was created and
    # then discarded in favour of the string 'adam' with default settings.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()

    # Logging file for each epoch (read back by the show_training_* helpers)
    csv_logger_file = '/tmp/clcnn_training.log'

    # Checkpoint model for each epoch
    checkpoint_filepath = "/tmp/weights.{epoch:02d}-{loss:.2f}-{acc:.2f}-{val_loss:.2f}-{val_acc:.2f}.hdf5"

    # NOTE(review): `nb_epoch` and the 'acc'/'val_acc' metric names are the
    # spellings of the Keras version this notebook targets; kept unchanged.
    model.fit(inputs, targets,
              nb_epoch=epoch_count,
              batch_size=batch_size,
              verbose=1,
              validation_split=0.1,
              shuffle=True,
              callbacks=[
                  LearningRateScheduler(lambda epoch: learning_rates[epoch]),
                  CSVLogger(csv_logger_file),
                  ModelCheckpoint(filepath=checkpoint_filepath, verbose=1, 
                                  save_best_only=True, save_weights_only=False, monitor='val_acc')
              ])
    model.save(model_filepath)
    return model
      
def show_training_and_validation_loss():
    """Plot training and validation loss per epoch from the CSV training log.

    Reads /tmp/clcnn_training.log (the file written by the CSVLogger callback
    in train) and shows the figure; nothing is returned.
    """
    history = pd.read_csv("/tmp/clcnn_training.log")
    plt.clf()
    curves = (
        ("loss", 'bo', "Training loss"),
        ("val_loss", 'b', "Validation loss"),
    )
    for column, line_format, caption in curves:
        plt.plot(history["epoch"], history[column], line_format, label=caption)
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def show_training_and_validation_accuracy():
    """Plot training and validation accuracy per epoch from the CSV log.

    Reads /tmp/clcnn_training.log (the file written by the CSVLogger callback
    in train) and shows the figure; nothing is returned.
    """
    plt.clf()
    history = pd.read_csv("/tmp/clcnn_training.log")
    curves = (
        ("acc", 'bo', "Training acc"),
        ("val_acc", 'b', "Validation acc"),
    )
    for column, line_format, caption in curves:
        plt.plot(history["epoch"], history[column], line_format, label=caption)
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    
def get_stop_words(docs, limit_len=1):
    """Return the distinct items of ``docs`` seen more than ``limit_len`` times.

    Args:
        docs: Iterable of hashable items (text lines in this file).
        limit_len: Occurrence count an item must exceed to be reported.

    Returns:
        A list of the qualifying items, most frequent first.
    """
    frequencies = collections.Counter(docs)
    frequent_items = []
    for item, occurrences in frequencies.most_common():
        if occurrences > limit_len:
            frequent_items.append(item)
    return frequent_items


def remove_stop_words(docs, limit_len=1):
    """Drop every item that occurs more than ``limit_len`` times in ``docs``.

    An item counts as a stop word under the same rule get_stop_words uses:
    its number of occurrences is strictly greater than ``limit_len``.

    Args:
        docs: Iterable of hashable items (text lines in this file).
        limit_len: Highest occurrence count an item may have and still be
            kept. Previously this argument was accepted but ignored.

    Returns:
        A list of the distinct surviving items, in order of first appearance
        (deterministic, unlike the former ``list(set(...))`` result).
    """
    # Counter iterates in first-insertion order, which gives a stable,
    # de-duplicated result without depending on string hashing.
    counts = collections.Counter(docs)
    return [doc for doc, occurrences in counts.items() if occurrences <= limit_len]


def get_lines_by_path(file_path, if_normalize=True, stop_words_frequency=None):
    """Read a text file and return its non-empty lines.

    Trailing whitespace is stripped from every line and lines that are empty
    afterwards are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.
        if_normalize: When true, each kept line is passed through
            ``neologdn.normalize``.
        stop_words_frequency: When truthy, the collected lines are passed
            through ``remove_stop_words(lines, limit_len=stop_words_frequency)``.

    Returns:
        A list of strings.
    """
    lines = []
    # Decode explicitly as UTF-8 so reading Japanese text does not depend on
    # the platform's default locale encoding.
    with open(file_path, encoding="utf-8") as fd:
        for line in fd:
            val = line.rstrip()
            if not val:
                continue
            lines.append(neologdn.normalize(val) if if_normalize else val)
    if stop_words_frequency:
        lines = remove_stop_words(lines, limit_len=stop_words_frequency)
    return lines


def get_val_and_label(exclude_authors, train_data_size, if_normalize, stop_words_frequency=None, data_type="Train"):
    """Load the text lines of every listed author together with their labels.

    The index file ./Novelist-Classification/data/master.csv is read line by
    line; each line holds a quoted file name followed by a quoted author
    name. For every author that is not excluded, the lines of
    ./Novelist-Classification/data/<data_type>/<file name> are loaded through
    get_lines_by_path and truncated to ``train_data_size`` entries.

    Args:
        exclude_authors: Collection of author names to skip; may be None or
            empty to keep everyone.
        train_data_size: Maximum number of lines taken per author file.
        if_normalize: Forwarded to get_lines_by_path.
        stop_words_frequency: Forwarded to get_lines_by_path.
        data_type: Sub-directory to read from ("Train" or "Test" in this
            file).

    Returns:
        A tuple ``(values, labels, labels_authors)`` of equally long lists:
        the text lines, the integer class index of each line, and the author
        name of each line. Class indices are assigned consecutively from 0 to
        the authors that were not excluded, in index-file order.
    """
    root_path = "./Novelist-Classification/data/master.csv"
    values = []
    labels = []
    labels_authors = []
    index = 0
    # Decode explicitly as UTF-8: the author names are Japanese.
    with open(root_path, encoding="utf-8") as fd:
        for line in fd:
            row = line.rstrip()
            if not row:
                # A blank (e.g. trailing) line has no author column.
                continue
            vals = row.split(",")
            author = vals[1].replace('"', '')
            if exclude_authors and (author in exclude_authors):
                continue
            val_ = get_lines_by_path("./Novelist-Classification/data/" + data_type + "/" + vals[0].replace('"', ''),
                                     if_normalize=if_normalize, stop_words_frequency=stop_words_frequency)
            val_ = val_[:train_data_size]
            values.extend(val_)
            # One label entry per loaded line.
            labels.extend(np.full(len(val_), index))
            labels_authors.extend(np.full(len(val_), author))
            # Only non-excluded authors consume an index, keeping the class
            # indices contiguous.
            index += 1
    return values, labels, labels_authors

def execute_train_and_validation(exclude_authors, filter_num=64, dense_count=3, dense_dim=9, dropout=0.5, filter_sizes=(2, 3, 4, 5), epoch_count=10, batch_size=100,
                  train_data_size=10000000, max_length=200, if_normalize=True, 
                  learning_rate=0.001, stop_words_frequency=1):
    """Run one full experiment: load data, train, plot curves and evaluate.

    Args:
        exclude_authors: Author names to leave out of the experiment; None or
            an empty collection keeps every author.
        filter_num, dense_count, dropout, filter_sizes, epoch_count,
        batch_size, max_length, learning_rate: Forwarded to train.
        dense_dim: Total number of authors before exclusion; the model is
            built with ``dense_dim - len(exclude_authors)`` output classes.
        train_data_size: Maximum number of lines taken per author file.
        if_normalize: Forwarded to get_val_and_label.
        stop_words_frequency: Forwarded to get_val_and_label.

    Returns:
        None. Training curves are plotted and the test loss and accuracy are
        printed.
    """
    values_train, labels_train, _ = get_val_and_label(exclude_authors, train_data_size, if_normalize, stop_words_frequency, data_type="Train")
    values_test, labels_test, _ = get_val_and_label(exclude_authors, train_data_size, if_normalize, stop_words_frequency, data_type="Test")

    x_train = load_data(values_train, max_length=max_length)
    y_train = np_utils.to_categorical(labels_train)

    x_test = load_data(values_test, max_length=max_length)
    y_test = np_utils.to_categorical(labels_test)

    # get_val_and_label accepts None for "exclude nobody"; treat it the same
    # way here instead of failing on len(None).
    excluded_count = len(exclude_authors) if exclude_authors else 0

    model = train(np.array(x_train), y_train, filter_num=filter_num, dense_count=dense_count, dropout=dropout, filter_sizes=filter_sizes, learning_rate=learning_rate,
                  epoch_count=epoch_count, dense_dim=(dense_dim - excluded_count), batch_size=batch_size, max_length=max_length)

    show_training_and_validation_loss()
    show_training_and_validation_accuracy()
    score = model.evaluate(np.array(x_test), y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])


In [0]:
#作者一覧
!head -300 Novelist-Classification/data/master.csv

In [0]:
#"0001.txt","太宰治"
!head -50 Novelist-Classification/data/Train/0001.txt

In [0]:
#"0002.txt","森鴎外"
!head -300 Novelist-Classification/data/Train/0002.txt

In [0]:
#"0003.txt","坂口安吾"
!head -300 Novelist-Classification/data/Train/0003.txt

In [0]:
#"0004.txt","夏目漱石"
!head -300 Novelist-Classification/data/Train/0004.txt

In [0]:
#"0005.txt","宮沢賢治"
!head -300 Novelist-Classification/data/Train/0005.txt

In [0]:
#"0006.txt","与謝野晶子"
!head -300 Novelist-Classification/data/Train/0006.txt

In [0]:
#"0007.txt","梶井基次郎"
!head -300 Novelist-Classification/data/Train/0007.txt

In [0]:
#"0008.txt","芥川龍之介"
!head -300 Novelist-Classification/data/Train/0008.txt

In [0]:
#"0009.txt","北大路魯山人"
!head -300 Novelist-Classification/data/Train/0009.txt

In [0]:
# Use Doc2Vec to look at how similar the authors are to one another.
root_path = "./Novelist-Classification/data/master.csv"
docs = []
authors = []
texts = []
with open(root_path) as fd:
    for i, line in enumerate(fd):
      # Each index line holds a quoted file name followed by a quoted author name.
      vals = line.rstrip().split(",")
      author = vals[1].replace('"', '')
      # Normalized, non-empty lines of this author's training file.
      val = get_lines_by_path("./Novelist-Classification/data/Train/" + vals[0].replace('"', ''))
      authors.append(author)
      # NOTE(review): `val` is a list of whole lines, so every line is handed
      # to Doc2Vec as a single "word"; no tokenization is applied here.
      docs.append(TaggedDocument(words=val, tags=[author]))
      texts.append(val)
    
# One tagged document per author; the tag is the author name.
model = Doc2Vec(documents=docs, dm = 1, vector_size=200, window=8, min_count=1)
# Last expression of the cell: the resulting table is shown as cell output.
show_similarity(model, authors)

In [0]:
#ラベルごとのサンプル数を見る（最初にやるべきが、、）
# Inspect the number of samples per label, with no author excluded and at
# most 100000 lines per author file.
exclude_authors = None
values_train, labels_train, labels_authors_train  = get_val_and_label(exclude_authors, train_data_size=100000, if_normalize=True, stop_words_frequency=1, data_type="Train")
values_test, labels_test, labels_authors_test  = get_val_and_label(exclude_authors, train_data_size=100000, if_normalize=True, stop_words_frequency=1, data_type="Test")
# One histogram for the training labels, one for the test labels.
show_histgram(labels_train)
show_histgram(labels_test)

In [0]:
# Experiment: three authors excluded, 200 filters per branch, 1 hidden dense block, dropout 0.2.
execute_train_and_validation(exclude_authors=["与謝野晶子", "森鴎外", "宮沢賢治"], dropout=0.2, filter_num=200, dense_count=1, learning_rate=0.001, epoch_count=10)

In [0]:
# Experiment: three authors excluded, 256 filters per branch, 2 hidden dense blocks, dropout 0.2.
execute_train_and_validation(exclude_authors=["与謝野晶子", "森鴎外", "宮沢賢治"], dropout=0.2, filter_num=256, dense_count=2, learning_rate=0.001, epoch_count=10)

In [0]:
# Experiment: three authors excluded, 256 filters per branch, 1 hidden dense block, dropout 0.2.
execute_train_and_validation(exclude_authors=["与謝野晶子", "森鴎外", "宮沢賢治"], dropout=0.2, filter_num=256, dense_count=1, learning_rate=0.001, epoch_count=10)

In [0]:
# Experiment: three authors excluded, 128 filters per branch, 1 hidden dense block, dropout 0.2.
execute_train_and_validation(exclude_authors=["与謝野晶子", "森鴎外", "宮沢賢治"], dropout=0.2, filter_num=128, dense_count=1, learning_rate=0.001, epoch_count=10)

In [0]:
# Experiment: three authors excluded, 64 filters per branch, 2 hidden dense blocks, dropout 0.2.
execute_train_and_validation(exclude_authors=["与謝野晶子", "森鴎外", "宮沢賢治"], dropout=0.2, filter_num=64, dense_count=2, learning_rate=0.001, epoch_count=10)