In [3]:
import zipfile
import os
import glob
import io
import re
import random

In [4]:
# Directories that are searched for each author's *.zip archives
akuta_dir = "akutagawa"
miya_dir = "miyazawa"
# Per-author TSV files produced by the extraction cells
akuta_tsv = "aozora-akutagawa.tsv"
miya_tsv = "aozora-miyazawa.tsv"

In [3]:
# Text preprocessing
def preprocess(file):
    """Yield the cleaned body lines of an Aozora-style text.

    The header is skipped by consuming input up to and including the second
    line that starts with '--------'. Body processing stops at the first line
    starting with '底本：'. Blank lines are dropped. Ruby readings,
    '［＃...］' annotations and control characters other than line feed are
    removed from every remaining line.

    Args:
        file: An iterable of text lines (an open text file, io.StringIO,
            a list of lines, ...).

    Yields:
        str: One cleaned line at a time. The trailing newline, if the input
        line had one, is kept.
    """
    # ｜base《reading》 : ruby with an explicit start marker for the base text
    kruby = re.compile('｜([^《]+)《[^》]+》')
    # 《reading》 : plain ruby reading
    ruby = re.compile('《[^》]+》')
    # ［＃...］ : editorial annotation
    annotate = re.compile('［＃[^］]+］')
    # Control characters except line feed
    nonpr = re.compile('[\x00-\x09\x0b-\x1f]+')

    # Use one shared iterator so the loops below continue where the previous
    # one stopped, even when `file` is a re-iterable such as a list.
    lines = iter(file)

    # Drop the header: skip until '--------' has been seen twice.
    for _ in range(2):
        for line in lines:
            if line.startswith('--------'):
                break

    # Process the body
    for line in lines:
        if line.startswith('底本：'):  # end of the body
            break
        if not line.strip():
            continue
        # The replacement must be a raw string: a plain '\1' is the control
        # character U+0001, which would discard the ruby base text.
        line = kruby.sub(r'\1', line)
        line = ruby.sub('', line)
        line = annotate.sub('', line)
        line = nonpr.sub('', line)
        yield line

In [4]:
# Conversion to a TSV record
def to_tsv(line, cat, num):
    """Build one tab-separated record from a text line.

    The record has four columns: ``cat``, ``num`` rendered with ``str()``,
    an empty column, and ``line`` itself. Nothing is appended, so the record
    ends with a newline only if ``line`` does.

    Args:
        line: Text placed in the last column.
        cat: Category label placed in the first column (must be a str).
        num: Value placed in the second column.

    Returns:
        str: The four fields joined by tab characters.
    """
    fields = (cat, str(num), "", line)
    return "\t".join(fields)

In [5]:
# Collect the paths of the zip archives found directly under akuta_dir.
# glob returns matches in arbitrary order, so they are sorted to keep the
# order in which archives are processed stable between runs and machines.
a_zips = sorted(glob.glob("%s/*.zip" % akuta_dir))

In [6]:
# Export lines from Ryunosuke Akutagawa's works to a TSV file.
# Every preprocessed line whose length lies strictly between min_len and
# max_len is written; no random sampling is done in this cell.
min_len = 60
max_len = 240
with open(akuta_tsv, "w") as wf:
    # Process every zip archive
    for fname in a_zips:
        with zipfile.ZipFile(fname, 'r') as f:
            names = f.namelist()
            if not names:
                continue  # empty archive: nothing to read
            # Prefer the first .txt member rather than assuming the text is
            # the first entry; fall back to the first member otherwise.
            tfile = next((n for n in names if n.lower().endswith('.txt')), names[0])
            text = f.read(tfile).decode('CP932', 'ignore')
        for line in preprocess(io.StringIO(text)):
            # Length filter (len() includes the line's trailing newline)
            if min_len < len(line) < max_len:
                tline = to_tsv(line, "akuta", 0)
                # A final line without a newline must not be glued to the
                # record written after it.
                if not tline.endswith("\n"):
                    tline += "\n"
                wf.write(tline)

In [7]:
# Export lines from Kenji Miyazawa's works to a TSV file.
# Every preprocessed line whose length lies strictly between min_len and
# max_len is written; no random sampling is done in this cell.
# glob returns matches in arbitrary order, so the archive list is sorted to
# keep the output order stable between runs and machines.
m_zips = sorted(glob.glob("%s/*.zip" % miya_dir))
min_len = 60
max_len = 240
with open(miya_tsv, "w") as wf:
    # Process every zip archive
    for fname in m_zips:
        with zipfile.ZipFile(fname, 'r') as f:
            names = f.namelist()
            if not names:
                continue  # empty archive: nothing to read
            # Prefer the first .txt member rather than assuming the text is
            # the first entry; fall back to the first member otherwise.
            tfile = next((n for n in names if n.lower().endswith('.txt')), names[0])
            text = f.read(tfile).decode('CP932', 'ignore')
        for line in preprocess(io.StringIO(text)):
            # Length filter (len() includes the line's trailing newline)
            if min_len < len(line) < max_len:
                tline = to_tsv(line, "miya", 1)
                # A final line without a newline must not be glued to the
                # record written after it.
                if not tline.endswith("\n"):
                    tline += "\n"
                wf.write(tline)

In [6]:
# Merge the two per-author TSV files and shuffle their records.
# (`random` comes from the notebook's import cell.)

random.seed(100)  # fixed seed: same input order -> same shuffled order
lines = []
for path in (akuta_tsv, miya_tsv):
    with open(path) as f:
        lines.extend(f.readlines())
# If a file's last record has no trailing newline, it would be joined with
# whichever record ends up after it once the list is shuffled.
lines = [ln if ln.endswith("\n") else ln + "\n" for ln in lines]
random.shuffle(lines)
with open("aozora-all.tsv", "w") as f:
    f.writelines(lines)
# Rough shell alternative (does not use the fixed seed above):
#! cat aozora-akutagawa.tsv aozora-miyazawa.tsv | sort -R > aozora-all.tsv

In [8]:
# Split the shuffled corpus into train / dev / test files
train_fname, dev_fname, test_fname = "train.tsv", "dev.tsv", "test.tsv"

# Fixed seed so the assignment of records to splits is repeatable.
random.seed(101)

with open("aozora-all.tsv") as src, \
        open(train_fname, "w") as train_out, \
        open(dev_fname, "w") as dev_out, \
        open(test_fname, "w") as test_out:
    # Only the test file gets a header row.
    test_out.write("class\tsentence\n")
    for record in src:
        # One draw in 0..9 per record: 0-7 -> train, 8 -> dev, 9 -> test.
        bucket = random.randint(0, 9)
        if bucket < 8:
            train_out.write(record)
        elif bucket == 8:
            dev_out.write(record)
        else:
            # The test file keeps only the 2nd and 4th columns of the record.
            fields = record.split('\t')
            test_out.write(fields[1] + "\t" + fields[3])