In [7]:
# 有两个作者的文章（A, B），定义为0， 1
A = 0 # hamilton
B = 1 # madison
UNKNOWN = -1

def preprocessing(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
        text = ' '.join(lines[1:]).replace('\n',' ').replace('  ', ' ').lower().replace('hamilton','').replace('madison','')
        text = ' '.join(text.split())
        return text

# 把同一作者的文章全部合并到一个文件
textA, textB = '', ''

import os
for file in os.listdir('./papers/A'):
    textA += preprocessing('./papers/A/'+file)
for file in os.listdir('./papers/B'):
    textB += preprocessing('./papers/B/'+file)
print("文本A的长度：{}".format(len(textA)))
print("文本B的长度：{}".format(len(textB)))

文本A的长度：216394
文本B的长度：230867


In [8]:
SEQ_LEN = 30 # 切分序列的长度，超参数
import numpy as np
def make_subsequence(long_seq, label, seq_len=SEQ_LEN):
    numofsubseq = len(long_seq)-seq_len+1
    X = np.zeros((numofsubseq, seq_len))
    y = np.zeros((numofsubseq, 1))
    for i in range(numofsubseq):
        X[i] = long_seq[i:i+seq_len]
        y[i] = label
    return X, y


In [10]:
from keras.preprocessing.text import Tokenizer
char_tokenizer = Tokenizer(char_level=True)

char_tokenizer.fit_on_texts(textA + textB)

long_seq_a = char_tokenizer.texts_to_sequences([textA])[0]
long_seq_b = char_tokenizer.texts_to_sequences([textB])[0]

Xa, ya = make_subsequence(long_seq_a, A)
Xb, yb = make_subsequence(long_seq_b, B)
print('字符的种类：{}'.format(len(char_tokenizer.word_index))) # 52
# {' ': 1, 'e': 2, 't': 3, 'o': 4, 'i': 5, 'n': 6, 'a': 7, 's': 8, 'r': 9, 'h': 10,
#  'l': 11, 'd': 12, 'c': 13, 'u': 14, 'f': 15, 'm': 16, 'p': 17, 'b': 18, 'y': 19, 'w': 20,
#  ',': 21, 'g': 22, 'v': 23, '.': 24, 'x': 25, 'k': 26, 'j': 27, ';': 28, 'q': 29, 'z': 30,
#  '-': 31, '?': 32, '"': 33, '1': 34, ':': 35, '8': 36, '7': 37, '(': 38, ')': 39, '2': 40,
#  '0': 41, '3': 42, '4': 43, '6': 44, "'": 45, '!': 46, ']': 47, '5': 48, '[': 49, '@': 50,
#  '9': 51, '%': 52}
print('A训练集大小：{}'.format(Xa.shape))
print('B训练集大小：{}'.format(Xb.shape))

KeyboardInterrupt: 

In [12]:
# 堆叠AB训练数据在一起
X = np.vstack((Xa, Xb))
y = np.vstack((ya, yb))

# 训练集测试集拆分
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [13]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding

Embedding_dim = 128
RNN_size = 256

model = Sequential()
model.add(Embedding(input_dim=len(char_tokenizer.word_index)+1,
                    output_dim=Embedding_dim,
                    input_length=SEQ_LEN))
model.add(SimpleRNN(units=RNN_size, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 128)           6784      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 30, 256)           98560     
_________________________________________________________________
dense_1 (Dense)              (None, 30, 1)             257       
Total params: 105,601
Trainable params: 105,601
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 4096
epochs = 20
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_test, y_test),verbose=1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1) # set the vertical range to [0-1]
plt.show()

In [None]:
# 测试
# 测试数据处理
for file in os.listdir('./papers/Unknown'):
    unk_file = preprocessing('./papers/Unknown/'+file)
    unk_file_seq = char_tokenizer.texts_to_sequences([unk_file])[0]
    X_unk, _ = make_subsequence(unk_file_seq, UNKNOWN)
    y_pred = model.predict(X_unk)
    y_pred = y_pred > 0.5
    votesA = np.sum(y_pred==0)
    votesB = np.sum(y_pred==1)
    print("文章 {} 被预测为 {} 写的，投票数 {} ： {}".format(
        file,
        "A:hamilton" if votesA > votesB else "B:madison",
        max(votesA, votesB),
        min(votesA, votesB)
    ))
