# In [None]:
# coding: utf-8

import os
import re

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.datasets import imdb
from keras.layers import Convolution1D, MaxPooling1D
from keras.layers import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.regularizers import l2
from keras.utils import np_utils

# Fixed number of characters kept per document: every sample is encoded as
# a vector of exactly this many character indices.
maxlen = 1041
# The only characters the model distinguishes (lower-case letters, digits,
# a few punctuation marks and the space).
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789,.!? '
# Both splits are CSV files whose first row holds the column names.
datatrain, datatest = (
    pd.read_csv(csv_path, header=0) for csv_path in ("train.csv", "test.csv")
)


# Unique characters of the alphabet.
chars = set(alphabet)
print('total chars:', len(chars))
# Number the characters in sorted order instead of set-iteration order.
# Iteration order of a set of strings changes between interpreter runs when
# hash randomization is active, which would give every run a different
# character -> index mapping and make saved weights unusable in a later run.
# The space character sorts first, so index 0 stands for the space.
_ordered_chars = sorted(chars)
char_indices = {c: i for i, c in enumerate(_ordered_chars)}
indices_char = {i: c for i, c in enumerate(_ordered_chars)}


# Matches every character outside the model alphabet. Compiled once because
# it is applied to every document of both splits.
_NON_ALPHABET = re.compile(r"[^a-z0-9,.!?]")


def _clean_text(title, cont):
    """Return title + content, lower-cased, with foreign characters blanked.

    Missing cells (None / NaN) count as empty text instead of crashing the
    concatenation. Lower-casing happens before filtering; otherwise every
    upper-case letter would be replaced by a space, since the alphabet only
    contains lower-case letters. Title and content are joined without a
    separator.
    """
    merged = "".join(
        "" if pd.isnull(part) else str(part) for part in (title, cont)
    )
    return _NON_ALPHABET.sub(" ", merged.lower())


def _collect_docs(frame, banner, progress_msg):
    """Clean every row of `frame` and return (docs, zero-based labels).

    `frame` must provide the columns `content`, `title` and `classes`.
    `banner` is printed once up front; `progress_msg` is printed together
    with the running row count every 20000 rows.
    """
    print(banner)
    docs, labels = [], []
    rows = zip(frame.content, frame.title, frame.classes)
    for count, (cont, title, label) in enumerate(rows, start=1):
        docs.append(_clean_text(title, cont))
        # Labels in the CSV start at 1; shift them so they start at 0.
        labels.append(label - 1)
        if count % 20000 == 0:
            print(progress_msg, count)
    print('Success!')
    return docs, labels


def _encode_docs(docs, length, table):
    """Encode documents as a (len(docs), length) int64 matrix of char indices.

    Despite the log message this is an integer index encoding, not a one-hot
    encoding. Only the last `length` characters of a document are used, and
    they are written in reverse order: the first kept character lands in the
    last column. Shorter documents leave the leading columns at 0.
    """
    print('Doing one hot encoding:')
    encoded = np.zeros((len(docs), length), dtype=np.int64)
    for row, doc in enumerate(docs):
        codes = [table[ch] for ch in doc[-length:]]
        if codes:
            encoded[row, length - len(codes):] = codes[::-1]
    print('Success!')
    return encoded


def _encode_labels(labels, n_classes):
    """Turn zero-based class ids into a binary class matrix."""
    print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
    print(n_classes, 'classes in the dataset')
    matrix = np_utils.to_categorical(np.array(labels), n_classes)
    print('Success!')
    return matrix


nb_classes = 5

# Training split.
docs, labels = _collect_docs(
    datatrain, 'zipping the data:', 'zipping the training data:')
print('There are training set:', datatrain.shape[0])
X_train = _encode_docs(docs, maxlen, char_indices)
Y_train = _encode_labels(labels, nb_classes)

# Test split, processed with exactly the same steps as the training split.
docs, labels = _collect_docs(
    datatest, 'zipping the test data:', 'zipping the test data:')
print('There are test set:', datatest.shape[0])
X_test = _encode_docs(docs, maxlen, char_indices)
Y_test = _encode_labels(labels, nb_classes)

print("All of the pre-processde work is done.")

model = Sequential()

# Embedding layer: maps each character index of the input to a dense
# 50-dimensional vector. The vocabulary size comes from the character table
# instead of a hard-coded number, so it follows changes to the alphabet.
model.add(Embedding(input_dim = len(char_indices), output_dim = 50, input_length = maxlen, init = 'he_normal', W_regularizer=l2(0.01)))

# Three stacked 1D convolutions, each with 128 filters spanning 3 consecutive
# characters. 'same' padding with stride 1 keeps the sequence length.
for _ in range(3):
    model.add(Convolution1D(nb_filter = 128, filter_length = 3, W_regularizer=l2(0.01),  init = 'he_normal', border_mode='same', activation='relu', subsample_length=1))

# Max pooling over the full temporal length of the convolution output,
# which leaves a single value per filter.
model.add(MaxPooling1D(pool_length = model.output_shape[1]))

# Flatten the pooled output so that a vanilla dense layer can follow.
model.add(Flatten())

# Vanilla hidden layer.
model.add(Dense(100))
model.add(Activation('relu'))

# Output layer: one unit per class, normalised with a softmax.
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

# Weights are saved after every epoch (save_best_only=False). The target
# directory is created up front: otherwise the first save would fail only
# after a complete epoch of training.
checkpoint_path = "parameters/weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint_dir = os.path.dirname(checkpoint_path)
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)
checkpointers = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=0, save_best_only=False, mode='auto')

# To resume from a saved checkpoint, load it here, e.g.:
#model.load_weights("parameters/weights.39-0.32.hdf5")

model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the test split doubles as validation data, so the val_acc embedded in
# the checkpoint file names is measured on the test set.
model.fit(X_train, Y_train, batch_size = 128, nb_epoch = 20, validation_data=(X_test, Y_test), callbacks = [checkpointers])
