In [15]:
# coding: utf-8

import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Embedding, LSTM
from keras.layers import Convolution1D, MaxPooling1D
from keras.datasets import imdb
from keras import backend as K
import re
from keras.utils import np_utils
from keras.preprocessing import text
from keras.regularizers import l2, activity_l2

# 生成的 word vector 的 dimension
maxlen = 1000
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789,.!? '
data = pd.read_csv("test.csv", header=0)


chars = set(alphabet)
print('total chars:', len(chars))
# 在做 one-hot coding dict 中对应i = 1 时, c 是第一个字符...如此类推 两个 dict 只是 key-value 调换过来了
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# 创建 len(docs)个, 1 * maxlen 的矩阵
X = np.ones((data.shape[0], maxlen), dtype = np.int64) * 0

print('total dataset size:', data.shape[0])

docs = []
labels = []

print('zipping the data:')
epoch = 0
avg = 0
for cont,title,label in zip(data.content, data.title, data.classes):
    content = title + cont
    
    content = re.sub("[^a-z0-9\,\.\!\?]", " ", content)
    avg = avg + len(content)
    docs.append(content)
    label = label - 1
    labels.append(label)
    epoch = epoch + 1
    if (epoch % 10000 == 0):
        print('zipping the data:', epoch)
print('Success!') 

print(avg/60000)

print('Doing one hot encoding:')
    # One-Hot encoding 另外应该是反过来进行 encode 的,,稀疏部分用-1代替
for i, doc in enumerate(docs):
    # 倒着数后面的maxlen个数字,但是输出顺序不变
    for t, char in enumerate(doc[-maxlen:]):
                X[i, (maxlen-1-t)] = char_indices[char]    
print('Success!')    

y = np.array(labels)   

print('Randomize the dataset:')    
ids = np.arange(len(X))
np.random.shuffle(ids)
X = X[ids]
y = y[ids]
print('Success!')   

margin_size = int(data.shape[0] * 0.8)
print('Spilt the dataset into train/test:') 

X_train = X[:margin_size]
X_test = X[margin_size:]

y_train = y[:margin_size]
y_test = y[margin_size:]
print('Success!')   

print('There are training set:', margin_size)
print('There are test set:', data.shape[0] - margin_size) 

print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
nb_classes = 5 
print(nb_classes, 'classes')
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Success!')   



total chars: 41
total dataset size: 60000
zipping the data:
zipping the data: 10000
zipping the data: 20000
zipping the data: 30000
zipping the data: 40000
zipping the data: 50000
zipping the data: 60000
Success!
2792.9135
Doing one hot encoding:
Success!
Randomize the dataset:
Success!
Spilt the dataset into train/test:
Success!
There are training set: 48000
There are test set: 12000
Convert class vector to binary class matrix (for use with categorical_crossentropy)
5 classes
Success!


In [31]:
print(len(content))
print((X_train[0]))
print(X_train.size)

1967
[36 23 28 37 22 23  9  7  4  9 12 36 36 35  9  6 36 22 20  4  9 21 36 22  5
 26 25 36 35 26 18 36 36 36 36 36 22 36 15 36 20  9 11 36 37 22 20 23 40 36
 35  9 40 36 37 22  5  4 10 36 35 23 27 36 35  9 13  7 36 23  5  4 10 36 20
 28 11 36 22 35 26 27 36 20  4  2 36  3 23 28  7 36 20 23 27 36  9  5  4 18
 36 37 22  5 26 13 18 36 22 35 26 27 36  3 23 29 36 14 36  5 28 40 36 23  3
  4  9  6 36 35  9 40 36  5 28 13 36 37 22  5 23 10 36 37 22 35 23  0 36  9
 20  4 37 36 37 22 20  9 10 36  5  9 13  7 36 25 35 26 36 14 36  5 28 40 36
 23  3  4  9  6 36 35  9 40 36 20  9  7 36 37 22 20 23 37 36 37 22 35 23  0
 36  9 20  4 37 36 37 22 20  9 10 36  5 28 21 36 26 40 36 35 26 28  6 36  3
 23 29 36 14 36 35  9 40 36 37 22  5  4 10 36 35 23 27 36 35  9 13  7 36  5
  9 13  7 36  3  9  2 36  9 35 26 27 36  5  4 28 39 36  9 35  4  1 36  9  5
 26 29 36 36 36 37 22  3 26 27 36 35 23 28 37 36 37 22 35  4 10 36 36 22 36
 30 36 35  9 40 36 37 22  5  4 10 36 22  3 26 18 36  5  9 13  7 36  3  9  2
 36  9 

In [33]:
model = Sequential()


# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(input_dim = 41, output_dim = 50, input_length = maxlen, init = 'he_normal', W_regularizer=l2(0.01), dropout = 0.2))

print("Embedding")
print(model.input_shape)
print(model.output_shape)


# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter = 250, filter_length = 3, border_mode='valid', activation='relu', subsample_length=1))

print("Convolution1D")
print(model.output_shape)

# we use max pooling:
model.add(MaxPooling1D(pool_length = model.output_shape[1]))

print("MaxPooling1D")
print(model.output_shape)


# We flatten the output of the conv layer,
# so that we can add a vanilla dense layer:
model.add(Flatten())
print("Flatten")
print(model.output_shape)

# 这个应该相当于fully connected layer吧
# We add a vanilla hidden layer:
model.add(Dense(100))
model.add(Dropout(0.2))
model.add(Activation('relu'))

print("Dense")
print(model.output_shape)

# We project onto a single unit output layer, and squash it with a softmax:
model.add(Dense(5))
model.add(Activation('softmax'))

print("softmax")
print(model.output_shape)

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size = 32, nb_epoch = 10, validation_data=(X_test, Y_test))
model.save_weights("parameters/para.hdf5")

Embedding
(None, 1000)
(None, 1000, 50)
Convolution1D
(None, 998, 250)
MaxPooling1D
(None, 1, 250)
Flatten
(None, 250)
Dense
(None, 100)
softmax
(None, 5)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_11 (Embedding)         (None, 1000, 50)      2050        embedding_input_11[0][0]         
____________________________________________________________________________________________________
convolution1d_11 (Convolution1D) (None, 998, 250)      37750       embedding_11[0][0]               
____________________________________________________________________________________________________
maxpooling1d_11 (MaxPooling1D)   (None, 1, 250)        0           convolution1d_11[0][0]           
____________________________________________________________________________________________________
flatten_11 (Flatten)             (Non

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 48000 samples, validate on 12000 samples
Epoch 1/10

KeyboardInterrupt: 