在DNN这类全连接神经网络算法中，隐藏层的前一层的每个节点都需要与下一层的每个节点连接，当节点数量巨大时，产生的连接就非常多，这样，在硬件环境有限的情况下就几乎难以完成训练过程。

为了解决图像处理领域全连接造成的计算量巨大的问题，人们提出了局部连接。其理论基础是基于这样的假设：生物在进行图像识别时，对图像的理解只需要处理局部的数据即可，不需要全面分析全部图像后才能进行处理。

所谓权值共享是指当从一个大尺寸图像中随机选取一小块，比如说8×8作为样本，并且从这个小块样本中学习到了一些特征，这时我们可以把从这个8×8样本中学习到的特征作为探测器，应用到这个图像的任意地方。特别是，我们可以用从8×8样本中所学习到的特征跟原本的大尺寸图像作卷积，从而在这个大尺寸图像上的任一位置获得一个不同特征的激活值。

所谓池化是指人们可以计算图像一个区域上的某个特定特征的平均值或者最大值。这些概要统计特征不仅具有低得多的维度，同时还会改善结果。

In [12]:
from tflearn.datasets import mnist
from tflearn import input_data, conv_2d, residual_bottleneck, activation,\
batch_normalization, global_avg_pool, fully_connected, regression, DNN
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [2]:
X_train, Y_train, X_test, Y_test = mnist.load_data(one_hot=True)

Extracting mnist/train-images-idx3-ubyte.gz
Extracting mnist/train-labels-idx1-ubyte.gz
Extracting mnist/t10k-images-idx3-ubyte.gz
Extracting mnist/t10k-labels-idx1-ubyte.gz


In [3]:
X_train = X_train.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)

In [4]:
net = input_data(shape=[None, 28, 28, 1])
net = conv_2d(net, 64, 3, activation='relu', bias=False)
net = residual_bottleneck(net, 3, 16, 64)
net = residual_bottleneck(net, 1, 32, 128, downsample=True)
net = residual_bottleneck(net, 2, 32, 128)
net = residual_bottleneck(net, 1, 64, 256, downsample=True)
net = residual_bottleneck(net, 2, 64, 256)
net = batch_normalization(net)
net = activation(net, 'relu')
net = global_avg_pool(net)
net = fully_connected(net, 10, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy', learning_rate=0.1)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
model = DNN(net, checkpoint_path='model_resnet_mnist', max_checkpoints=10, tensorboard_verbose=0)
model.fit(X_train, Y_train, n_epoch=5, validation_set=(X_test, Y_test), show_metric=True, batch_size=512, run_id='resnet_mnist')

Training Step: 539  | total loss: [1m[32m0.17937[0m[0m | time: 2643.740s
| Adam | epoch: 005 | loss: 0.17937 - acc: 0.9446 -- iter: 54784/55000
Training Step: 540  | total loss: [1m[32m0.18357[0m[0m | time: 2728.319s
| Adam | epoch: 005 | loss: 0.18357 - acc: 0.9435 | val_loss: 3.86709 - val_acc: 0.4834 -- iter: 55000/55000
--


In [1]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, concatenate, Input, Average, Add, BatchNormalization
from keras import regularizers
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
import glob
from collections import Counter
import nltk
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [3]:
PATH_NEG = './数据集/movie-review-data/review_polarity/txt_sentoken/neg/'
PATH_POS = './数据集/movie-review-data/review_polarity/txt_sentoken/pos/'

def build_vocab(path1, path2):
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    counter = Counter()
    for file in fileList:
        f = open(file, 'r')
        for line in f:
            for word in nltk.word_tokenize(line.strip()):
                if word.isdigit():
                    word = "9"
                counter[word] += 1
    word2idx = {w:i+2 for i, w in enumerate(counter)}
    word2idx['PAD'] = 0
    word2idx['UNK'] = 1
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word

In [4]:
def get_maxlen(path1, path2):
    sent_len = 0
    sent_maxlen = 0
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    for file in fileList:
        f = open(file, 'r')
        for line in f:
            sent_len = len(nltk.word_tokenize(line.strip()))
            if sent_len > sent_maxlen:
                sent_maxlen = sent_len
    return sent_maxlen

In [5]:
def vectorize(path1, path2, word2idx):
    sentences = []
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    i = 0
    j = 0
    for file in fileList:
        f = open(file, 'r')
        for line in f:
            sentence = []
            for word in nltk.word_tokenize(line.strip()):
                if word.isdigit():
                    word = "9"
                try:
                    sentence.append(word2idx[word])
                except KeyError:
                    sentence.append(word2idx['UNK'])                    
            sentences.append(sentence)
            if file not in fileList2:
                i += 1
            else:
                j += 1
    X = pad_sequences(sentences, maxlen=get_maxlen(path1, path2))
    Y = to_categorical(i * [0] + j * [1], num_classes=2)
    return X, Y  

In [6]:
word2idx, idx2word = build_vocab(PATH_POS, PATH_NEG)
maxlen = get_maxlen(PATH_POS, PATH_NEG)
X, Y = vectorize(PATH_POS, PATH_NEG, word2idx)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [10]:
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 256
NUM_EPOCHS = 10

inputs = Input(shape=(maxlen,))
net = Embedding(len(word2idx), EMBEDDING_SIZE, input_length=maxlen, name='input')(inputs)
conv1d_3 = Conv1D(HIDDEN_LAYER_SIZE, 3, padding='same', activation='relu', name='conv1d_3', kernel_regularizer=regularizers.l2(0.01))(net)
conv1d_4 = Conv1D(HIDDEN_LAYER_SIZE, 4, padding='same', activation='relu', name='conv1d_4', kernel_regularizer=regularizers.l2(0.01))(net)
conv1d_5 = Conv1D(HIDDEN_LAYER_SIZE, 5, padding='same', activation='relu', name='conv1d_5', kernel_regularizer=regularizers.l2(0.01))(net)
average = Average()([conv1d_3, conv1d_4, conv1d_5])
net = GlobalAveragePooling1D()(average)
net = Dropout(0.5)(net)
batch = BatchNormalization()(net)
output = Dense(2, activation='sigmoid')(batch)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(X_test, Y_test))

Train on 45304 samples, validate on 19416 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3cba2a90>

In [2]:
def build_vocab(path1, path2):
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    counter = Counter()
    documents = []
    for file in fileList:
        f = open(file, 'rb')
        for line in f.readlines():
            line = nltk.word_tokenize(line.decode('unicode-escape').strip())
            for word in line:
                if word.startswith('\\x'):
                    word = 'UNK'
                if word.isdigit():
                    word = '9'
                counter[word] += 1
    word2idx = {w[0]:i+2 for i, w in enumerate(counter)}
    word2idx['PAD'] = 0
    word2idx['UNK'] = 1
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word

In [3]:
def get_maxlen(path1, path2):
    sent_len = 0
    sent_maxlen = 0
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    for file in fileList:
        f = open(file, 'rb')
        for line in f.readlines():
            line = nltk.word_tokenize(line.decode('unicode-escape').strip())
            for word in line:
                if word.startswith('\\x'):
                    word = 'UNK'
                if word.isdigit():
                    word = '9'
            sent_len = len(line)
            if sent_len > sent_maxlen:
                sent_maxlen = sent_len
    return sent_maxlen

In [4]:
def vectorize(path1, path2, word2idx):
    sentences = []
    fileList1 = glob.glob(path1 + '*.txt')
    fileList2 = glob.glob(path2 + '*.txt')
    fileList = fileList1 + fileList2
    i = 0
    j = 0
    for file in fileList:
        f = open(file, 'rb')
        for line in f.readlines():
            sentence = []
            line = nltk.word_tokenize(line.decode('unicode-escape').strip())
            for word in line:
                if word.startswith('\\x'):
                    word = 'UNK'
                if word.isdigit():
                    word = '9'
                try:
                    sentence.append(word2idx[word])
                except KeyError:
                    sentence.append(word2idx['UNK']) 
            sentences.append(sentence)
            if file not in fileList2:
                i += 1
            else:
                j += 1
    X = pad_sequences(sentences, maxlen=get_maxlen(path1, path2))
    Y = to_categorical(i * [0] + j * [1], num_classes=2)
    return X, Y  

In [5]:
PATH_HAM = './数据集/enron1/ham/'
PATH_SPAM = './数据集/enron1/spam/'

word2idx_email, idx2word_email = build_vocab(PATH_HAM, PATH_SPAM)
maxlen_email = get_maxlen(PATH_HAM, PATH_SPAM)
X_email, Y_email = vectorize(PATH_HAM, PATH_SPAM, word2idx_email)
X_train_email, X_test_email, Y_train_email, Y_test_email = train_test_split(X_email, Y_email, test_size=0.3)

In [13]:
tf.reset_default_graph()

In [16]:
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 256
NUM_EPOCHS = 10

inputs = Input(shape=(maxlen_email,))
net = Embedding(len(word2idx_email), EMBEDDING_SIZE, input_length=maxlen_email, name='input')(inputs)
conv1d_3 = Conv1D(HIDDEN_LAYER_SIZE, 3, padding='same', activation='relu', name='conv1d_3', kernel_regularizer=regularizers.l2(0.01))(net)
conv1d_4 = Conv1D(HIDDEN_LAYER_SIZE, 4, padding='same', activation='relu', name='conv1d_4', kernel_regularizer=regularizers.l2(0.01))(net)
conv1d_5 = Conv1D(HIDDEN_LAYER_SIZE, 5, padding='same', activation='relu', name='conv1d_5', kernel_regularizer=regularizers.l2(0.01))(net)
average = Average()([conv1d_3, conv1d_4, conv1d_5])
net = GlobalAveragePooling1D()(average)
net = Dropout(0.5)(net)
batch = BatchNormalization()(net)
output = Dense(2, activation='sigmoid')(batch)
model_email = Model(inputs=inputs, outputs=output)
model_email.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model_email.fit(X_train_email, Y_train_email, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(X_test_email, Y_test_email))

Train on 70226 samples, validate on 30098 samples
Epoch 1/10


InvalidArgumentError: Tensor input_2:0, specified in either feed_devices or fetch_devices was not found in the Graph