In [62]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
import json

In [63]:
MAX_NUM_WORDS   = 10000  #15000
EMBEDDING_DIM   = 300    
MAX_SEQ_LENGTH  = 256    #500
USE_GLOVE       = True
FILTER_SIZES    = [3,4,5]
FEATURE_MAPS    = [200,200,200]
DROPOUT_RATE    = 0.4
HIDDEN_UNITS    = 200
NB_CLASSES      = 2

## 1. 数据获取（保证样本均衡）

In [64]:
# 加载imdb数据集（训练集/测试集）
def load_data(path='imdb.npz', num_words=None, skip_top=0, seed=113,
              start_char=1, oov_char=2, index_from=3):

    # 1. load data
    with np.load(path) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    # 2. shuffle train/test
    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    # 保留前3个index
    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]


    if not num_words:
        num_words = max([max(x) for x in xs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
    else:
        xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

    idx = len(x_train)
    x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])

    return (x_train, y_train), (x_test, y_test)




# 加载：单词-index字典
def get_word_index(path='imdb_word_index.json'):
    with open(path) as f:
        return json.load(f)

In [65]:
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=MAX_NUM_WORDS)
len(train_data)

# all_data = np.hstack([train_data,test_data])
# all_labels = np.hstack([train_labels,test_labels])

25000

## 2. 数据切分

In [66]:
from sklearn.model_selection import StratifiedKFold

def split_data(data_length_list = [10,20]):
    sfolder = StratifiedKFold(n_splits=4,random_state=0,shuffle=True)


## 3. other

In [86]:
MAX_NUM_WORDS   = 10000  #15000
EMBEDDING_DIM   = 100    # 50,100,200,300
MAX_SEQ_LENGTH  = 256    #500
USE_GLOVE       = True
FILTER_SIZES    = [3,4,5]
FEATURE_MAPS    = [200,200,200]
DROPOUT_RATE    = 0.4
HIDDEN_UNITS    = 200
NB_CLASSES      = 1   # 2

# LEARNING
BATCH_SIZE      = 100
NB_EPOCHS       = 10
RUNS            = 2
VAL_SIZE        = 0.2

In [87]:
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()  # {word:index}

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [88]:
# print(len(word_index))
tmp_dict = {wd:idx for wd,idx in word_index.items() if len(wd.split())>1}
# print(tmp_dict)
# tmp_dict
len(tmp_dict)

108

In [89]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=MAX_SEQ_LENGTH)

test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=MAX_SEQ_LENGTH)

In [90]:
len(train_data[0]), len(train_data[1])

(256, 256)

In [38]:
def create_glove_embeddings():
    print('Pretrained embeddings GloVe is loading...')

    embeddings_index = {}
    f = open('/liruishaer/Work2/NLP_models/glove.6B/glove.6B.%id.txt' % EMBEDDING_DIM)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(
        input_dim=MAX_NUM_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQ_LENGTH,
        weights=[embedding_matrix],
        trainable=True,
        name="word_embedding"
    )

In [91]:
embeddings_index = {}
f = open('/liruishaer/Work2/NLP_models/glove.6B/glove.6B.%id.txt' % EMBEDDING_DIM)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))


for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


Found 400000 word vectors in GloVe embedding


In [92]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.55689001,  0.33454001,  0.068255  , ...,  0.037454  ,
        -0.52304   ,  0.52328998],
       [ 0.045259  ,  0.31463999,  0.64099002, ..., -0.1689    ,
        -1.05400002,  0.47262999],
       [ 0.39943001,  0.54632998,  0.38009   , ...,  0.45795   ,
        -0.18339001,  0.12257   ]])

(10000, 100)

In [39]:
"""
CNN model for text classification
This implementation is based on the original paper of Yoon Kim [1].
# References
- [1] [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
@author: Christopher Masch
"""

from keras.layers import Activation, Input, Dense, Dropout, Embedding
from keras.layers.convolutional import SeparableConv1D
from keras.layers import GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras import initializers
from keras import backend as K

class CNN:
    
    __version__ = '0.0.2'
    
    def __init__(self, embedding_layer=None, num_words=None, embedding_dim=None,
                 max_seq_length=100, filter_sizes=[3,4,5], feature_maps=[100,100,100],
                 hidden_units=100, dropout_rate=None, nb_classes=None):
        """
        Arguments:
            embedding_layer : If not defined with pre-trained embeddings it will be created from scratch (default: None)
            num_words       : Maximal amount of words in the vocabulary (default: None)
            embedding_dim   : Dimension of word representation (default: None)
            max_seq_length  : Max length of sequence (default: 100)
            filter_sizes    : An array of filter sizes per channel (default: [3,4,5])
            feature_maps    : Defines the feature maps per channel (default: [100,100,100])
            hidden_units    : Hidden units per convolution channel (default: 100)
            dropout_rate    : If defined, dropout will be added after embedding layer & concatenation (default: None)
            nb_classes      : Number of classes which can be predicted
        """
        self.embedding_layer = embedding_layer
        self.num_words       = num_words
        self.max_seq_length  = max_seq_length
        self.embedding_dim   = embedding_dim
        self.filter_sizes    = filter_sizes
        self.feature_maps    = feature_maps
        self.hidden_units    = hidden_units
        self.dropout_rate    = dropout_rate
        self.nb_classes      = nb_classes
        
    def build_model(self):
        """
        Build the model
        
        Returns:
            Model           : Keras model instance
        """

        # Checks
        if len(self.filter_sizes)!=len(self.feature_maps):
            raise Exception('Please define `filter_sizes` and `feature_maps` with the same length.')
        if not self.embedding_layer and (not self.num_words or not self.embedding_dim):
            raise Exception('Please define `num_words` and `embedding_dim` if you not use a pre-trained embeddings')
        
        
        # Building embeddings from scratch
        if self.embedding_layer is None:
            self.embedding_layer = Embedding(
                input_dim=self.num_words, 
                output_dim=self.embedding_dim,       
                input_length=self.max_seq_length,
                weights=None, trainable=True,
                name="word_embedding"
            )
        
        word_input = Input(shape=(self.max_seq_length,), dtype='int32', name='word_input')
        x = self.embedding_layer(word_input)
        x = Dropout(self.dropout_rate)(x)
        x = self.building_block(x, self.filter_sizes, self.feature_maps)
        x = Activation('relu')(x)
        prediction = Dense(self.nb_classes, activation='softmax')(x)
        return Model(inputs=word_input, outputs=prediction)
    
    
    def building_block(self, input_layer, filter_sizes, feature_maps):
        """ 
        Creates several CNN channels in parallel and concatenate them 
        
        Arguments:
            input_layer : Layer which will be the input for all convolutional blocks
            filter_sizes: Array of filter sizes
            feature_maps: Array of feature maps
            
        Returns:
            x           : Building block with one or several channels
        """
        channels = []
        for ix in range(len(self.filter_sizes)):
            x = self.create_channel(input_layer, filter_sizes[ix], feature_maps[ix])
            channels.append(x)
            
        # Checks how many channels, one channel doesn't need a concatenation
        if (len(channels)>1):
            x = concatenate(channels)
        return x
    
    
    def create_channel(self, x, filter_size, feature_map):
        """
        Creates a layer, working channel wise
        
        Arguments:
            x           : Input for convoltuional channel
            filter_size : Filter size for creating Conv1D
            feature_map : Feature map 
            
        Returns:
            x           : Channel including (Conv1D + GlobalMaxPooling + Dense + Dropout)
        """
        x = SeparableConv1D(feature_map, kernel_size=filter_size, activation='relu', strides=1, padding='same',
                            depth_multiplier=4)(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(self.hidden_units)(x)
        x = Dropout(self.dropout_rate)(x)
        return x

In [41]:
histories = []


# print('Running iteration %i/%i' % (i+1, RUNS))
# random_state = np.random.randint(1000)

# X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=VAL_SIZE, random_state=random_state)
X_train = train_data
X_val = test_data

# train_labels = to_categorical(train_labels)
# train_labels = to_categorical(test_labels)
y_train = train_labels
y_val = test_labels

print(y_train)
print(y_train.shape)


emb_layer = None
# if USE_GLOVE:
#     emb_layer = create_glove_embeddings()

model = CNN(
    embedding_layer = emb_layer,
    num_words       = MAX_NUM_WORDS,
    embedding_dim   = EMBEDDING_DIM,
    filter_sizes    = FILTER_SIZES,
    feature_maps    = FEATURE_MAPS,
    max_seq_length  = MAX_SEQ_LENGTH,
    dropout_rate    = DROPOUT_RATE,
    hidden_units    = HIDDEN_UNITS,
    nb_classes      = NB_CLASSES
).build_model()

model.summary()

# binary_crossentropy  :  integer label    NB_CLASSED=1
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# categoricla_crossentropy  : binary label    NB_CLASSED=2
# model.compile(
#     loss='categorical_crossentropy',
#     optimizer=optimizers.Adam(),
#     metrics=['accuracy']
# )

history = model.fit(
    X_train, y_train,
    epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=[
        ModelCheckpoint(
            'model-%i.h5'%(1), monitor='val_loss', verbose=1, save_best_only=True, mode='min'
        ),
        #TensorBoard(log_dir='./logs/temp', write_graph=True)
    ]
)
print()
histories.append(history.history)

with open('history.pkl', 'wb') as f:
    pickle.dump(histories, f)

[1 0 0 ... 0 1 0]
(25000,)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 256)          0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 256, 50)      50000       word_input[0][0]                 
__________________________________________________________________________________________________
dropout_41 (Dropout)            (None, 256, 50)      0           word_embedding[0][0]             
__________________________________________________________________________________________________
separable_conv1d_31 (SeparableC (None, 256, 200)     40800       dropout_41[0][0]                 
__________________________________________________________________________________

  'TensorFlow optimizers do not '


Epoch 2/10

KeyboardInterrupt: 