In [48]:
import numpy as np

def load_data(path='imdb.npz',
              num_words=None,
              skip_top=0,
              maxlen=None,
              seed=113,
              start_char=1,
              oov_char=2,
              index_from=3,
              **kwargs):
 
  # Legacy support
  if 'nb_words' in kwargs:
    logging.warning('The `nb_words` argument in `load_data` '
                    'has been renamed `num_words`.')
    num_words = kwargs.pop('nb_words')
  if kwargs:
    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
  #path = get_file(
  #    path,
  #    origin=origin_folder + 'imdb.npz',
  #    file_hash=
  #    '69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f')
      
  path = "./imdb.npz"
  with np.load(path, allow_pickle=True) as f:
    x_train, labels_train = f['x_train'], f['y_train']
    x_test, labels_test = f['x_test'], f['y_test']

  np.random.seed(seed)
  indices = np.arange(len(x_train))
  np.random.shuffle(indices)
  x_train = x_train[indices]
  labels_train = labels_train[indices]

  indices = np.arange(len(x_test))
  np.random.shuffle(indices)
  x_test = x_test[indices]
  labels_test = labels_test[indices]

  xs = np.concatenate([x_train, x_test])
  labels = np.concatenate([labels_train, labels_test])

  if start_char is not None:
    xs = [[start_char] + [w + index_from for w in x] for x in xs]
  elif index_from:
    xs = [[w + index_from for w in x] for x in xs]

  if maxlen:
    xs, labels = _remove_long_seq(maxlen, xs, labels)
    if not xs:
      raise ValueError('After filtering for sequences shorter than maxlen=' +
                       str(maxlen) + ', no sequence was kept. '
                       'Increase maxlen.')
  if not num_words:
    num_words = max([max(x) for x in xs])

  # by convention, use 2 as OOV word
  # reserve 'index_from' (=3 by default) characters:
  # 0 (padding), 1 (start), 2 (OOV)
  if oov_char is not None:
    xs = [
        [w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs
    ]
  else:
    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]

  idx = len(x_train)
  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])

  return (x_train, y_train), (x_test, y_test)


In [49]:
import json
#@keras_export('keras.datasets.imdb.get_word_index')
def get_word_index(path='imdb_word_index.json'):
  """Retrieves the dictionary mapping word indices back to words.

  Arguments:
      path: where to cache the data (relative to `~/.keras/dataset`).

  Returns:
      The word index dictionary.
  """
  #origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
  #path = get_file(
  #    path,
  #    origin=origin_folder + 'imdb_word_index.json',
  #    file_hash='bfafd718b763782e994055a2d397834f')

  path = "./imdb_word_index.json"
  with open(path) as f:
    return json.load(f)


In [50]:
word_index = get_word_index()

id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token


In [51]:
from tensorflow import keras
imdb = keras.datasets.imdb

NUM_WORDS = 10000
(train_data, train_labels),(test_data, test_labels) = load_data(num_words=NUM_WORDS)

In [52]:
" ".join([id_to_word[id_] for id_ in train_data[0][:50]])

"<sos> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <unk> is an amazing actor and now the same being director <unk> father came from the same scottish island as myself so i loved"

In [53]:
train_data.shape

(25000,)

In [61]:
label = train_labels[0]
print("Label:", label, "= Positive" if label else "= Negative")

Label: 1 = Positive


In [55]:
train_data[0],


([1,
  14,
  22,
  16,
  43,
  530,
  973,
  1622,
  1385,
  65,
  458,
  4468,
  66,
  3941,
  4,
  173,
  36,
  256,
  5,
  25,
  100,
  43,
  838,
  112,
  50,
  670,
  2,
  9,
  35,
  480,
  284,
  5,
  150,
  4,
  172,
  112,
  167,
  2,
  336,
  385,
  39,
  4,
  172,
  4536,
  1111,
  17,
  546,
  38,
  13,
  447,
  4,
  192,
  50,
  16,
  6,
  147,
  2025,
  19,
  14,
  22,
  4,
  1920,
  4613,
  469,
  4,
  22,
  71,
  87,
  12,
  16,
  43,
  530,
  38,
  76,
  15,
  13,
  1247,
  4,
  22,
  17,
  515,
  17,
  12,
  16,
  626,
  18,
  2,
  5,
  62,
  386,
  12,
  8,
  316,
  8,
  106,
  5,
  4,
  2223,
  5244,
  16,
  480,
  66,
  3785,
  33,
  4,
  130,
  12,
  16,
  38,
  619,
  5,
  25,
  124,
  51,
  36,
  135,
  48,
  25,
  1415,
  33,
  6,
  22,
  12,
  215,
  28,
  77,
  52,
  5,
  14,
  407,
  16,
  82,
  2,
  8,
  4,
  107,
  117,
  5952,
  15,
  256,
  4,
  2,
  7,
  3766,
  5,
  723,
  36,
  71,
  43,
  530,
  476,
  26,
  400,
  317,
  46,
  7,
  4,
  2,
  1029,
  

In [56]:
400import keras
maxlen = 400
x_train = keras.preprocessing.sequence.pad_sequences(train_data,
                                                    padding='post',
                                                    maxlen=maxlen,
                                                    truncating='post')
x_test = keras.preprocessing.sequence.pad_sequences(test_data,
                                                     padding='post', 
                                                     maxlen=maxlen,
                                                     truncating='post'
                                                    )

In [57]:
x_train[0]

array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
       4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
        838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
          4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
       1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
        147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
         71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
          4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
         62,  386,   12,    8,  316,    8,  106,    5,    4, 2223, 5244,
         16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
          5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
         22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
          2,    8,    4,  107,  117, 5952,   15,  256,    4,    2,    7,
       3766,    5,  723,   36,   71,   43,  530,  4

In [58]:
import keras

model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim = NUM_WORDS, # 输入词汇表长度
                                 input_length=400,
                                 output_dim=32,
                                ))
#model.add(keras.layers.Flatten())


model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=8)))
model.add(keras.layers.Dense(units=256, activation='relu'))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(units=2, activation='softmax'))
model.summary()
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
model.fit(x_train,train_labels,
          validation_split=0.2,
          epochs=2,
         )


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 32)           320000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 16)                2624      
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 514       
Total params: 327,490
Trainable params: 327,490
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2

<keras.callbacks.callbacks.History at 0x7efec42b2978>

In [None]:
model.evaluate(x_test, test_labels)

In [77]:
t= "good"
input = [word_index[c] for c in t]
input.insert(0, 1)
input = np.array(input)
input


input = keras.preprocessing.sequence.pad_sequences(input,
                                                    padding='post',
                                                    maxlen=400,
                                                    truncating='post')

input = input.reshape(-1, input.shape[0])
input


ValueError: `sequences` must be a list of iterables. Found non-iterable: 1

In [79]:

pred = model.predict(x_test)
pred

array([[0.9651609 , 0.03483909],
       [0.05265727, 0.9473427 ],
       [0.08069908, 0.9193009 ],
       ...,
       [0.9725151 , 0.0274849 ],
       [0.9318757 , 0.06812433],
       [0.81506014, 0.18493989]], dtype=float32)