In [1]:
import tensorflow.keras as keras

# Evaluate the version string of the Keras that ships with TensorFlow so the
# notebook shows which release the following cells were run against.
keras.__version__

'2.7.0'

# 单词级的 one-hot 编码

In [None]:
import numpy as np

# Word-level one-hot encoding written by hand.
# Each element of the list is one sample.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build an index of every token found in the data. Tokenisation is a plain
# whitespace split, so punctuation stays attached to words ('mat.'); a real
# application would also strip punctuation and special characters.
token_index = {}
for sample in samples:
    for word in sample.split():
        # Give each previously unseen word the next free index. Indices start
        # at 1: index 0 is deliberately never assigned to a word.
        token_index.setdefault(word, len(token_index) + 1)

max_length = 10  # Only the first max_length words of each sample are encoded.

# The encoded output lives here, shaped (samples, max_length, vocabulary + 1).
# Indices run contiguously from 1 to len(token_index), so the last axis needs
# len(token_index) + 1 slots (slot 0 stays unused). Using len() rather than
# max() also keeps this line valid when the vocabulary is empty.
results = np.zeros(shape=(len(samples), max_length, len(token_index) + 1))

for i, sample in enumerate(samples):
    # Truncate the token list before enumerating, so no more than max_length
    # (position, word) pairs are ever produced.
    for j, word in enumerate(sample.split()[:max_length]):
        # Index the dict directly: a missing word should fail loudly with
        # KeyError. dict.get() would return None, which NumPy interprets as
        # np.newaxis and would silently set the whole row to 1.0.
        results[i, j, token_index[word]] = 1.0

# 用 keras 实现单词级 one-hot 编码

In [None]:
from keras.preprocessing.text import Tokenizer

# Word-level one-hot encoding using the Keras Tokenizer utility.
# Two example sentences; each list element is one sample.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)  # Create a tokenizer configured to only consider the 1,000 most common words
tokenizer.fit_on_texts(samples)  # Build the word index from the samples

sequences = tokenizer.texts_to_sequences(samples)  # Turn each string into a list of integer indices

# Encode the samples directly as a matrix using the 'binary' mode.
# NOTE(review): presumably one row per sample and num_words columns — confirm
# against the Keras Tokenizer documentation.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
