In [1]:
def preprocess(text):
    '''Tokenize text and build a word-ID corpus plus vocabulary lookups.

    The text is lower-cased, every period is split off as its own token,
    and the result is tokenized on whitespace. IDs are assigned in order of
    first appearance, starting from 0.

    :param text: input string
    :return: tuple ``(corpus, word_to_id, id_to_word)``; ``corpus`` is a 1-D
        integer NumPy array of word IDs, the dicts map word -> ID and
        ID -> word
    '''
    text = text.lower()
    text = text.replace('.', ' .')
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty-string tokens are produced
    # (split(' ') would emit '' for every extra space).
    words = text.split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # dtype=int keeps the corpus integer-typed even when there are no tokens
    # (np.array([]) would otherwise default to float64).
    corpus = np.array([word_to_id[w] for w in words], dtype=int)

    return corpus, word_to_id, id_to_word


In [2]:
import numpy as np

In [3]:
def convert_one_hot(corpus, vocab_size):
    '''Convert word IDs to their one-hot representation.

    A trailing axis of length ``vocab_size`` is appended to the input shape,
    so a 1-D input yields a 2-D result, a 2-D input a 3-D result, and so on.

    :param corpus: word IDs (NumPy integer array of any dimensionality, or a
        nested list convertible to one)
    :param vocab_size: vocabulary size (length of each one-hot vector)
    :return: one-hot representation as an int32 NumPy array of shape
        ``corpus.shape + (vocab_size,)``
    '''
    corpus = np.asarray(corpus)
    one_hot = np.zeros(corpus.shape + (vocab_size,), dtype=np.int32)

    # Visit every position of the ID array regardless of its rank and set
    # the matching entry on the new trailing axis.
    for position in np.ndindex(corpus.shape):
        one_hot[position + (corpus[position],)] = 1

    return one_hot

In [4]:
def create_contexts_target(corpus, window_size=1):
    '''Build context windows and their target words from a corpus.

    For every position that has ``window_size`` neighbours on both sides, the
    word at that position is the target and the surrounding words (left to
    right, excluding the target itself) are its context.

    :param corpus: corpus as a sequence of word IDs
    :param window_size: window size (with 1, one word on each side forms the
        context)
    :return: tuple ``(contexts, target)`` of NumPy arrays; ``contexts`` has
        one row of ``2 * window_size`` IDs per target
    :raises ValueError: if ``window_size`` is negative
    '''
    if window_size < 0:
        raise ValueError('window_size must be non-negative')

    first = window_size
    # Explicit end index instead of corpus[w:-w]: with w == 0 the negative
    # form becomes corpus[0:0] and silently drops every target. The max()
    # keeps the range empty when the corpus is shorter than the window.
    last = max(len(corpus) - window_size, first)

    target = corpus[first:last]
    offsets = [t for t in range(-window_size, window_size + 1) if t != 0]
    contexts = [[corpus[idx + t] for t in offsets]
                for idx in range(first, last)]

    return np.array(contexts), np.array(target)

In [5]:
class SoftmaxWithLoss:
    '''Softmax layer combined with a cross-entropy loss.

    ``forward`` stores the softmax output and the label indices so that
    ``backward`` can compute the gradient with respect to the input scores.
    '''

    def __init__(self):
        # This layer has no trainable parameters.
        self.params = []
        self.grads = []
        self.y = None  # softmax output
        self.t = None  # supervised labels

    def forward(self, x, t):
        '''Return the cross-entropy loss of scores ``x`` against labels ``t``.

        :param x: score array passed to ``softmax``
        :param t: labels, either class indices or one-hot vectors
        :return: loss value computed by ``cross_entropy_error``
        '''
        self.t = t
        self.y = softmax(x)

        # One-hot labels have as many entries as the softmax output;
        # reduce them to the index of the correct class.
        labels_are_one_hot = self.t.size == self.y.size
        if labels_are_one_hot:
            self.t = self.t.argmax(axis=1)

        return cross_entropy_error(self.y, self.t)

    def backward(self, dout=1):
        '''Return the gradient of the loss with respect to the scores.

        :param dout: upstream gradient multiplier
        :return: array with the same shape as the stored softmax output
        '''
        n_samples = self.t.shape[0]
        rows = np.arange(n_samples)

        grad = self.y.copy()
        # Subtract 1 at the correct-class position of every sample.
        grad[rows, self.t] -= 1
        grad *= dout

        return grad / n_samples

In [6]:
class MatMul:
    '''Matrix-product layer: multiplies its input by a weight matrix.'''

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.x = None  # input saved by forward() for use in backward()

    def forward(self, x):
        '''Return ``np.dot(x, W)`` and remember ``x`` for the backward pass.'''
        (weight,) = self.params
        product = np.dot(x, weight)
        self.x = x
        return product

    def backward(self, dout):
        '''Store the weight gradient and return the input gradient.

        :param dout: gradient of the loss with respect to this layer's output
        :return: gradient of the loss with respect to this layer's input
        '''
        (weight,) = self.params
        grad_weight = np.dot(self.x.T, dout)
        grad_input = np.dot(dout, weight.T)
        # Write into the existing gradient array (rather than rebinding it)
        # so that other references to the same array see the new values.
        self.grads[0][...] = grad_weight
        return grad_input



In [7]:

def softmax(x):
    '''Numerically stable softmax along the last axis.

    Works for arrays of any dimensionality >= 1 (a single score vector, a
    batch of vectors, or higher-rank batches); the input is not modified.

    :param x: NumPy array of scores whose last axis indexes the classes
    :return: array of the same shape whose entries along the last axis are
        non-negative and sum to 1
    '''
    # Subtracting the maximum does not change the result mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = x - x.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / exp_scores.sum(axis=-1, keepdims=True)
    
def cross_entropy_error(y, t):
    '''Mean cross-entropy between predicted probabilities and labels.

    :param y: predicted probabilities, one row per sample (a 1-D array is
        treated as a single sample)
    :param t: labels, either class indices or one-hot vectors
    :return: cross-entropy averaged over the samples
    '''
    single_sample = y.ndim == 1
    if single_sample:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # One-hot labels have as many entries as y; reduce them to the index of
    # the correct class.
    labels = t.argmax(axis=1) if t.size == y.size else t

    n_samples = y.shape[0]
    picked = y[np.arange(n_samples), labels]
    # The small constant guards against log(0).
    log_likelihood = np.log(picked + 1e-7)
    return -np.sum(log_likelihood) / n_samples


In [8]:

class SimpleCBOW:
    '''Simple CBOW-style model that predicts a target word from two context words.

    Both context words are projected through the same input weight matrix,
    the two projections are averaged, and the output layer turns the average
    into one score per vocabulary word.

    :param vocab_size: number of words in the vocabulary
    :param hidden_size: width of the hidden (projection) layer
    :param verbose: when True (the default, matching the original behaviour)
        the weights and every intermediate value of ``forward`` are printed;
        pass False to silence this, e.g. inside a training loop
    '''

    def __init__(self, vocab_size, hidden_size, verbose=True):
        V, H = vocab_size, hidden_size
        self.verbose = verbose
        self._log(V, H)

        # Initialize the weights with small random float32 values.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')
        self._log('W_in===\n', W_in)
        self._log('W_out===\n', W_out)

        # Build the layers; both input layers share the same W_in array.
        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # Collect every layer's weights and gradients into flat lists.
        layers = [self.in_layer0, self.in_layer1, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # Expose the distributed word representations as a member variable.
        self.word_vecs = W_in

    def _log(self, *values):
        '''Print ``values`` only when verbose output is enabled.'''
        if self.verbose:
            print(*values)

    def forward(self, contexts, target):
        '''Compute the loss for a batch of context pairs and targets.

        :param contexts: array indexed as ``contexts[:, 0]`` / ``contexts[:, 1]``
            for the first and second context word of every sample
        :param target: labels passed on to the loss layer
        :return: loss value returned by the loss layer
        '''
        first_context = contexts[:, 0]
        second_context = contexts[:, 1]
        self._log('contexts[:, 0]== \n', first_context)
        self._log('contexts[:, 1]== \n', second_context)

        h0 = self.in_layer0.forward(first_context)
        h1 = self.in_layer1.forward(second_context)
        self._log('h0== \n', h0)
        self._log('h1== \n', h1)

        # Average the two context projections.
        h = (h0 + h1) * 0.5
        score = self.out_layer.forward(h)
        self._log('score===\n', score)
        self._log('target===\n', target)

        loss = self.loss_layer.forward(score, target)
        self._log('loss', loss)
        return loss

    def backward(self, dout=1):
        '''Back-propagate through all layers, filling each layer's gradients.

        :param dout: upstream gradient multiplier for the loss
        :return: None
        '''
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        # The forward pass averaged two branches, so each receives half.
        da *= 0.5
        self.in_layer1.backward(da)
        self.in_layer0.backward(da)
        return None


In [9]:
# End-to-end demo: build a tiny corpus, derive (context, target) pairs,
# one-hot encode them and run one forward pass of SimpleCBOW.
text = 'You say say say I say hello.'
# Tokenize the text into word IDs plus the word<->ID lookup tables.
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus, word_to_id, id_to_word,'==corpus, word_to_id, id_to_word')
# With window_size=1 each target gets its left and right neighbour as context.
contexts, target = create_contexts_target(corpus, window_size=1)
vocab_size = len(word_to_id)
print(contexts,target)
# Replace the ID arrays by their one-hot encodings (the names are reused, so
# from here on `target` and `contexts` hold one-hot arrays, not IDs).
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)
print(contexts,vocab_size)

hidden_size = 5
# NOTE(review): the model's weights are drawn with np.random.randn and no seed
# is set in this cell, so the printed weights and loss differ between runs.
model = SimpleCBOW(vocab_size, hidden_size)
# Last expression of the cell: the returned loss is shown as the cell output.
model.forward(contexts,target)

[0 1 1 1 2 1 3 4] {'you': 0, 'say': 1, 'i': 2, 'hello': 3, '.': 4} {0: 'you', 1: 'say', 2: 'i', 3: 'hello', 4: '.'} ==corpus, word_to_id, id_to_word
[[0 1]
 [1 1]
 [1 2]
 [1 1]
 [2 3]
 [1 4]] [1 1 1 2 1 3]
[[[1 0 0 0 0]
  [0 1 0 0 0]]

 [[0 1 0 0 0]
  [0 1 0 0 0]]

 [[0 1 0 0 0]
  [0 0 1 0 0]]

 [[0 1 0 0 0]
  [0 1 0 0 0]]

 [[0 0 1 0 0]
  [0 0 0 1 0]]

 [[0 1 0 0 0]
  [0 0 0 0 1]]] 5
5 5
W_in===
 [[ 0.02563999 -0.00311403  0.01157996  0.00589074 -0.00337122]
 [-0.00244224 -0.00047083  0.0110317  -0.0089099   0.00374297]
 [ 0.02793715  0.00494063 -0.01464814  0.00151682  0.00098777]
 [-0.02579672 -0.01234641 -0.00766119  0.00301793  0.0002799 ]
 [-0.00290042  0.01070336 -0.00263139 -0.00228988 -0.01353024]]
W_out===
 [[ 9.3219038e-03  1.3446729e-02  1.2954857e-02 -7.6858094e-03
   7.1504933e-04]
 [-1.7743864e-03 -1.6152894e-03 -1.5440615e-02 -4.4043818e-03
   4.4626016e-03]
 [-2.2788631e-02 -7.2356304e-03 -1.2632892e-02  7.2004659e-05
  -1.9538665e-02]
 [ 1.2394680e-02 -4.4498979e-03  

1.6093288183535606