In [29]:


# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Basic word2vec example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf


from pylab import mpl

mpl.rcParams['font.sans-serif'] = ['FangSong'] # Set the default font (FangSong) so Chinese labels can be rendered
mpl.rcParams['axes.unicode_minus'] = False # Work around the minus sign '-' showing up as a box in saved figures


# Step 1: Load the data (read from a local file; nothing is downloaded here).

# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
filename ='D:/QuanSongCi0.txt'

# Read the data into a list of strings.
def read_data(filename):
    """Read a UTF-8 text file and return its characters as a flat list.

    Every character of every line becomes one list element; newline
    characters ('\\n') are dropped, all other characters are kept.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of single-character strings in file order.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Flatten line -> characters, skipping only the line terminator.
        return [ch for line in f for ch in line if ch != '\n']

# Load the corpus as a flat list of characters and report how many there are.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))


## Keep only 5k tokens.
# Step 2: Build the dictionary and replace rare words with UNK token.
# For this Chinese corpus only 5000 entries are kept (UNK included).
vocabulary_size = 5000


def build_dataset(words, n_words):
  """Encode a token sequence as integer ids over a size-limited vocabulary.

  Args:
    words: sequence of tokens (iterated twice).
    n_words: vocabulary size including the 'UNK' slot.

  Returns:
    A tuple (data, count, dictionary, reversed_dictionary):
      data: the input sequence with every token replaced by its id.
      count: [['UNK', unk_count]] followed by (token, frequency) tuples for
        the n_words - 1 most common tokens.
      dictionary: token -> id; 'UNK' is id 0, the rest are numbered in
        descending frequency order.
      reversed_dictionary: id -> token.
  """
  # Slot 0 is the catch-all 'UNK' entry; its real count is filled in below.
  count = [['UNK', -1]]
  # The remaining n_words - 1 slots hold the most frequent tokens.
  count.extend(collections.Counter(words).most_common(n_words - 1))

  # Ids are handed out in the order the entries appear in `count`.
  dictionary = {}
  for token, _freq in count:
    dictionary[token] = len(dictionary)

  # Tokens missing from the dictionary fall back to id 0 (dictionary['UNK']).
  data = [dictionary.get(token, 0) for token in words]
  # Record how many positions ended up with the UNK id.
  count[0][1] = data.count(0)

  reversed_dictionary = {idx: token for token, idx in dictionary.items()}
  return data, count, dictionary, reversed_dictionary
## data: the input text expressed as integer codes
## count: frequencies of the top 4999 tokens plus the total count of all other (UNK) tokens
## dictionary: token -> code, ordered by frequency; code 0 is UNK, followed by the top 4999 tokens
## reversed_dictionary: dictionary with keys and values swapped, so a code can be mapped back to its token

# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - list of [word, occurrences] pairs; entry 0 is ['UNK', unk_count]
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
## Release the memory held by vocabulary.
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
#print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
# Show a 100-token slice from inside the corpus, both as codes and decoded back to characters.
print('Sample data', data[25900:26000], [reverse_dictionary[i] for i in data[25900:26000]])
# Read cursor into `data`; generate_batch advances it through a `global` declaration.
data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Build one skip-gram training batch from the global ``data`` sequence.

  A window of ``2 * skip_window + 1`` consecutive codes slides over ``data``.
  For each window position the centre code is emitted ``num_skips`` times as
  an input, each time paired with a different randomly chosen context code
  from the same window.

  Reads the module-level ``data`` list and reads/updates the module-level
  ``data_index`` cursor so that successive calls continue through the corpus.

  Args:
    batch_size: number of (input, label) pairs; must be a multiple of
      ``num_skips``.
    num_skips: number of context codes drawn per centre code; at most
      ``2 * skip_window``.
    skip_window: number of codes considered on each side of the centre.

  Returns:
    (batch, labels): int32 arrays of shape (batch_size,) and (batch_size, 1)
    holding centre codes and their paired context codes.

  Raises:
    AssertionError: if the batch_size / num_skips / skip_window constraints
      above are violated.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  # Centre (input) code of every training pair.
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  # Context (label) code of every training pair.
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  # Window positions eligible as context: everything except the centre slot.
  # This does not depend on the loop variable, so it is computed once.
  context_words = [w for w in range(span) if w != skip_window]
  for i in range(batch_size // num_skips):
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      # Reached the end of the corpus: restart with a full window taken from
      # the beginning and place the cursor just past it. Refilling with a
      # fixed three items and resetting the cursor to 0 would re-append
      # data[0] on the next step and yield a scrambled window.
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Sanity check: draw one small batch and print every (centre -> context) pair,
# both as codes and decoded back to characters.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 4: Build and train a skip-gram model.
## Hyperparameter definitions.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# NOTE(review): no random seed is set in the visible code, so the chosen
# validation ids will presumably differ from run to run.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


# Build the skip-gram graph: embedding lookup, NCE loss, SGD update, and a
# cosine-similarity op used to list nearest neighbours of the validation ids.
graph = tf.Graph()

with graph.as_default():
  # Placeholders are defined here and fed with one batch per training step.
  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  # NCE loss: rather than one full cross-entropy over every vocabulary entry,
  # the true label is scored against num_sampled sampled negative classes.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  # NOTE(review): `keep_dims` is spelled `keepdims` in later TensorFlow
  # releases — confirm against the installed version before changing it.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.global_variables_initializer()

# Step 5: Begin training.
# Runs num_steps SGD updates, printing the running average loss every 2000
# steps and the nearest neighbours of the validation ids every 10000 steps.
num_steps = 600001

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      # At step 0 only a single loss value has been accumulated, so it is
      # reported as-is rather than divided by 2000.
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # Sort by descending similarity; rank 0 is skipped (presumably the
        # validation word itself, which has the highest cosine similarity).
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  # Snapshot the L2-normalised embedding matrix before the session closes.
  final_embeddings = normalized_embeddings.eval()
##


# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
  """Scatter-plot 2-D points, annotate each with its label, and save to a file.

  Args:
    low_dim_embs: 2-D array; row i holds the (x, y) position for labels[i].
    labels: sequence of annotation strings, no longer than the number of rows.
    filename: path the figure is written to via plt.savefig.

  Raises:
    AssertionError: if there are more labels than embedding rows.
  """
  assert len(labels) <= low_dim_embs.shape[0], 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for row, text in enumerate(labels):
    x_pos, y_pos = low_dim_embs[row, :]
    plt.scatter(x_pos, y_pos)
    # The label is drawn slightly offset from its point.
    plt.annotate(
        text,
        xy=(x_pos, y_pos),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )

  plt.savefig(filename)

# Project the first `plot_only` rows of the trained embeddings to 2-D with
# t-SNE and save the labelled scatter plot as tsne.png in the temp directory.
# If sklearn / matplotlib cannot be imported, print a hint instead of failing.
try:
  # pylint: disable=g-import-not-at-top
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
  # Row i of the embedding matrix belongs to code i, so decode codes 0..plot_only-1.
  # NOTE(review): this rebinds the global name `labels` that earlier held the
  # demo batch labels.
  labels = [reverse_dictionary[i] for i in range(plot_only)]
  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))

except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)
  


Data size 1786003
Most common words (+UNK) [['UNK', 1194], ('。', 149620), ('，', 108451), ('、', 19612), ('人', 13607)]
Sample data [87, 1, 65, 1356, 433, 894, 405, 74, 2, 89, 1926, 354, 345, 1170, 40, 1, 236, 10, 296, 2005, 750, 27, 1904, 1, 54, 100, 325, 975, 516, 34, 1, 634, 41, 140, 8, 35, 1187, 226, 9, 73, 1928, 606, 147, 16, 1, 213, 236, 1738, 3, 80, 20, 651, 1, 850, 15, 1832, 112, 275, 326, 78, 1, 1215, 32, 11, 262, 473, 1, 51, 38, 343, 565, 14, 228, 138, 2, 5, 13, 105, 3, 354, 448, 77, 1, 148, 2136, 113, 2496, 229, 1926, 1172, 1, 63, 543, 158, 38, 803, 1, 439, 124, 205] ['行', '。', '小', '打', '登', '钩', '怕', '重', '，', '尽', '缠', '绣', '带', '由', '长', '。', '娇', '春', '莺', '舌', '巧', '如', '簧', '。', '飞', '在', '四', '条', '弦', '上', '。', '庆', '金', '枝', '（', '中', '吕', '宫', '）', '青', '螺', '添', '远', '山', '。', '两', '娇', '靥', '、', '笑', '时', '圆', '。', '抱', '云', '勾', '雪', '近', '灯', '看', '。', '妍', '处', '不', '堪', '怜', '。', '今', '生', '但', '愿', '无', '离', '别', '，', '花', '月', '下', '、', '绣', '屏', '前', '。', '双

Average loss at step  82000 :  4.29188578892
Average loss at step  84000 :  4.23508478451
Average loss at step  86000 :  4.13144538951
Average loss at step  88000 :  4.14134378254
Average loss at step  90000 :  4.14154605627
Nearest to 谁: 你, 岂, 还, 淝, 糊, 只, 怎, 不,
Nearest to 满: 霏, 压, 讵, 酪, 聃, 转, 涨, 鹫,
Nearest to 声: 听, 邢, 闻, 鹃, 悲, 逼, 俄, 庞,
Nearest to 未: 不, 也, 难, 初, 渐, 已, 速, 舜,
Nearest to 生: 坻, 詹, 庸, 靓, 帕, 雄, 闪, 飧,
Nearest to 。: ，, 、,  , ）, 泸, 洊, 僝, 螭,
Nearest to 天: 崆, 辟, 练, 空, 自, 赞, 体, m,
Nearest to ，: 、, 。, ）, 宏, 允,  , 杀, 洊,
Nearest to 子: 葛, 妓, 畏, 棼, 就, 武, 吉, 芷,
Nearest to 春: 秋, 嫂, 花, 匠, 版, 域, 闷,  ,
Nearest to 明:  , 黼, 粽, 巍, 莹, 望, 渌, 俱,
Nearest to 愁: 泪, 情, 飕, 恨, 弹, 获, 亏, 姥,
Nearest to 头: 悟, 南, 督, 娇, 骖, 蚊, 钲, 瓦,
Nearest to 前: 逼, 鲥, 滔, 铸, 亥, 踌, 穿, 降,
Nearest to 飞: 郢, 涌, 漆, 乱, 讹, 又, 堕, 腮,
Nearest to 香: 炜, 芳, 冉, 绽, 襟, 馥, 仓, 俎,
Average loss at step  92000 :  4.14693562937
Average loss at step  94000 :  4.15120869654
Average loss at step  96000 :  4.15312816358
Average loss at step  98000 :  4

Average loss at step  182000 :  4.09861570585
Average loss at step  184000 :  4.14196477568
Average loss at step  186000 :  4.1293845098
Average loss at step  188000 :  4.14249644196
Average loss at step  190000 :  4.11672523105
Nearest to 谁: 你, 还, 岂, 怎, 那, 糊, 只, 不,
Nearest to 满: 压, 转, 任, 霏, 讵, 里, 鬘, 污,
Nearest to 声: 闻, 听, 鹃, 笛, 邢, 语, 哀, 批,
Nearest to 未: 不, 也, 渐, 易, 难, 已, 初, 煜,
Nearest to 生: 坻, 稍, 帕, 詹, 烘, 壬, 麹, 匈,
Nearest to 。: ，, 、, ）, 洊, 缔, 伽, 萨, 筮,
Nearest to 天: 崆, 辟, 自, 空, 赞, 威, 春, 体,
Nearest to ，: 、, 。, ）, 崦, 尸, 毯, 晼, ；,
Nearest to 子: 妓, 葛, 畏, 吉, 乙, 棼, 用, 申,
Nearest to 春: 秋, 迟, 花, 人, 芳, 皴, 贫, 酺,
Nearest to 明: 光, 圆, 莹, 粽, 十, 貌, 黼, 驶,
Nearest to 愁: 恨, 泪, 情, 飕, 秋, 获, 病, 亏,
Nearest to 头: 首, 南, 竞, 发, 悟, 痕, 上, 崖,
Nearest to 前: 鲥, 沙, 杨, 滔, 亥, 再, 岿, 善,
Nearest to 飞: 涌, 吹, 飘, 鸢, 堕, 鄮, 浴, 坠,
Nearest to 香: 芳, 炜, 馥, 公, 绽, 熏, 钺, 雹,
Average loss at step  192000 :  4.13788739121
Average loss at step  194000 :  4.16241089845
Average loss at step  196000 :  4.09655035138
Average loss at step  198

Average loss at step  282000 :  4.01818280423
Average loss at step  284000 :  4.0376077559
Average loss at step  286000 :  3.99056695282
Average loss at step  288000 :  4.01250515461
Average loss at step  290000 :  4.03289724004
Nearest to 谁: 你, 岂, 还, 仍, 那, 只, 糊, 予,
Nearest to 满: 任, 讵, 转, 压, 入, 鬘, 边, 属,
Nearest to 声: 闻, 听, 笛, 哀, 响, 鸣, 叫, 鹃,
Nearest to 未: 不, 也, 先, 易, 已, 渐, 难, 乍,
Nearest to 生: 坻, 稍, 帕, 壬, 爰, 纽, 騃, 腮,
Nearest to 。: ，, 、, ）, 洊, 泸, 萨, 筮, 鸪,
Nearest to 天: 崆, 空, 辟, 皴, 讳, 威, 赞, 练,
Nearest to ，: 、, 。, ）, 崦, 毯, ；, 糕, 丐,
Nearest to 子: 妓, 葛, 乙, 棼, 畏, 鹅, 案, 吉,
Nearest to 春: 秋, 迟, 嫂, 酺, 鬅, 冬, 匠, 腊,
Nearest to 明: 光, 圆, 今, 十, 莹, 照, 当, 勾,
Nearest to 愁: 恨, 泪, 情, 飕, 获, 羞, 怕, 病,
Nearest to 头: 首, 南, 竞, 发, 嗽, 汜, 探, 悟,
Nearest to 前: 今, 鲥, 沙, 杨, 岿, 棰, ¤, 躔,
Nearest to 飞: 涌, 飘, 坠, 鸢, 浴, 晬, 吹, 随,
Nearest to 香: 馥, 炜, 芳, 熏, 麝, 薰, 鞠, 黏,
Average loss at step  292000 :  4.02160466421
Average loss at step  294000 :  4.05587696874
Average loss at step  296000 :  4.09565405083
Average loss at step  298

Average loss at step  382000 :  4.04741754103
Average loss at step  384000 :  4.05890315926
Average loss at step  386000 :  4.05064651978
Average loss at step  388000 :  4.06892189491
Average loss at step  390000 :  4.07705196977
Nearest to 谁: 岂, 你, 还, 那, 仍, 怎, 只, 我,
Nearest to 满: 任, 入, 讵, 盈, 属, 遍, 鬘, 滚,
Nearest to 声: 响, 闻, 听, 笛, 叫, 鸣, 鹃, 哀,
Nearest to 未: 不, 易, 乍, 难, 煜, 欲, 也, 渐,
Nearest to 生: 坻, 稍, 煞, 騃, 境, 匈, 腮, 纽,
Nearest to 。: ，, 、, ）, 泸, 筮, 洊, 虔, 淘,
Nearest to 天: 崆, 辟, 皴, 赞, 忱, 自, 威, 汝,
Nearest to ，: 、, 。, ）, 洊, 毯, 丐, ；, 添,
Nearest to 子: 乙, 妓, 葛, 棼, 皑, 藁, 鹅, 珞,
Nearest to 春: 秋, 迟, 冬, 酺, 皴, 芳, 嫂, 柁,
Nearest to 明: 圆, 光, 当, 阴, 照, 隙, 炯, 晨,
Nearest to 愁: 恨, 泪, 情, 秋, 飕, 羞, 伤, 偬,
Nearest to 头: 首, 嗽, 悟, 竞, 案, 南, 汜, 尾,
Nearest to 前: 鲥, 善, 杨, 后, 沙, 今, 窨, 棰,
Nearest to 飞: 飘, 浴, 吹, 涌, 鸢, 碾, 坠, 随,
Nearest to 香: 馥, 芳, 炜, 熏, 雹, 薰, 公, 鞠,
Average loss at step  392000 :  3.97010302401
Average loss at step  394000 :  3.99374795175
Average loss at step  396000 :  4.01765388048
Average loss at step  39

Average loss at step  482000 :  3.9438780421
Average loss at step  484000 :  3.94763753897
Average loss at step  486000 :  3.9744771409
Average loss at step  488000 :  3.98429938591
Average loss at step  490000 :  4.04771719217
Nearest to 谁: 还, 仍, 那, 岂, 怎, 你, 不, 否,
Nearest to 满: 入, 任, 遍, 鬘, 盈, 滚, 属, 绕,
Nearest to 声: 响, 闻, 听, 鸣, 笛, 语, 叫, 阕,
Nearest to 未: 不, 易, 乍, 先, 已, 也, 骥, 难,
Nearest to 生: 坻, ）, 煞, 壬, 稍, 烘, 詹, 滋,
Nearest to 。: ，, 、, ）, ；, 萨, 丐, 讠, 愁,
Nearest to 天: 紞, 崆, 荆, 皴, 赞, 辟, 讳, 汝,
Nearest to ，: 、, 。, ）, ；, 晼, 丐, 逶, 逐,
Nearest to 子: 乙, 妓, 癸, 棼, 葛, 藁, 季, 鹅,
Nearest to 春: 秋, 迟, 皴, 鬅, 冬, 酺, 嫂, 暖,
Nearest to 明: 圆, 光, 隙, 莹, 辉, 今, 泮, 驶,
Nearest to 愁: 恨, 泪, 情, 伤, 羞, 怕, 病, 秋,
Nearest to 头: 首, 南, 嗽, 凯, 案, 戴, 汜, 悟,
Nearest to 前: 岿, 沙, 二, 今, 杨, 棰, 璿, 窨,
Nearest to 飞: 飘, 涌, 浴, 坠, 鸢, 吹, 碾, 鄮,
Nearest to 香: 馥, 熏, 炜, 芳, 薰, 雹, 麝, 烟,
Average loss at step  492000 :  4.01275170422
Average loss at step  494000 :  4.04327751696
Average loss at step  496000 :  4.01935084331
Average loss at step  4980

Average loss at step  582000 :  4.02323340571
Average loss at step  584000 :  4.04644522786
Average loss at step  586000 :  4.02970846236
Average loss at step  588000 :  3.9215048058
Average loss at step  590000 :  3.9481323148
Nearest to 谁: 仍, 还, 那, 我, 不, 怎, 你, 岂,
Nearest to 满: 任, 鬘, 遍, 讵, 入, 湿, 盈, 绕,
Nearest to 声: 响, 闻, 鸣, 听, 笛, 阕, 语, 叫,
Nearest to 未: 不, 乍, 易, 先, 欲, 难, 也, 煜,
Nearest to 生: 坻, 滋, 境, 騃, 稍, 腮, 遣, 按,
Nearest to 。: ，, 、, ）, （, ；, 洊, 似, 蘸,
Nearest to 天: 空, 云, 霄, 皴, 自, 辟, 崆, 威,
Nearest to ，: 、, 。, ）, ；, 豳, 毯, 奂, 花,
Nearest to 子: 乙, 藁, 棼, 簦, 葛, 皑, 珞, ,
Nearest to 春: 秋, 冬, 迟, 芳, 酺, 嫂, 快, 花,
Nearest to 明: 圆, 光, 隙, 今, 驶, 辉, 炯, 阴,
Nearest to 愁: 恨, 泪, 情, 病, 伤, 偬, 怕, 羞,
Nearest to 头: 首, 南, 戴, 上, 嗽, 根, 尖, 边,
Nearest to 前: 岿, 今, 沙, 二, 蛾, 裴, 明, 仔,
Nearest to 飞: 飘, 浴, 涌, 鸢, 坠, 颔, 随, 蹴,
Nearest to 香: 馥, 熏, 炜, 薰, 麝, 雹, 芳, 药,
Average loss at step  592000 :  3.98716396976
Average loss at step  594000 :  3.91833965325
Average loss at step  596000 :  3.93293624747
Average loss at step  5980

In [30]:
import json
## Note: after a dictionary has gone through json dump, its keys are strings, so integer ids must be looked up as strings (e.g. '0').




In [31]:
# Load the id -> token mapping from JSON and look up id 0 (string key '0').
# NOTE(review): the recorded run of this cell failed with
# "JSONDecodeError: Expecting value: line 1 column 1 (char 0)", which suggests
# the file is empty or not valid JSON — confirm it was written with json.dump.
# NOTE(review): no encoding is passed to open(); a UTF-8 file would
# presumably need encoding='utf-8' on Windows — verify.
with open('D:/reverse_dictionary.json','r')as file:
	reversed_dictionary1=json.load(file)
print(reversed_dictionary1['0'])

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Load the token -> id mapping from JSON.
# NOTE(review): no encoding is passed to open(); confirm the file's encoding
# matches the platform default, or pass encoding='utf-8' explicitly.
with open('D:/dictionary.json','r')as file:
    dictionary1=json.load(file)


In [32]:
# Persist the normalised embedding matrix produced by the training cell as a .npy file.
np.save('d:/embedding.npy', final_embeddings)

In [None]:
# Scratch check of the encoding rule: tokens found in `dic` map to their id,
# anything else falls back to 0 (the 'unk' id).
reverse_dic = {1: 'a', 2: 'b', 3: 'c', 0: 'unk'}
dic = {'unk': 0, 'a': 1, 'b': 2, 'c': 3}
words = ['a', 'b', 'c', 'e']
wordss = []
for word in words:
    # dict.get supplies the fallback id for tokens missing from dic.
    wordss.append(dic.get(word, 0))
print(wordss)

In [None]:
# Read the corpus as a list of lines (one string per line, newline included).
with open('d:/QuanSongCi.txt','r',encoding="utf-8")as file:
    words=file.readlines()
# NOTE(review): the loop below is unfinished (no iterable, colon or body), so
# this cell does not parse as written.
for word

In [None]:
# Scratch check: confirm the list elements are plain str objects.
words = list('abce')
print(type(words[0]))