In [16]:
# 1 单词的表征方式
# 1.1 one-hot
# 1.2 词嵌入（词向量）

# 2 one-hot的缺点
# 2.1 维度高，计算量大
# 2.2 不能反映单词间的联系和相似度

# 3 词嵌入
# 3.1 用一个词嵌入矩阵， 把单词表示成一个较低维度的向量
# 3.2 向量的每个维度代表单词的一个属性特征，如是否是名词、是否是食物、是否以a开头
# 3.3 这些维度不是人为规定的，而是算法学习到的

# 4 Word2Vec
# 4.1 Word2Vec是一种学习词嵌入矩阵的算法
# 4.2 包括两种模式：CBOW，skip_gram
# 4.3 CBOW：由中心词周围的词（Contex）预测中心词（Target）
# 4.4 skip_gram：由中心词（Target）预测中心词周围的词（Context）
# 4.5 skip_gram具体步骤：
# 4.5.1 将文本中的每个词用one-hot向量表示
# 4.5.2 初始化一个词嵌入矩阵，将文本转换为词向量表示
# 4.5.3 中心词作为输入，中心词周围skip_window内取num_skips的词作为真实输出，得到num_skips组样本，依次滑动窗口，遍历所有文本
# 4.5.4 通过线性神经元和softmax激活函数将输入转换为预测值
# 4.5.5 计算损失函数，最小化后得到最优词嵌入矩阵

In [1]:
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf

In [2]:
# 下载数据
url = "http://mattmahoney.net/dc"

def maybe_download(filename, expected_bytes):
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url+filename, filename)
    stat_info = os.stat(filename)
    if stat_info.st_size == expected_bytes:
        print("Found and verified", filename)
    else:
        print(stat_info.str_size)
        raise Exception("Failed to verif"+filename+". Can you get it with a browser?")
    return filename

filename = maybe_download("text8.zip", 31344016)

Found and verified text8.zip


In [5]:
# 读取数据
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print("Datasize", len(words))

Datasize 17005207


In [6]:
# 将原始数据转换成索引形式，tf.nn.embedding_lookup会将索引转化成为词向量
vocabulary_size = 50000

def build_dataset(words):
    count = [["UNK", -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)

In [7]:
del words
print("Most common words (+UNK)", count[:5])
print("Sample data", data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [8]:
# 生成训练集
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size%num_skips == 0
    assert num_skips <= 2*skip_window
    batch = np.ndarray(shape=(batch_size, ), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2*skip_window+1
    buffer = collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
        
    for i in range(batch_size//num_skips):
        target = skip_window 
        target_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in target_to_avoid:
                target = random.randint(0, span-1)
            target_to_avoid.append(target)
            batch[i*num_skips+j] = buffer[skip_window]
            labels[i*num_skips+j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index+1)%len(data)
    return batch, labels

In [9]:
batch, labels = generate_batch(8, 2, 1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], "->", labels[i, 0], reverse_dictionary[labels[i, 0]])

3081 originated -> 5234 anarchism
3081 originated -> 12 as
12 as -> 6 a
12 as -> 3081 originated
6 a -> 195 term
6 a -> 12 as
195 term -> 6 a
195 term -> 2 of


In [10]:
# 定义相关参数
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64

In [11]:
# 构建计算图
graph = tf.Graph()
with graph.as_default():
    # train_inputs = (128, )
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    # train_labels = (128, 1)
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # valid_dataset = (16, )
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.device("/cpu:0"):
        # embeddings = (50000, 128)
        embeddings = tf.Variable(tf.random_normal([vocabulary_size, embedding_size], -1.0, 1.0))
        # embed = (128, 128)，前128代表样本索引，后128代表属性特征索引
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        # nce_weights = (50000, 128)
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], 
                                                      stddev=1.0/math.sqrt(embedding_size)))
        
        # nce_biases = (50000, )
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, 
                                            biases=nce_biases, 
                                            labels=train_labels, 
                                            inputs=embed, 
                                            num_sampled=num_sampled, 
                                            num_classes=vocabulary_size))
        
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        # norm = (50000, 1)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        # normlized_embeddings = (50000, 128)
        normalized_embeddings = embeddings/norm
        # valid_embeddings = (16, 128)
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        # similarity = (16, 50000)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        
        init = tf.global_variables_initializer()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [15]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")
    
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs:batch_inputs, train_labels:batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
    
        if step%1000==0:
            if step>0:
                average_loss /= 2000
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0
            
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s, " % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  267.3464050292969
Nearest to was: secure,  butte,  bayezid,  doctrine,  undertaken,  plume,  hsbc,  hutchins, 
Nearest to one: concerted,  tata,  overseer,  ax,  theses,  anselm,  muslim,  overpowered, 
Nearest to in: slavery,  manifestation,  blasting,  karenga,  disposal,  improperly,  taliaferro,  allowable, 
Nearest to th: kuwaiti,  enciphered,  limb,  gwh,  blind,  lir,  courtyards,  vengeful, 
Nearest to would: consular,  cathal,  iir,  elite,  profiles,  segments,  mythological,  arsenate, 
Nearest to been: poodle,  jovian,  recovered,  bjelke,  barbuda,  opportunity,  overdosing,  hasmonaean, 
Nearest to zero: ratzinger,  unravel,  almanac,  ferromagnetism,  nadezhda,  wealthy,  norm,  gamer, 
Nearest to who: detonator,  unhealthy,  deciding,  inaccessible,  airshow,  helices,  ssn,  approximating, 
Nearest to UNK: montego,  restaurant,  ebbinghaus,  overtones,  accordions,  clausius,  lackluster,  limestone, 
Nearest to an: flavored,  phi

Average loss at step  6000 :  17.071460297584533
Nearest to was: is,  in,  and,  UNK,  alberoni,  secure,  are,  a, 
Nearest to one: UNK,  two,  nine,  and,  zero,  the,  in,  a, 
Nearest to in: and,  of,  to,  UNK,  for,  is,  two,  the, 
Nearest to th: esque,  kuwaiti,  brendan,  blind,  lir,  manfred,  toussaint,  satirizes, 
Nearest to would: absorbed,  stamping,  tank,  focussing,  that,  consular,  talk,  cs, 
Nearest to been: ionic,  earliest,  recovered,  aclu,  dysfunctional,  zseries,  contributed,  freehold, 
Nearest to zero: one,  and,  nine,  in,  of,  UNK,  canis,  two, 
Nearest to who: approximating,  this,  waff,  misery,  detonator,  lintel,  and,  zero, 
Nearest to UNK: and,  in,  the,  one,  of,  two,  a,  to, 
Nearest to an: UNK,  one,  the,  coens,  and,  canc,  portrayed,  manipulation, 
Nearest to known: schwerin,  adair,  shahada,  vs,  keegan,  habit,  sha,  inflating, 
Nearest to however: and,  a,  to,  shaolin,  of,  in,  the,  gwen, 
Nearest to are: and,  wa

Average loss at step  13000 :  7.4934876017570495
Nearest to was: is,  and,  in,  are,  by,  three,  UNK,  two, 
Nearest to one: two,  nine,  zero,  eight,  and,  three,  UNK,  the, 
Nearest to in: and,  of,  for,  by,  to,  on,  from,  is, 
Nearest to th: esque,  ventral,  two,  brendan,  complicated,  kuwaiti,  eclac,  cultivation, 
Nearest to would: that,  were,  is,  those,  to,  focussing,  a,  stamping, 
Nearest to been: ionic,  zseries,  an,  recovered,  was,  contributed,  dysfunctional,  earliest, 
Nearest to zero: nine,  two,  and,  one,  eight,  three,  five,  four, 
Nearest to who: and,  also,  this,  approximating,  that,  misery,  waff,  not, 
Nearest to UNK: and,  one,  two,  a,  the,  in,  three,  is, 
Nearest to an: the,  UNK,  one,  and,  a,  it,  two,  canc, 
Nearest to known: schwerin,  a,  adair,  UNK,  keegan,  xm,  inflating,  wedgwood, 
Nearest to however: and,  in,  of,  for,  to,  a,  shaolin,  the, 
Nearest to are: and,  is,  was,  or,  in,  nine,  by,  of, 


Average loss at step  20000 :  5.300805303931236
Nearest to was: is,  by,  are,  in,  and,  as,  has,  three, 
Nearest to one: two,  eight,  four,  three,  zero,  the,  nine,  seven, 
Nearest to in: and,  of,  for,  by,  on,  from,  as,  with, 
Nearest to th: two,  four,  eight,  three,  one,  ventral,  nine,  six, 
Nearest to would: that,  to,  were,  is,  those,  because,  was,  stamping, 
Nearest to been: was,  by,  an,  is,  ionic,  zseries,  recovered,  as, 
Nearest to zero: three,  nine,  two,  five,  eight,  four,  seven,  six, 
Nearest to who: and,  that,  also,  this,  not,  though,  UNK,  approximating, 
Nearest to UNK: and,  one,  two,  three,  a,  the,  which,  or, 
Nearest to an: the,  one,  a,  it,  UNK,  and,  which,  this, 
Nearest to known: a,  schwerin,  adair,  keegan,  xm,  sigmund,  which,  inflating, 
Nearest to however: and,  in,  of,  for,  four,  shaolin,  on,  was, 
Nearest to are: is,  was,  and,  were,  in,  or,  by,  UNK, 
Nearest to system: four,  and,  UN

Average loss at step  27000 :  4.555128597974777
Nearest to was: is,  by,  are,  and,  has,  in,  as,  were, 
Nearest to one: two,  four,  three,  five,  eight,  seven,  six,  nine, 
Nearest to in: and,  for,  from,  of,  on,  by,  at,  with, 
Nearest to th: two,  one,  four,  three,  six,  seven,  eight,  zero, 
Nearest to would: to,  is,  were,  that,  was,  can,  are,  may, 
Nearest to been: was,  by,  are,  were,  is,  an,  to,  strong, 
Nearest to zero: three,  four,  two,  eight,  five,  six,  nine,  seven, 
Nearest to who: that,  also,  and,  this,  not,  which,  among,  two, 
Nearest to UNK: and,  two,  three,  six,  four,  one,  or,  five, 
Nearest to an: the,  it,  a,  one,  this,  his,  UNK,  which, 
Nearest to known: a,  UNK,  schwerin,  which,  keegan,  adair,  it,  list, 
Nearest to however: and,  in,  four,  is,  was,  for,  on,  shaolin, 
Nearest to are: is,  were,  was,  and,  or,  by,  in,  but, 
Nearest to system: and,  a,  four,  this,  number,  each,  fulk,  epimen

Average loss at step  35000 :  4.077064296960831
Nearest to was: is,  by,  are,  were,  has,  as,  and,  had, 
Nearest to one: two,  four,  three,  eight,  six,  five,  seven,  nine, 
Nearest to in: from,  and,  at,  for,  on,  of,  by,  nine, 
Nearest to th: four,  six,  two,  eight,  one,  three,  five,  seven, 
Nearest to would: can,  to,  were,  may,  was,  are,  is,  those, 
Nearest to been: was,  by,  were,  are,  strong,  become,  is,  an, 
Nearest to zero: five,  eight,  six,  four,  three,  seven,  two,  nine, 
Nearest to who: also,  that,  and,  not,  which,  this,  they,  he, 
Nearest to UNK: one,  and,  two,  three,  four,  seven,  five,  six, 
Nearest to an: the,  it,  his,  this,  one,  a,  which,  its, 
Nearest to known: a,  schwerin,  it,  which,  keegan,  adair,  much,  one, 
Nearest to however: in,  and,  was,  is,  four,  six,  shaolin,  on, 
Nearest to are: were,  is,  was,  and,  or,  have,  but,  in, 
Nearest to system: and,  a,  UNK,  each,  this,  number,  fulk,

Average loss at step  43000 :  3.6277365272045135
Nearest to was: is,  by,  were,  has,  are,  had,  or,  as, 
Nearest to one: two,  six,  four,  three,  five,  seven,  eight,  nine, 
Nearest to in: from,  at,  on,  and,  with,  nine,  of,  for, 
Nearest to th: four,  six,  eight,  three,  two,  five,  seven,  nine, 
Nearest to would: can,  may,  were,  to,  was,  are,  will,  is, 
Nearest to been: was,  by,  were,  are,  strong,  become,  be,  is, 
Nearest to zero: five,  four,  eight,  six,  three,  seven,  two,  nine, 
Nearest to who: also,  that,  which,  not,  he,  they,  it,  this, 
Nearest to UNK: and,  one,  two,  or,  seven,  four,  three,  five, 
Nearest to an: the,  this,  it,  his,  one,  which,  a,  its, 
Nearest to known: a,  schwerin,  it,  much,  keegan,  which,  such,  used, 
Nearest to however: and,  is,  was,  in,  six,  or,  nine,  shaolin, 
Nearest to are: were,  is,  was,  or,  have,  but,  and,  eight, 
Nearest to system: and,  this,  fulk,  number,  epimenides, 

Average loss at step  50000 :  3.4151546224355696
Nearest to was: is,  has,  were,  by,  had,  are,  or,  but, 
Nearest to one: two,  four,  six,  five,  three,  seven,  eight,  zero, 
Nearest to in: at,  from,  and,  on,  of,  for,  by,  nine, 
Nearest to th: four,  three,  six,  five,  seven,  nine,  eight,  s, 
Nearest to would: can,  may,  to,  will,  were,  was,  are,  is, 
Nearest to been: was,  by,  were,  are,  become,  be,  strong,  is, 
Nearest to zero: six,  five,  four,  eight,  seven,  three,  two,  nine, 
Nearest to who: also,  which,  that,  not,  he,  they,  and,  it, 
Nearest to UNK: seven,  two,  or,  one,  and,  four,  three,  five, 
Nearest to an: the,  this,  it,  his,  which,  a,  one,  or, 
Nearest to known: a,  it,  schwerin,  such,  used,  many,  much,  part, 
Nearest to however: in,  was,  is,  and,  but,  or,  four,  six, 
Nearest to are: were,  is,  was,  have,  but,  or,  eight,  be, 
Nearest to system: and,  this,  number,  fulk,  each,  UNK,  epimenides, 

Average loss at step  58000 :  3.4159696329832077
Nearest to was: is,  were,  has,  had,  by,  are,  but,  in, 
Nearest to one: six,  two,  seven,  three,  four,  five,  eight,  nine, 
Nearest to in: at,  and,  from,  on,  nine,  of,  with,  for, 
Nearest to th: three,  six,  four,  eight,  seven,  nine,  five,  s, 
Nearest to would: can,  may,  will,  to,  were,  was,  could,  are, 
Nearest to been: was,  were,  by,  be,  become,  strong,  are,  had, 
Nearest to zero: five,  four,  six,  seven,  eight,  three,  nine,  two, 
Nearest to who: which,  also,  he,  that,  not,  they,  it,  and, 
Nearest to UNK: five,  or,  four,  and,  three,  six,  seven,  one, 
Nearest to an: this,  the,  it,  his,  its,  which,  three,  their, 
Nearest to known: a,  it,  used,  such,  schwerin,  many,  part,  much, 
Nearest to however: but,  and,  in,  is,  was,  one,  are,  when, 
Nearest to are: were,  is,  was,  have,  but,  or,  and,  eight, 
Nearest to system: UNK,  number,  this,  each,  and,  fulk

Average loss at step  66000 :  3.4317288140058517
Nearest to was: is,  has,  were,  had,  by,  but,  are,  in, 
Nearest to one: three,  two,  four,  seven,  eight,  six,  five,  UNK, 
Nearest to in: at,  and,  from,  on,  by,  with,  for,  of, 
Nearest to th: four,  eight,  six,  three,  nine,  seven,  five,  two, 
Nearest to would: can,  may,  will,  to,  could,  were,  was,  are, 
Nearest to been: were,  was,  by,  be,  become,  are,  strong,  being, 
Nearest to zero: five,  six,  eight,  seven,  four,  three,  nine,  two, 
Nearest to who: which,  also,  he,  and,  that,  they,  not,  it, 
Nearest to UNK: four,  and,  five,  three,  seven,  eight,  one,  six, 
Nearest to an: the,  this,  it,  no,  UNK,  its,  which,  their, 
Nearest to known: a,  used,  such,  it,  UNK,  schwerin,  many,  much, 
Nearest to however: but,  and,  in,  is,  UNK,  was,  when,  that, 
Nearest to are: were,  is,  have,  but,  was,  and,  or,  be, 
Nearest to system: UNK,  number,  this,  each,  and,  which,

Average loss at step  74000 :  3.2308148213624954
Nearest to was: is,  has,  were,  had,  by,  but,  are,  been, 
Nearest to one: two,  seven,  four,  six,  three,  eight,  five,  zero, 
Nearest to in: on,  from,  at,  and,  with,  of,  nine,  for, 
Nearest to th: four,  six,  eight,  seven,  five,  three,  two,  one, 
Nearest to would: can,  may,  will,  could,  to,  were,  are,  but, 
Nearest to been: by,  were,  was,  be,  become,  strong,  being,  are, 
Nearest to zero: eight,  five,  six,  four,  seven,  three,  two,  nine, 
Nearest to who: he,  also,  which,  they,  not,  that,  often,  it, 
Nearest to UNK: and,  three,  seven,  five,  then,  four,  six,  two, 
Nearest to an: the,  this,  it,  his,  no,  coens,  its,  their, 
Nearest to known: such,  used,  a,  it,  schwerin,  many,  seen,  much, 
Nearest to however: but,  in,  and,  is,  when,  was,  while,  that, 
Nearest to are: were,  is,  have,  but,  was,  or,  be,  by, 
Nearest to system: number,  UNK,  fulk,  this,  and, 

Average loss at step  81000 :  3.2039005045890807
Nearest to was: is,  has,  were,  had,  by,  but,  are,  in, 
Nearest to one: two,  four,  seven,  three,  six,  five,  eight,  zero, 
Nearest to in: at,  from,  on,  and,  with,  of,  nine,  for, 
Nearest to th: six,  four,  one,  three,  five,  seven,  two,  eight, 
Nearest to would: can,  may,  will,  could,  to,  but,  were,  was, 
Nearest to been: was,  were,  be,  by,  become,  being,  strong,  also, 
Nearest to zero: five,  four,  eight,  six,  seven,  three,  two,  nine, 
Nearest to who: he,  also,  which,  they,  that,  and,  not,  often, 
Nearest to UNK: four,  five,  three,  and,  two,  then,  seven,  six, 
Nearest to an: the,  this,  it,  no,  coens,  his,  its,  some, 
Nearest to known: such,  used,  a,  it,  schwerin,  seen,  described,  many, 
Nearest to however: but,  and,  in,  when,  that,  is,  while,  which, 
Nearest to are: were,  is,  but,  have,  was,  be,  and,  or, 
Nearest to system: number,  this,  UNK,  and, 

Average loss at step  88000 :  3.22996446454525
Nearest to was: is,  has,  were,  had,  by,  but,  are,  s, 
Nearest to one: seven,  two,  four,  three,  six,  five,  eight,  nine, 
Nearest to in: at,  from,  on,  and,  of,  for,  nine,  with, 
Nearest to th: six,  three,  four,  seven,  five,  two,  nine,  one, 
Nearest to would: can,  may,  will,  could,  to,  but,  was,  were, 
Nearest to been: was,  were,  be,  become,  by,  being,  strong,  had, 
Nearest to zero: five,  four,  eight,  six,  seven,  two,  three,  nine, 
Nearest to who: he,  also,  which,  they,  that,  often,  not,  and, 
Nearest to UNK: five,  four,  three,  seven,  one,  six,  two,  eight, 
Nearest to an: the,  this,  it,  one,  its,  coens,  no,  their, 
Nearest to known: such,  used,  a,  it,  schwerin,  seen,  described,  many, 
Nearest to however: but,  in,  when,  and,  that,  while,  which,  is, 
Nearest to are: were,  is,  have,  but,  was,  be,  or,  and, 
Nearest to system: number,  this,  it,  systems, 

Average loss at step  95000 :  3.0736037313938143
Nearest to was: is,  has,  had,  were,  by,  but,  are,  been, 
Nearest to one: two,  four,  six,  seven,  five,  three,  eight,  zero, 
Nearest to in: at,  from,  on,  and,  nine,  for,  eight,  of, 
Nearest to th: six,  five,  four,  three,  one,  seven,  nine,  two, 
Nearest to would: can,  may,  will,  could,  to,  but,  did,  might, 
Nearest to been: was,  were,  become,  be,  by,  being,  strong,  had, 
Nearest to zero: five,  four,  six,  seven,  eight,  three,  two,  nine, 
Nearest to who: he,  also,  which,  they,  often,  that,  not,  there, 
Nearest to UNK: four,  five,  three,  two,  six,  seven,  one,  eight, 
Nearest to an: the,  this,  it,  no,  its,  or,  coens,  merges, 
Nearest to known: such,  used,  a,  it,  seen,  schwerin,  described,  some, 
Nearest to however: but,  when,  in,  and,  that,  while,  which,  is, 
Nearest to are: were,  is,  have,  but,  was,  or,  be,  and, 
Nearest to system: UNK,  number,  system