In [1]:
import sys
import numpy as np
import pandas as pd
from functools import reduce
from sklearn.decomposition import PCA

In [2]:
def get_word_embedding(word_embedding_file, header=False, seps=('\t', '\t')):
    """ Original Full Word Embedding，用于从中选择出词汇表word2index中的word及其vector，或构建Embedding Layer """
    word_embedding = {}
    with open(word_embedding_file, 'r', encoding='utf-8') as fr:
        if header:
            fr.readline()                        # Drop line 1
        for line in fr:
            try:
                values = line.strip().split(seps[0])
                word = values[0]
                vector = values[1:] if seps[0] == seps[1] else values[1].split(seps[1])
                word_embedding[word] = np.asarray(vector, dtype='float32')
            except ValueError as e:
                pass
    return word_embedding

In [3]:
def get_word2index(corpus, level='word', sep=None):
    """
    词汇表：支持character和word-level，以及两者的汇总
    统计corpus中character/word频率并倒序排序获得index，构建词汇字典：<character/word, index> 后续会使用index来表示character/word
    """
    word2num = {}
    for line in corpus:
        if level in ['character', 'char']:
            objs = list(line.strip())
        elif level == 'word':
            objs = line.strip().split(sep)      # 默认每一行是分词后分隔好的结果
        elif level == 'both':
            objs = list(line.strip()) + line.strip().split(sep)
        for obj in objs:
            if obj in word2num:
                word2num[obj] += 1
            else:
                word2num[obj] = 1
    word_sorted = sorted(word2num, key=word2num.get, reverse=True)           # 按character/word频率倒序排列
    word_list = word_sorted if ' ' in word_sorted else [' '] + word_sorted   # 空格是否加入vocab？
    word2index = {word: ind for (ind, word) in enumerate(word_list)}  # character/word词汇表：排列rank作为character/word的index   
    return word2index

In [4]:
def get_word2vector(word2index=None, word_embedding=None):
    """ 生成词汇表中的word及其vector，基于Original Full Embedding和词汇表word2index的结合 """
    word2vector = {}
    emb_dim = len(word_embedding.get('a'))
    for word, index in word2index.items():
        if word in word_embedding:
            vector = word_embedding.get(word)
        else:
            vectors = [word_embedding.get(x, np.zeros(emb_dim)) for x in list(word)]
            vector = reduce(lambda x, y: x + y, vectors) / len(vectors)
        if vector is not None:
            word2vector[word] = vector
    return word2vector

In [6]:
def get_basic4_dict(corpus, word_embedding, level='word', sep=None):
    """ 4个基础字典，用于各种转换 """
    word2index = get_word2index(corpus, level=level, sep=sep)
    index2word = {ind: word for (word, ind) in word2index.items()}
    word2vector = get_word2vector(word2index, word_embedding)
    index2vector = {ind: word2vector.get(word, -1) for (ind, word) in index2word.items()}
    return word2index, index2word, word2vector, index2vector

In [7]:
# Original Full Word Embedding
emb_dim_public = 200
word_embedding_file = '/home/liuyao58/data/Tencent_AILab_ChineseEmbedding.txt'
word_embedding = get_word_embedding(word_embedding_file, header=True, seps=(' ', ' '))

In [8]:
# 准备数据
cid123 = pd.read_csv('./data/cid123.txt', header=None, sep='\t', names=['cid1id', 'cid1', 'cid2id', 'cid2', 'cid3id', 'cid3'])
pwords = pd.read_csv('./data/pwords.txt', header=None, sep='\t', names=['pword'])
cid1s = set(cid123.cid1.tolist())
cid2s = set(cid123.cid2.tolist())
cid3s = set(cid123.cid3.tolist())
cids = cid1s | cid2s | cid3s
pwords = set(pwords.pword.tolist())
words = cids | pwords

In [9]:
# 4个基础数据，用于各种转换
word2index, index2word, word2vector, index2vector = get_basic4_dict(words, word_embedding, level='both')

In [10]:
def dict_to_array(dic, sortby=None):
    """ 把字典的keys和values转化为2个ndarray  sortby: 按key(=0)或value(=1)排序 """
    if sortby is None:
        items = dic.items()
    else:
        items = sorted(dic.items(), key=lambda x: x[sortby])
    keys, values = zip(*items)
    return np.asarray(keys), np.asarray(values)

def array_to_dict(index2key, array):
    """ 把array中的vector按其index转化为dict，key为index2key中index对应的key，value为vector """
    return {index2key.get(ind): vector for (ind, vector) in enumerate(array)}

In [17]:
_, arr_vector = dict_to_array(index2vector, sortby=0)

In [12]:
# PCA
def pca_reduce(X, n_components=100):
    assert X.shape[1] >= n_components, "n_components shouldn't be greater than shape of X"
    pca = PCA(n_components=n_components)
    X_mean = X - np.mean(X)
    X_pca = pca.fit_transform(X_mean)
    U1 = pca.components_
    return X_mean, X_pca, U1

# PPA
def ppa(X, d=7):
    X_mean, _, U1 = pca_reduce(X, X.shape[1])	# Get Components Ranked
    X2 = []
    for i, x in enumerate(X_mean):
        for u in U1[:d]:						# Remove Projections on Top-d Components
            x = x - np.dot(u.transpose(), x) * u
        X2.append(x)
    return np.asarray(X2)

# PCA->PPA
def pca_ppa(X1, n_components=100, d=7):
    _, X2, _ = pca_reduce(X1, n_components)		# PCA
    X3 = ppa(X2, d=d)							# PPA
    return X3

# PPA->PCA
def ppa_pca(X0, n_components=100, d=7):
    X1 = ppa(X0, d=d)							# PPA
    _, X2, _ = pca_reduce(X1, n_components)		# PCA
    return X2

# PPA->PCA->PPA
def ppa_pca_ppa(X0, n_components=100, ds=(7, 7)):
    X1 = ppa(X0, d=ds[0])						# PPA
    _, X2, _ = pca_reduce(X1, n_components)		# PCA
    X3 = ppa(X2, d=ds[1])						# PPA
    return X3

In [14]:
# truncated50d
word2vector_truncated50d = {word: vector[:50] for (word, vector) in word2vector.items()}

In [15]:
word2vector_truncated50d['手机']

array([-0.117943, -0.399701,  0.038138,  0.202896, -0.222312,  0.21869 ,
       -0.231807,  0.164772, -0.150781,  0.56089 , -0.165981, -0.248907,
       -0.046641, -0.22438 ,  0.310901, -0.043883, -0.131053, -0.418611,
       -0.227874, -0.577237, -0.014316,  0.133255,  0.119499,  0.339872,
        0.313934,  0.064334, -0.185719,  0.143952,  0.086995, -0.12751 ,
       -0.18614 ,  0.300331, -0.128238,  0.305069, -0.118038,  0.192105,
        0.120901,  0.21296 , -0.278493,  0.009573,  0.087687, -0.275137,
        0.194012, -0.208879,  0.371866, -0.234605, -0.325147, -0.436119,
        0.090971, -0.001054], dtype=float32)

In [20]:
# PCA-50d
_, arr_pca50d, _ = pca_reduce(arr_vector, 50)
word2vector_pca50d = array_to_dict(index2word, arr_pca50d)
# PCA-20d
_, arr_pca20d, _ = pca_reduce(arr_vector, 20)
word2vector_pca20d = array_to_dict(index2word, arr_pca20d)

In [21]:
word2vector_pca20d['手机']

array([-0.26094965, -0.6313509 ,  0.01093778,  0.17210765, -0.39060426,
        0.98108833, -0.17377844, -0.49491298, -0.34851549,  0.47355426,
       -0.01410114,  0.229406  ,  0.05462971,  0.35100875, -0.12129874,
        0.48324166,  0.37322765,  0.35249228, -0.55505997,  0.23446015])

In [22]:
# PPA
arr_ppa = ppa(arr_vector, d=7)
word2vector_ppa = array_to_dict(index2word, arr_ppa)

In [23]:
word2vector_ppa['手机'].shape

(200,)

In [24]:
# PPA+PCA-50d
arr_ppapca50d = ppa_pca(arr_vector, 50, d=7)
word2vector_ppapca50d = array_to_dict(index2word, arr_ppapca50d)
# PPA+PCA-20d
arr_ppapca20d = ppa_pca(arr_vector, 20, d=7)
word2vector_ppapca20d = array_to_dict(index2word, arr_ppapca20d)

In [25]:
word2vector_ppapca20d['手机']

array([-0.49011606, -0.35309353,  0.46716159, -0.00130924,  0.19988403,
        0.12424759,  0.32703484, -0.18107696, -0.51457432,  0.39778487,
        0.12714913,  0.6287529 , -0.07530246, -0.09949776,  0.00846632,
       -0.26471987, -0.13158814, -0.35093141,  0.16437773, -0.25183793])

In [26]:
# PCA-50d+PPA
arr_pca50dppa = pca_ppa(arr_vector, 50, d=7)
word2vector_pca50dppa = array_to_dict(index2word, arr_pca50dppa)
# PCA-20d+PPA
arr_pca20dppa = pca_ppa(arr_vector, 20, d=7)
word2vector_pca20dppa = array_to_dict(index2word, arr_pca20dppa)

In [28]:
word2vector_pca20dppa['手机']

array([ 0.00000000e+00,  9.45486629e-16,  3.61413061e-16, -2.24796870e-16,
       -1.86043152e-15,  3.30720066e-15, -1.52655666e-15, -4.98527845e-01,
       -3.76725207e-01,  4.64312466e-01, -2.82854173e-02,  1.78572026e-01,
        1.00788072e-01, -3.65642457e-01, -1.25932288e-01, -4.62198889e-01,
        3.20466565e-01,  1.24927901e-01,  3.75541841e-01, -4.95082063e-01])

In [29]:
# PPA+PCA-50d+PPA
arr_ppapca50dppa = ppa_pca_ppa(arr_vector, 50, ds=(7, 7))
word2vector_ppapca50dppa = array_to_dict(index2word, arr_ppapca50dppa)
# PPA+PCA-20d+PPA
arr_ppapca20dppa = ppa_pca_ppa(arr_vector, 20, ds=(7, 7))
word2vector_ppapca20dppa = array_to_dict(index2word, arr_ppapca20dppa)

In [30]:
word2vector_ppapca20dppa['手机']

array([ 0.00000000e+00, -9.88208723e-16,  2.49673598e-16,  1.09574024e-15,
        4.39494398e-16,  2.02874236e-15,  1.42663659e-14, -1.85802544e-01,
       -4.89893694e-01,  3.84668333e-01,  1.53021087e-01,  6.61231923e-01,
        4.17988463e-02, -8.82162369e-02, -1.27054452e-02,  5.35425475e-02,
       -1.42554259e-01, -3.88129885e-01,  3.66391468e-02, -2.64537878e-01])

In [31]:
def similarity_cos(vec1, vec2):
    """ Compute cosine similarity of 2 vectors """
    if not isinstance(vec1, np.ndarray):
        vec1 = np.asarray(vec1)
    if not isinstance(vec2, np.ndarray):
        vec2 = np.asarray(vec2)
    vec_sum = np.sum(vec1 * vec2)
    vec_norm = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return vec_sum / vec_norm

def get_similar_words(word0, word2vector, sim_func=similarity_cos, thresh=0.7):
    """ 从word2vector中找到与word0相似度大于thresh的其他word，按相似度排序，相似度计算函数可指定 """
    vector0 = word2vector[word0]
    res = []
    for word, vector in word2vector.items():
        sim = sim_func(vector, vector0)
        if word != word0 and sim >= thresh:
            res.append((word, round(sim,4)))
    return sorted(res, key=lambda x: x[1], reverse=True)

In [32]:
print(get_similar_words('手机', word2vector))

  if __name__ == '__main__':


[('小手机', 0.7627), ('平板电脑', 0.753), ('智能机', 0.7469), ('老年机', 0.7406), ('电脑', 0.7228), ('笔记本电脑', 0.7194), ('手机屏', 0.7147), ('老人机', 0.7045)]


In [33]:
print(get_similar_words('手机', word2vector_truncated50d))

  if __name__ == '__main__':


[('小手机', 0.8679), ('相机', 0.8059), ('老年机', 0.7593), ('智能机', 0.7485), ('手机屏', 0.7382), ('屏', 0.7357), ('模型机', 0.735), ('触摸屏笔', 0.7266), ('平板电脑', 0.7092), ('拍照框', 0.7035)]


In [34]:
print(get_similar_words('手机', word2vector_pca50d))

[('小手机', 0.7604), ('智能机', 0.7493), ('老年机', 0.7491), ('老人机', 0.7428), ('平板电脑', 0.7389), ('笔记本电脑', 0.7129), ('电脑', 0.7112), ('功能机', 0.7044)]


In [35]:
print(get_similar_words('手机', word2vector_pca20d))

[('笔记本电脑', 0.864), ('电脑', 0.8565), ('mp3', 0.845), ('笔记本', 0.8395), ('话费', 0.8342), ('手机线', 0.8227), ('备用机', 0.8159), ('智能机', 0.8126), ('平板电脑', 0.8122), ('电子书', 0.8055), ('老年机', 0.8054), ('相机', 0.8022), ('功能机', 0.7976), ('碎屏险', 0.7962), ('手机号', 0.7917), ('行车记录仪', 0.791), ('收钱码', 0.7884), ('电话', 0.7871), ('火车票', 0.785), ('换电池', 0.7846), ('提款机', 0.7832), ('信用卡还款', 0.7811), ('耳机', 0.7806), ('信用卡', 0.7801), ('老人机', 0.7785), ('理财本', 0.773), ('公交卡', 0.7728), ('流量卡', 0.7718), ('操作系统', 0.7713), ('副耳机', 0.7708), ('机票', 0.7692), ('数码设备', 0.7671), ('小手机', 0.7671), ('模型机', 0.7665), ('京东卡', 0.7635), ('实体卡', 0.7626), ('qq币', 0.7616), ('移动硬盘', 0.7599), ('红包', 0.7595), ('pos机', 0.7594), ('数码产品', 0.7582), ('随身听', 0.7525), ('视频机', 0.7512), ('游戏机', 0.7512), ('phones', 0.7511), ('硬件', 0.7503), ('阅读器', 0.7498), ('交通卡', 0.7491), ('u盘', 0.749), ('保险', 0.7488), ('加油卡', 0.7486), ('天府通', 0.7471), ('以旧换新', 0.7468), ('手机卡', 0.7465), ('花几', 0.7455), ('扩展包', 0.7451), ('软件', 0.7432), ('优惠券', 0.7399), ('充电宝', 0.7392)

In [36]:
print(get_similar_words('手机', word2vector_ppa))

[('小手机', 0.7588), ('智能机', 0.7307), ('平板电脑', 0.7267), ('老年机', 0.7261)]


In [37]:
print(get_similar_words('手机', word2vector_ppapca50d))

[('小手机', 0.8004), ('老年机', 0.7882), ('老人机', 0.7677), ('手机带', 0.7256), ('智能机', 0.7174), ('平板电脑', 0.7089)]


In [38]:
print(get_similar_words('手机', word2vector_ppapca20d))

[('老年机', 0.8461), ('备用机', 0.8421), ('副耳机', 0.8411), ('老人机', 0.8196), ('换电池', 0.8175), ('调表器', 0.804), ('小手机', 0.7973), ('充电头', 0.7793), ('专用手机', 0.7757), ('otg线', 0.7717), ('保修卡', 0.7638), ('录音麦', 0.7634), ('剪卡器', 0.7586), ('模型机', 0.7485), ('数码产品', 0.7471), ('电脑', 0.7384), ('平板电脑', 0.7382), ('手机带', 0.7375), ('智能机', 0.7352), ('u盘', 0.733), ('手机故障', 0.7316), ('银月饼', 0.7314), ('手机卡', 0.7309), ('笔记本电脑', 0.7302), ('充电器头', 0.7278), ('数码设备', 0.7276), ('线充', 0.7271), ('二手行车记录仪', 0.7271), ('ic卡套', 0.7262), ('大哥大', 0.726), ('手机靓号', 0.7236), ('以旧换新', 0.7211), ('二手手机', 0.7207), ('电工笔', 0.7201), ('数据线', 0.72), ('手机电池', 0.7199), ('起子机', 0.7177), ('mp5', 0.7174), ('写频线', 0.7158), ('转换头', 0.7142), ('二手一体机', 0.714), ('刷机线', 0.7131), ('编程器', 0.7129), ('数码伴侣', 0.7127), ('多功能卡', 0.712), ('游戏装备', 0.7114), ('速录笔', 0.7099), ('优盘', 0.7094), ('视频机', 0.708), ('移动硬盘', 0.7075), ('钱枪', 0.7073), ('mp3', 0.7068), ('电子表', 0.7065), ('话费', 0.7056), ('碎屏险', 0.7048), ('数码录音笔', 0.7026)]


In [39]:
print(get_similar_words('手机', word2vector_pca50dppa))

[('小手机', 0.7961), ('老人机', 0.7724), ('老年机', 0.7569), ('智能机', 0.7533), ('功能机', 0.7114)]


In [40]:
print(get_similar_words('手机', word2vector_pca20dppa))

[('老年机', 0.9069), ('打点仪', 0.8956), ('拆表带器', 0.8912), ('备用机', 0.8851), ('卡缘连接器', 0.8844), ('笔记本电脑', 0.8827), ('数码设备', 0.8795), ('老人机', 0.8794), ('qq币', 0.8724), ('ic卡套', 0.8691), ('接头枪', 0.8652), ('副耳机', 0.8652), ('剪卡器', 0.8643), ('流量卡', 0.8642), ('调表器', 0.8553), ('线钳', 0.8542), ('网线钳', 0.8509), ('数据线', 0.8507), ('发声枪', 0.8446), ('电脑', 0.8431), ('盘发夹子', 0.8425), ('自封枪', 0.8418), ('叫卖器', 0.8416), ('拆带器', 0.8381), ('钱袋子', 0.8374), ('快递单夹', 0.8367), ('速录笔', 0.8367), ('换电池', 0.8365), ('小手机', 0.8343), ('大哥大', 0.833), ('电工笔', 0.8323), ('otg线', 0.8304), ('行车录仪', 0.828), ('充电头', 0.8263), ('模型机', 0.8256), ('抢', 0.8247), ('加密狗', 0.8228), ('手机线', 0.8227), ('专用手机', 0.8222), ('平板电脑', 0.8213), ('拷贝机', 0.8195), ('加密锁', 0.8148), ('矿机', 0.813), ('写频线', 0.8126), ('钱枪', 0.8106), ('视频机', 0.8103), ('数码产品', 0.8094), ('钱袋宝', 0.8093), ('链接线', 0.8087), ('保修卡', 0.8061), ('充电线', 0.8058), ('随身听', 0.8047), ('碎屏险', 0.8046), ('抽芯枪', 0.8045), ('寻呼机', 0.8038), ('数码录音笔', 0.8033), ('充电器', 0.8021), ('多功能卡', 0.8009), ('星期袜

In [41]:
print(get_similar_words('手机', word2vector_ppapca50dppa))

[('小手机', 0.7821), ('红米', 0.7294), ('老年机', 0.7233), ('智能机', 0.7066), ('小米', 0.7049), ('手机屏', 0.7029)]


In [42]:
print(get_similar_words('手机', word2vector_ppapca20dppa))

[('冬笋干', 0.8774), ('换电池', 0.8727), ('鲜山参', 0.8657), ('小黄姜', 0.8619), ('洗簌包', 0.846), ('贷', 0.8451), ('山核桃肉', 0.8408), ('黑枸杞', 0.8368), ('鲜枸杞', 0.8352), ('电费', 0.8323), ('小天麻', 0.8266), ('取电器', 0.8249), ('黄精茶', 0.8192), ('账皮', 0.8141), ('护发茶', 0.8118), ('商务男袜', 0.8081), ('备用机', 0.807), ('阴', 0.8047), ('金铁锁', 0.8041), ('岩黄连', 0.8037), ('降压线', 0.8033), ('石斛', 0.8018), ('黄精', 0.7997), ('天麻', 0.7989), ('杜仲', 0.7982), ('蓄电瓶', 0.7972), ('固线夹', 0.796), ('鲜条', 0.7937), ('黑枸杞子', 0.7929), ('充电盒', 0.791), ('借电器', 0.7908), ('米斛', 0.7907), ('电热水杯', 0.7907), ('干姜片', 0.7903), ('保温茶壶', 0.7896), ('拓展坞', 0.7871), ('火车票', 0.786), ('石笋干', 0.7843), ('收话器', 0.7836), ('车载煲机', 0.7818), ('手机带', 0.7817), ('手机垫', 0.7776), ('熟毛笔', 0.7764), ('文具本', 0.7764), ('收纳本', 0.7759), ('副耳机', 0.775), ('截表带', 0.7745), ('车码表', 0.7744), ('充电头', 0.7719), ('淮山片', 0.7696), ('账本夹子', 0.7688), ('投影配件', 0.7685), ('三七', 0.7664), ('擦屏布', 0.7652), ('橘红果', 0.7644), ('整套茶具', 0.7643), ('风流果', 0.7642), ('人参', 0.7639), ('枸杞', 0.7634), ('全自动伞',

In [43]:
def word2vector_persist(word2vector, filename, seps=['\t', ',']):
    """ word_to_vector持久化为文件 """
    with open(filename, 'w', encoding='utf-8') as fw:
        for (word, vector) in word2vector.items():
            line = str(word) + seps[0] + seps[1].join([str(x) for x in vector])
            fw.write(line + '\n')

In [44]:
# 持久化
# Original
word_embedding_file_original = './result/word2vector_original.txt'
word2vector_persist(word2vector, word_embedding_file_original)

# Truncation-50d
word2vector_truncated50d_file = './result/word2vector_truncated50d.txt'
word2vector_persist(word2vector_truncated50d, word2vector_truncated50d_file)

# PCA-50d
word2vector_pca50d_file = './result/word2vector_pca50d.txt'
word2vector_persist(word2vector_pca50d, word2vector_pca50d_file)

# PCA-20d
word2vector_pca20d_file = './result/word2vector_pca20d.txt'
word2vector_persist(word2vector_pca20d, word2vector_pca20d_file)

# PPA
word2vector_ppa_file = './result/word2vector_ppa.txt'
word2vector_persist(word2vector_ppa, word2vector_ppa_file)

# PPA+PCA-50d
word2vector_ppapca50d_file = './result/word2vector_ppapca50d.txt'
word2vector_persist(word2vector_ppapca50d, word2vector_ppapca50d_file)

# PPA+PCA-20d
word2vector_ppapca20d_file = './result/word2vector_ppapca20d.txt'
word2vector_persist(word2vector_ppapca20d, word2vector_ppapca20d_file)

# PCA-50d+PPA
word2vector_pca50dppa_file = './result/word2vector_pca50dppa.txt'
word2vector_persist(word2vector_pca50dppa, word2vector_pca50dppa_file)

# PCA-20d+PPA
word2vector_pca20dppa_file = './result/word2vector_pca20dppa.txt'
word2vector_persist(word2vector_pca20dppa, word2vector_pca20dppa_file)

# PPA+PCA-50d+PPA
word2vector_ppapca50dppa_file = './result/word2vector_ppapca50dppa.txt'
word2vector_persist(word2vector_ppapca50dppa, word2vector_ppapca50dppa_file)

# PPA+PCA-20d+PPA
word2vector_ppapca20dppa_file = './result/word2vector_ppapca20dppa.txt'
word2vector_persist(word2vector_ppapca20dppa, word2vector_ppapca20dppa_file)