# Save Embedding in DataFrame
In this notebook, I try to save embeddings in a dataframe, to avoid frequently calculate embeddings for those terms that is not in the vocabulary.

In [62]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import jieba
import math

In [5]:
model = model = KeyedVectors.load('./test_50.bin')

In [55]:
path4 = 'data/company_list_ch.csv'
company_ch_df = pd.read_csv(path4, header=None, delimiter=",", skiprows=1, names=['rank', 'name', 'Location', 'Income'])
company_ch_df = pd.DataFrame(company_ch_df, columns=['rank', 'name'])
print(company_ch_df.head())

   rank             name
0  名次\n           企业名称\n
1   1\n   中国石油化工集团有限公司\n
2   2\n  中国石油天然气集团有限公司\n
3   3\n       国家电网有限公司\n
4   4\n     中国建筑股份有限公司\n


In [78]:
def calculate_cosine_similarity(a, b):
    vector_a = np.mat(a)
    vector_b = np.mat(b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim

In [86]:
def calculate_IDF(df):
    company_num = 0
    m = dict()
    for index, row in df.iterrows():
        name = row['name']
        if isinstance(name, float):
            continue
        company_num += 1
        words = jieba.cut(name, cut_all=False)
        word_list = list(words)
        for word in word_list:
            if word in m.keys():
                m[word] +=1
            else:
                m[word] = 1
                
    print(f'company number: {company_num}, dict size: {len(m)}')
    # print(m)
    
    idf = dict()
    
    for (k, v) in m.items():
        idf[k] = math.log(((1+company_num) / v), 10)
    
    # Normalize
    v = idf.values()
    max_value = max(v)
    min_value = min(v)
    denom = max_value - min_value
    print(f'max_value = {max_value}, min_value = {min_value}')
    for (k, v) in idf.items():
        idf[k] = (idf[k] - min_value) / denom
    print(idf)
    return idf

In [87]:
idf = calculate_IDF(company_ch_df)

company number: 501, dict size: 627
max_value = 2.700703717145019, min_value = 0.00086599127777364
{'企业': 0.8232778029033576, '名称': 1.0, '\n': 0.0, '中国石油化工集团': 1.0, '有限公司': 0.027222880022759303, '中国石油天然气集团': 1.0, '国家电网': 1.0, '中国': 0.35538248361382463, '建筑': 0.8232778029033576, '股份': 0.33611132538623256, '中国工商银行': 1.0, '平安保险': 1.0, '（': 0.38066727421604435, '集团': 0.11182068992618142, '）': 0.38066727421604435, '中国建设银行': 1.0, '中国农业银行': 1.0, '上海': 0.575482620789708, '汽车': 0.8232778029033576, '集团股份': 0.44761061765102383, '中国银行': 1.0, '人寿保险': 0.7411066607303243, '公司': 0.3440894633497617, '铁路': 0.8232778029033576, '工程': 1.0, '中国移动通信集团': 1.0, '铁道': 1.0, '华为': 1.0, '投资': 0.4191496366459378, '控股': 0.3949753202093355, '海洋': 1.0, '石油': 0.8885007077352087, '国家开发银行': 1.0, '华润': 1.0, '苏宁': 1.0, '控股集团': 0.3876622665370393, '东风汽车': 1.0, '第一': 1.0, '汽车集团': 0.7411066607303243, '交通': 0.7117785106385665, '建设': 0.575482620789708, '中化': 1.0, '集团公司': 0.7770014154704177, '太平洋': 1.0, '中国邮政': 1.0, '国家': 0.82327

In [88]:
def generateCompanyEmbeddings(name, idf):
    words = jieba.cut(name, cut_all=False)
    word_list = list(words)
    v = np.zeros((200))
    for word in word_list:
        if word in model.vocab:
            # emb(term) * IDF
            print(f'word: {word}')
            print(idf[word])
            v += model[word] * idf[word]
    
    v /= len(v)
    return v

In [82]:
v1 = generateEmbeddings('中国林业集团有限公司', idf)

word: 中国林业
2.700703717145019
word: 集团
0.3027637084729817
word: 有限公司
0.07436334976997697


In [89]:
v1 = generateEmbeddings('中国林业集团有限公司', idf)

word: 中国林业
1.0
word: 集团
0.11182068992618142
word: 有限公司
0.027222880022759303


Use IDF as coefficient

In [80]:
v2 = model['副主席']
sim1 = calculate_cosine_similarity(v1, v2)
print(sim1)

0.7717440284306823


No Use IDF. Only average

In [83]:
sim2 = calculate_cosine_similarity(v1, v2)
print(sim2)

0.8032497618337451


Use IDF. Normalization

In [90]:
sim3 = calculate_cosine_similarity(v1, v2)
print(sim3)

0.7716803692881706


In [50]:
df['embeddings'] = ''
z = np.zeros((5))
c1 = 0
c2 = 0
for index, row in df.iterrows():
    # df.loc[index, 'embeddings'] = z
    name = row['name']
    if isinstance(name, float):
                continue
    if name in model.vocab:
        vec = model[name]
        c1 += 1
    else:
        c2 += 1
        vec = generateEmbeddings(name)
    df.set_value(index, 'embeddings', vec)

print(c1)
print(c2)
print(df.head())



(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)

We do a simple test

In [54]:
term='中山大学'
term_emb = model[term]
print(term_emb.shape)
school_candidate = dict()
for index, row in df.iterrows():
    name = row['name']
    if isinstance(name, float):
                continue
    name_emb = row['embeddings']
    sim = calculate_cosine_similarity(term_emb, name_emb)
    if (sim > 0.9):
        school_candidate[row['name']] = sim
print(school_candidate)

(200,)
{'厦门大学': 0.9074690474514451, '中山大学': 1.0, '暨南大学': 0.9422666098038321, '华南理工大学': 0.9128763273806962, '华南师范大学': 0.9032619239844039, '广东外语外贸大学': 0.9051062053568608}
